In [141]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

sns.set_style('darkgrid')

train_data = pd.read_csv("train.csv")
test_data = pd.read_csv("test.csv")

train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


Separate our target field 'Survived' from our training data set.

In [142]:
has_family_train = (train_data['SibSp'] + train_data['Parch'] > 0).astype(int)
has_family_test = (test_data['SibSp'] + test_data['Parch'] > 0).astype(int)

In [143]:
train_data['has_family'] = has_family_train
test_data['has_family'] = has_family_test

In [144]:
train_data = train_data.drop(columns = ['Name', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Embarked'])
test_data = test_data.drop(columns = ['Name', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Embarked'])

In [145]:
print("Null value in train data: ")
for i in train_data.columns:
    print(i + ' ' + str(train_data[i].isnull().sum()))
    
print("\nNull value in test data: ")
for i in test_data.columns:
    print(i + ' ' + str(test_data[i].isnull().sum()))

Null value in train data: 
PassengerId 0
Survived 0
Pclass 0
Sex 0
Age 177
Cabin 687
has_family 0

Null value in test data: 
PassengerId 0
Pclass 0
Sex 0
Age 86
Cabin 327
has_family 0


In [146]:
train_data = train_data.drop(columns = ['Cabin'])
test_data = test_data.drop(columns = ['Cabin'])
train_data['Sex'] = (train_data['Sex'] == 'male').astype(int)
test_data['Sex'] = (test_data['Sex'] == 'male').astype(int)

train_data.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Sex              int32
Age            float64
has_family       int32
dtype: object

In [147]:
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,has_family
0,1,0,3,1,22.0,1
1,2,1,1,0,38.0,1
2,3,1,3,0,26.0,0
3,4,1,1,0,35.0,1
4,5,0,3,1,35.0,0


In [148]:
train_data = train_data.dropna()
train_data_without_age = train_data.drop(columns = ['Age'])

train_data['Age'] = (train_data['Age'] // 10).astype(int)

In [149]:
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,has_family
0,1,0,3,1,2,1
1,2,1,1,0,3,1
2,3,1,3,0,2,0
3,4,1,1,0,3,1
4,5,0,3,1,3,0


In [150]:
train_data_without_age.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,has_family
0,1,0,3,1,1
1,2,1,1,0,1
2,3,1,3,0,0
3,4,1,1,0,1
4,5,0,3,1,0


In [151]:
x_train_with_age = train_data.get(['PassengerId', 'Pclass', 'Sex', 'Age', 'has_family'])
y_train_with_age = train_data.get(['Survived']).values.ravel()
x_train_without_age = train_data_without_age.get(['PassengerId', 'Pclass', 'Sex', 'has_family'])
y_train_without_age = train_data_without_age.get(['Survived']).values.ravel()

print("\nNull value in test data: ")
for i in test_data.columns:
    print(i + ' ' + str(test_data[i].isnull().sum()))
    
test_data_without_age = test_data.get(['PassengerId', 'Pclass', 'Sex', 'has_family'])
test_data_with_age = test_data.dropna()
test_data_with_null_age = test_data[test_data['Age'].isnull()].drop(columns = ['Age'])


Null value in test data: 
PassengerId 0
Pclass 0
Sex 0
Age 86
has_family 0


In [152]:
print(x_train_with_age.shape, y_train_with_age.shape, x_train_without_age.shape, y_train_without_age.shape, test_data.shape, test_data_with_age.shape, test_data_with_null_age.shape)

(714, 5) (714,) (714, 4) (714,) (418, 5) (332, 5) (86, 4)


In [153]:
# machine learning
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier

In [154]:
# Logistic Regression

logreg_with_age = LogisticRegression(solver='lbfgs', max_iter=1000)
logreg_with_age.fit(x_train_with_age, y_train_with_age)
pred_with_age = logreg_with_age.predict(test_data_with_age)
acc_log = round(logreg_with_age.score(x_train_with_age, y_train_with_age) * 100, 2)
acc_log

78.71

In [155]:
logreg_without_age = LogisticRegression(solver='lbfgs', max_iter=1000)
logreg_without_age.fit(x_train_without_age, y_train_without_age)
pred_without_age = logreg_without_age.predict(test_data_with_null_age)
acc_log = round(logreg_without_age.score(x_train_without_age, y_train_without_age) * 100, 2)
acc_log

78.15

In [156]:
test_data_with_null_age['Survived'] = pred_without_age
test_data_with_null_age['Age'] = np.nan
test_data_with_age['Survived'] = pred_with_age

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data_with_age['Survived'] = pred_with_age


In [157]:
test_data_with_age.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,has_family,Survived
0,892,3,1,34.5,0,0
1,893,3,0,47.0,1,0
2,894,2,1,62.0,0,0
3,895,3,1,27.0,0,0
4,896,3,0,22.0,1,0


In [158]:
test_data_with_null_age.head()

Unnamed: 0,PassengerId,Pclass,Sex,has_family,Survived,Age
10,902,3,1,0,0,
22,914,1,0,0,1,
29,921,3,1,1,0,
33,925,3,0,1,1,
36,928,3,0,0,1,


In [159]:
predict = pd.concat([test_data_with_age, test_data_with_null_age])

In [160]:
predict_result = predict.get(['PassengerId', 'Survived'])

In [161]:
type(predict_result)

pandas.core.frame.DataFrame

In [162]:
predict_result.to_csv('submission.csv', index=False)