In [1459]:
import pandas as pd
import math
# from fancyimpute import KNN, MICE
import re
import numpy as np
from sklearn.preprocessing import LabelEncoder

In [1460]:
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')
# train.Fare.hist(by=train['Survived'], bins=50)

full = pd.concat([train.drop(columns='Survived'), test], axis=0)
test_passenger_ids = test['PassengerId']


In [1461]:
print(full.isna().any())


PassengerId    False
Pclass         False
Name           False
Sex            False
Age             True
SibSp          False
Parch          False
Ticket         False
Fare            True
Cabin           True
Embarked        True
dtype: bool


In [1462]:
full[full['Embarked'].isnull()]

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
61,62,1,"Icard, Miss. Amelie",female,38.0,0,0,113572,80.0,B28,
829,830,1,"Stone, Mrs. George Nelson (Martha Evelyn)",female,62.0,0,0,113572,80.0,B28,


In [1463]:
"""Replacing the missing values of Emabarked with S, which is the mode of the data."""

'Replacing the missing values of Emabarked with S, which is the mode of the data.'

In [1464]:
#Impute missing Embarked feature
full.Embarked.fillna('S', inplace=True)

#Convert Embarked to numerical value
full['Embarked'] = LabelEncoder().fit_transform(full['Embarked'])

In [1465]:
"""Dropping Ticket column as it doesn't seem to be useful"""
full.drop(columns=['Ticket'], inplace=True)

In [1466]:
#Add deck feature
full['Cabin'] = full['Cabin'].fillna('@') #because its ord is 1 less than 'A'

#Convert cabin to deck info
full['Deck'] = full.apply(lambda x: x.Cabin.upper()[0], axis=1)

#Convert categorial deck info to numbers
full['Deck'] = LabelEncoder().fit_transform(full['Deck'])

#Drop cabin as it is not useful now
full.drop(columns=['Cabin'], inplace=True)

In [1467]:
print(full.isna().any())

PassengerId    False
Pclass         False
Name           False
Sex            False
Age             True
SibSp          False
Parch          False
Fare            True
Embarked       False
Deck           False
dtype: bool


In [1468]:
# Add Title attribute
TITLE_REGEXP = ' [A-Za-z]+\.'

full['Title'] = full.apply(lambda row: re.search(TITLE_REGEXP, row.Name).group(0).strip(), axis=1)
# print(full['Title'].groupby(full['Title']).groups.keys())
full['Title'] = full['Title'].replace('Mlle.', 'Miss.')
full['Title'] = full['Title'].replace('Ms.', 'Miss.')
full['Title'] = full['Title'].replace('Mme.', 'Mrs.')
# print(full['Title'].groupby(full['Title']).agg('count'))

#Drop name as it is not interesting anymore
full.drop(columns=['Name'], inplace=True)
full

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Deck,Title
0,1,3,male,22.0,1,0,7.2500,2,0,Mr.
1,2,1,female,38.0,1,0,71.2833,0,3,Mrs.
2,3,3,female,26.0,0,0,7.9250,2,0,Miss.
3,4,1,female,35.0,1,0,53.1000,2,3,Mrs.
4,5,3,male,35.0,0,0,8.0500,2,0,Mr.
5,6,3,male,,0,0,8.4583,1,0,Mr.
6,7,1,male,54.0,0,0,51.8625,2,5,Mr.
7,8,3,male,2.0,3,1,21.0750,2,0,Master.
8,9,3,female,27.0,0,2,11.1333,2,0,Mrs.
9,10,2,female,14.0,1,0,30.0708,0,0,Mrs.


In [1469]:
#Imputing Age data based on title
titles = full['Title'].unique()

full['Age'].fillna(-1, inplace=True)
title_median_dict = {}

for title in titles:
    median = full.Age[(full['Age'] != -1) & (full['Title'] == title)].median()
    title_median_dict[title] = median
    
for i, row in full.iterrows():
    if row['Age'] == -1:
        row['Age'] = title_median_dict[row['Title']]

full.head()
print(full.isna().any())

PassengerId    False
Pclass         False
Sex            False
Age            False
SibSp          False
Parch          False
Fare            True
Embarked       False
Deck           False
Title          False
dtype: bool


In [1470]:
#Impute missing fare data similarly to missing age data
full['Fare'].fillna(-1, inplace=True)

for i, row in full.iterrows():
    if row['Fare'] == -1:
        row['Fare'] = full[(full['Fare'] != -1) & (full['Pclass'] == row['Pclass'])].median()
print(full.isna().any())

PassengerId    False
Pclass         False
Sex            False
Age            False
SibSp          False
Parch          False
Fare           False
Embarked       False
Deck           False
Title          False
dtype: bool


In [1474]:
#Binning title info
title_mapping = {}
common_titles = ['Mr.', 'Miss.', 'Mrs.', 'Master.', 'Dr.', 'Rev.']
print(full['Title'].value_counts())

def get_title_numerical(title):
    if title in common_titles:
        return common_titles.index(title)+1
    return 7 # rare title

full['Title'] = full['Title'].apply(get_title_numerical)


Mr.          757
Miss.        264
Mrs.         198
Master.       61
Dr.            8
Rev.           8
Col.           4
Major.         2
Lady.          1
Don.           1
Dona.          1
Jonkheer.      1
Capt.          1
Sir.           1
Countess.      1
Name: Title, dtype: int64


In [1475]:
#Convert Sex to numerical
full['Sex'] = LabelEncoder().fit_transform(full['Sex'])

In [1476]:
#Add family-size feature
full['familySize'] = full.apply(lambda row: row.SibSp + row.Parch+1, axis=1)


In [1477]:
#Add alone feature
full['isAlone'] = full.apply(lambda x: 1 if x.SibSp==0 and x.Parch==0 else 0, axis =1)

In [1478]:

# colnames = x_full.columns
# print(colnames)
# print(x_full.isna().any())
# x_full = pd.DataFrame(MICE().complete(x_full))
# x_full.columns = colnames


In [1479]:
#Bin fares
bins = [-1,10,50,100,200,400,1000000]
x_full['Fare'] = np.searchsorted(bins, x_full['Fare'].values)
x_full

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Deck,Title,familySize,isAlone
0,3,1,3,1,0,1,1,10,1,2,0
1,1,0,5,1,0,1,2,3,3,2,0
2,3,0,4,0,0,1,1,10,2,1,1
3,1,0,4,1,0,1,1,3,3,2,0
4,3,1,4,0,0,1,1,10,1,1,1
5,3,1,8,0,0,1,3,10,1,1,1
6,1,1,5,0,0,1,1,5,1,1,1
7,3,1,1,3,1,1,1,10,4,5,0
8,3,0,4,0,2,1,1,10,3,3,0
9,2,0,2,1,0,1,2,10,3,2,0


In [1480]:
#Bin ages
bins = [0,12,17,25,35,55,75,120]
x_full['Age'] = np.searchsorted(bins, x_full['Age'].values)

# print(x_full.isna().any())
x_full

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Deck,Title,familySize,isAlone
0,3,1,1,1,0,1,1,10,1,2,0
1,1,0,1,1,0,1,2,3,3,2,0
2,3,0,1,0,0,1,1,10,2,1,1
3,1,0,1,1,0,1,1,3,3,2,0
4,3,1,1,0,0,1,1,10,1,1,1
5,3,1,1,0,0,1,3,10,1,1,1
6,1,1,1,0,0,1,1,5,1,1,1
7,3,1,1,3,1,1,1,10,4,5,0
8,3,0,1,0,2,1,1,10,3,3,0
9,2,0,1,1,0,1,2,10,3,2,0


In [1481]:
X_train = x_full[:891]
X_test = x_full[891:]
print(len(X_train))
print(len(X_test))
X_train.columns

891
418


Index(['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked', 'Deck',
       'Title', 'familySize', 'isAlone'],
      dtype='object')

In [1482]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier 
from sklearn.svm import LinearSVC, SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import accuracy_score, log_loss

classifiers = [
    KNeighborsClassifier(3),
    LinearSVC(),
    SVC(probability=True),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
	AdaBoostClassifier(),
    GradientBoostingClassifier(),
    GaussianNB(),
#     LinearDiscriminantAnalysis(),
#     QuadraticDiscriminantAnalysis(),
    LogisticRegression(),
    ExtraTreesClassifier()]

v_classifiers = [
    KNeighborsClassifier(3),
#     LinearSVC(),
    SVC(probability=True),
    DecisionTreeClassifier(),
    RandomForestClassifier(n_estimators=100),
	AdaBoostClassifier(),
    GradientBoostingClassifier(),
    GaussianNB(),
#     LinearDiscriminantAnalysis(),
#     QuadraticDiscriminantAnalysis(),
    LogisticRegression(),
    ExtraTreesClassifier()]

voting_classifier = VotingClassifier(estimators=[(str(i),v_classifiers[i]) for i in range(len(v_classifiers))] , voting='soft')

sss = StratifiedShuffleSplit(n_splits=10, test_size=0.1, random_state=0)



In [1483]:
#Labels for the training and validation set
Y_train = train.Survived


In [1484]:
for train_index, val_index in sss.split(X_train, Y_train):
    x_tr, x_va = X_train.iloc[train_index], X_train.iloc[val_index]
    y_tr, y_va = Y_train.iloc[train_index], Y_train.iloc[val_index]
    
    for clf in classifiers+[voting_classifier]:
        name = clf.__class__.__name__
        print(name)
        clf.fit(x_tr, y_tr)
        predictions = clf.predict(x_va)
        acc = accuracy_score(y_va, predictions)
        if name in acc_dict:
            acc_dict[name] += acc
        else:
            acc_dict[name] = acc
            
            
for clf in acc_dict:
    acc_dict[clf] = acc_dict[clf]/10.0
    
print(acc_dict)

KNeighborsClassifier
LinearSVC
SVC
DecisionTreeClassifier
RandomForestClassifier
AdaBoostClassifier
GradientBoostingClassifier
GaussianNB
LogisticRegression
ExtraTreesClassifier
VotingClassifier


  if diff:


KNeighborsClassifier
LinearSVC
SVC
DecisionTreeClassifier
RandomForestClassifier
AdaBoostClassifier
GradientBoostingClassifier
GaussianNB
LogisticRegression
ExtraTreesClassifier
VotingClassifier


  if diff:


KNeighborsClassifier
LinearSVC
SVC
DecisionTreeClassifier
RandomForestClassifier
AdaBoostClassifier
GradientBoostingClassifier
GaussianNB
LogisticRegression
ExtraTreesClassifier
VotingClassifier


  if diff:


KNeighborsClassifier
LinearSVC
SVC
DecisionTreeClassifier
RandomForestClassifier
AdaBoostClassifier
GradientBoostingClassifier
GaussianNB
LogisticRegression
ExtraTreesClassifier
VotingClassifier


  if diff:


KNeighborsClassifier
LinearSVC
SVC
DecisionTreeClassifier
RandomForestClassifier
AdaBoostClassifier
GradientBoostingClassifier
GaussianNB
LogisticRegression
ExtraTreesClassifier
VotingClassifier


  if diff:


KNeighborsClassifier
LinearSVC
SVC
DecisionTreeClassifier
RandomForestClassifier
AdaBoostClassifier
GradientBoostingClassifier
GaussianNB
LogisticRegression
ExtraTreesClassifier
VotingClassifier


  if diff:


KNeighborsClassifier
LinearSVC
SVC
DecisionTreeClassifier
RandomForestClassifier
AdaBoostClassifier
GradientBoostingClassifier
GaussianNB
LogisticRegression
ExtraTreesClassifier
VotingClassifier


  if diff:


KNeighborsClassifier
LinearSVC
SVC
DecisionTreeClassifier
RandomForestClassifier
AdaBoostClassifier
GradientBoostingClassifier
GaussianNB
LogisticRegression
ExtraTreesClassifier
VotingClassifier


  if diff:


KNeighborsClassifier
LinearSVC
SVC
DecisionTreeClassifier
RandomForestClassifier
AdaBoostClassifier
GradientBoostingClassifier
GaussianNB
LogisticRegression
ExtraTreesClassifier
VotingClassifier


  if diff:


KNeighborsClassifier
LinearSVC
SVC
DecisionTreeClassifier
RandomForestClassifier
AdaBoostClassifier
GradientBoostingClassifier
GaussianNB
LogisticRegression
ExtraTreesClassifier
VotingClassifier
{'KNeighborsClassifier': 0.8901270334437061, 'LinearSVC': 0.8869000974498247, 'SVC': 0.9206530704805171, 'DecisionTreeClassifier': 0.9193297367178197, 'RandomForestClassifier': 0.9095850112024723, 'AdaBoostClassifier': 0.9112320809187938, 'GradientBoostingClassifier': 0.9125567203040099, 'GaussianNB': 0.8834887421982595, 'LogisticRegression': 0.8815224408514059, 'ExtraTreesClassifier': 0.9031867279919702, 'VotingClassifier': 0.9076364675555555}


  if diff:


In [1485]:
#We'll use the best classifer out of all the above

final_predictions = []
for clf in classifiers:
    if clf.__class__.__name__ == 'SVC':
        final_predictions = clf.predict(X_test)

In [1486]:
final_predictions

array([0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1,
       0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1,
       0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0,

In [1487]:
final_predictions2 = [0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0]
result = pd.DataFrame()
result['PassengerId'] = test_passenger_ids
result['Survived'] = final_predictions

In [1488]:
result

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1
5,897,0
6,898,1
7,899,0
8,900,1
9,901,0


In [1489]:
result.to_csv('result_9_imputing_age_and_fare_using_pclass_value.csv', index=False)