In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.utils import shuffle
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

In [2]:
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

In [3]:
df_train['name'] = df_train['Name'].str.split()
df_test['name'] = df_test['Name'].str.split()

In [4]:
df_train['Title']=df_train['Name'].str.extract('([A-Za-z]+)\.',expand=True)
df_test['Title']=df_test['Name'].str.extract('([A-Za-z]+)\.',expand=True)

In [5]:
df_train = df_train.drop('Cabin', axis = 1)
df_test = df_test.drop('Cabin', axis = 1)

In [6]:
mapping = {'Mlle': 'Miss', 'Major': 'Mr', 'Col': 'Mr', 'Sir': 'Mr', 'Don': 'Mr', 'Mme': 'Miss',
          'Jonkheer': 'Mr', 'Lady': 'Mrs', 'Capt': 'Mr', 'Countess': 'Mrs', 'Ms': 'Miss', 'Dona': 'Mrs'}

In [7]:
df_train.replace({'Title': mapping}, inplace=True)
df_test.replace({'Title': mapping}, inplace=True)

In [8]:
df_train['Male'] = df_train['Sex'].map({'male': 1, 'female': 0})
df_test['Male'] = df_test['Sex'].map({'male': 1, 'female': 0})

In [9]:
df_train['FamilySize'] = df_train['SibSp'] + df_train['Parch'] + 1
df_test['FamilySize'] = df_test['SibSp'] + df_test['Parch'] + 1

In [10]:
titles=['Mr','Miss','Mrs','Master','Rev','Dr']
for title in titles:
    age_to_impute = df_train.groupby('Title')['Age'].median()[titles.index(title)]
    #print(age_to_impute)
    df_train.loc[(df_train['Age'].isnull()) & (df_train['Title'] == title), 'Age'] = age_to_impute
    
titles=['Mr','Miss','Mrs','Master','Rev','Dr']
for title in titles:
    age_to_impute = df_test.groupby('Title')['Age'].median()[titles.index(title)]
    #print(age_to_impute)
    df_test.loc[(df_test['Age'].isnull()) & (df_test['Title'] == title), 'Age'] = age_to_impute

In [11]:
df_train = df_train.drop('Sex', axis = 1)
df_test = df_test.drop('Sex', axis = 1)

In [12]:
df_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Age,SibSp,Parch,Ticket,Fare,Embarked,name,Title,Male,FamilySize
0,1,0,3,"Braund, Mr. Owen Harris",22.0,1,0,A/5 21171,7.25,S,"[Braund,, Mr., Owen, Harris]",Mr,1,2
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0,1,0,PC 17599,71.2833,C,"[Cumings,, Mrs., John, Bradley, (Florence, Bri...",Mrs,0,2
2,3,1,3,"Heikkinen, Miss. Laina",26.0,0,0,STON/O2. 3101282,7.925,S,"[Heikkinen,, Miss., Laina]",Miss,0,1
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0,1,0,113803,53.1,S,"[Futrelle,, Mrs., Jacques, Heath, (Lily, May, ...",Mrs,0,2
4,5,0,3,"Allen, Mr. William Henry",35.0,0,0,373450,8.05,S,"[Allen,, Mr., William, Henry]",Mr,1,1


In [13]:
df_test.head()

Unnamed: 0,PassengerId,Pclass,Name,Age,SibSp,Parch,Ticket,Fare,Embarked,name,Title,Male,FamilySize
0,892,3,"Kelly, Mr. James",34.5,0,0,330911,7.8292,Q,"[Kelly,, Mr., James]",Mr,1,1
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",47.0,1,0,363272,7.0,S,"[Wilkes,, Mrs., James, (Ellen, Needs)]",Mrs,0,2
2,894,2,"Myles, Mr. Thomas Francis",62.0,0,0,240276,9.6875,Q,"[Myles,, Mr., Thomas, Francis]",Mr,1,1
3,895,3,"Wirz, Mr. Albert",27.0,0,0,315154,8.6625,S,"[Wirz,, Mr., Albert]",Mr,1,1
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",22.0,1,1,3101298,12.2875,S,"[Hirvonen,, Mrs., Alexander, (Helga, E, Lindqv...",Mrs,0,3


In [14]:
df_train = df_train.drop(['Ticket', 'Name', 'name'], axis = 1)
df_test = df_test.drop(['Ticket', 'Name', 'name'], axis = 1)

# Fare normalizing

In [15]:
df_train['Fare'].fillna(df_train['Fare'].mean(),inplace=True)
df_test['Fare'].fillna(df_test['Fare'].mean(),inplace=True)

In [16]:
df_train['normFare'] = df_train['Fare'] / df_train['FamilySize']
df_test['normFare'] = df_test['Fare'] / df_test['FamilySize']

In [17]:
df_train = df_train.drop('Fare', axis=1)
df_test = df_test.drop('Fare', axis=1)

# Dummies

In [18]:
df_train = pd.get_dummies(df_train, columns=['Title'], drop_first=True)
df_test = pd.get_dummies(df_test, columns=['Title'], drop_first=True)

In [19]:
#df_train = pd.get_dummies(df_train, columns=['Embarked'], drop_first=True)
#df_test = pd.get_dummies(df_test, columns=['Embarked'], drop_first=True)
df_train = df_train.drop('Embarked', axis = 1)
df_test = df_test.drop('Embarked', axis = 1)

In [20]:
df_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Male,FamilySize,normFare,Title_Master,Title_Miss,Title_Mr,Title_Mrs,Title_Rev
0,1,0,3,22.0,1,0,1,2,3.625,0,0,1,0,0
1,2,1,1,38.0,1,0,0,2,35.64165,0,0,0,1,0
2,3,1,3,26.0,0,0,0,1,7.925,0,1,0,0,0
3,4,1,1,35.0,1,0,0,2,26.55,0,0,0,1,0
4,5,0,3,35.0,0,0,1,1,8.05,0,0,1,0,0


In [21]:
df_test.head()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Male,FamilySize,normFare,Title_Master,Title_Miss,Title_Mr,Title_Mrs,Title_Rev
0,892,3,34.5,0,0,1,1,7.8292,0,0,1,0,0
1,893,3,47.0,1,0,0,2,3.5,0,0,0,1,0
2,894,2,62.0,0,0,1,1,9.6875,0,0,1,0,0
3,895,3,27.0,0,0,1,1,8.6625,0,0,1,0,0
4,896,3,22.0,1,1,0,3,4.095833,0,0,0,1,0


In [22]:
#df_train['Alone'] = (df_train['SibSp'] == 0) & (df_train['Parch'] == 0)
#df_test['Alone'] = (df_test['SibSp'] == 0) & (df_test['Parch'] == 0)

In [23]:
#df_train['Alone'] = df_train['Alone']*1
#df_test['Alone'] = df_test['Alone']*1

In [24]:
#df_train['Fav'] = (df_train['Male'] == 0) | (df_train['Age'] < 16)
#df_test['Fav'] = (df_test['Male'] == 0) | (df_test['Age'] < 16)

In [25]:
#df_train['Fav'] = df_train['Fav']*1
#df_test['Fav'] = df_test['Fav']*1

In [26]:
X = df_train.drop('PassengerId', axis = 1)
X = shuffle(X)

In [27]:
y = X['Survived']

In [28]:
X = X.drop('Survived', axis = 1)

In [29]:
X.head()

Unnamed: 0,Pclass,Age,SibSp,Parch,Male,FamilySize,normFare,Title_Master,Title_Miss,Title_Mr,Title_Mrs,Title_Rev
117,2,29.0,1,0,1,2,10.5,0,0,1,0,0
258,1,35.0,0,0,0,1,512.3292,0,1,0,0,0
78,2,0.83,0,2,1,3,9.666667,1,0,0,0,0
498,1,25.0,1,2,0,4,37.8875,0,0,0,1,0
737,1,35.0,0,0,1,1,512.3292,0,0,1,0,0


# Splitting the data

In [30]:
X_train = X[:600]

In [31]:
X_valid = X[600:]

In [32]:
y_train = y[:600]
y_valid = y[600:]

In [33]:
X_train.shape

(600, 12)

In [34]:
X_valid.shape

(291, 12)

In [35]:
y_train.shape

(600,)

In [36]:
y_valid.shape

(291,)

# Decision tree classifier

In [37]:
dt = DecisionTreeClassifier(max_depth = 5, random_state=17)

In [38]:
X_train.head()

Unnamed: 0,Pclass,Age,SibSp,Parch,Male,FamilySize,normFare,Title_Master,Title_Miss,Title_Mr,Title_Mrs,Title_Rev
117,2,29.0,1,0,1,2,10.5,0,0,1,0,0
258,1,35.0,0,0,0,1,512.3292,0,1,0,0,0
78,2,0.83,0,2,1,3,9.666667,1,0,0,0,0
498,1,25.0,1,2,0,4,37.8875,0,0,0,1,0
737,1,35.0,0,0,1,1,512.3292,0,0,1,0,0


In [39]:
dt.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=5,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=17, splitter='best')

In [40]:
pred = dt.predict(X_valid)

In [41]:
accuracy_score(pred, y_valid)

0.8350515463917526

In [42]:
max_feat = X_train.shape[1]

In [43]:
max_feat

12

In [44]:
tree_params = {'max_depth': range(1,20),
               'max_features': range(1,max_feat)}

In [45]:

tree = GridSearchCV(dt, tree_params, cv=5, n_jobs=-1, verbose=True)

In [46]:
tree.fit(X_train, y_train)

Fitting 5 folds for each of 209 candidates, totalling 1045 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 24 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    2.2s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:    2.7s
[Parallel(n_jobs=-1)]: Done 1045 out of 1045 | elapsed:    2.9s finished


GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=DecisionTreeClassifier(class_weight=None,
                                              criterion='gini', max_depth=5,
                                              max_features=None,
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              presort=False, random_state=17,
                                              splitter='best'),
             iid='warn', n_jobs=-1,
             param_grid={'max_depth': range(1, 20),
                         'max_features': range(1, 12)},
             pre_dispatch='2*n_jobs

In [47]:
treeCV_accuracy = tree.best_score_

In [48]:
best_dt = tree.best_estimator_

In [49]:
pred = best_dt.predict(X_valid)

In [50]:
accuracy_score(pred, y_valid)

0.8109965635738832

In [51]:
tree_tuned_score = round(accuracy_score(pred, y_valid), 3)

# KNN

In [52]:
knn = KNeighborsClassifier(n_neighbors=10)

In [53]:
scaler = StandardScaler()

In [54]:
X_train_scaled = scaler.fit_transform(X_train)

In [55]:
X_valid_scaled = scaler.transform(X_valid)

In [56]:
knn.fit(X_train_scaled, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=10, p=2,
                     weights='uniform')

In [57]:
knn_pred = knn.predict(X_valid_scaled)

In [58]:
accuracy_score(knn_pred, y_valid)

0.8350515463917526

In [59]:
from sklearn.pipeline import Pipeline

In [60]:
knn_pipe = Pipeline([('scaler', StandardScaler()), ('knn', KNeighborsClassifier(n_jobs=-1))])

In [61]:
knn_params = {'knn__n_neighbors': range(1, 10)}
knn_grid = GridSearchCV(knn_pipe, knn_params,
                        cv=5, n_jobs=-1, verbose=True)
knn_grid.fit(X_train_scaled, y_train)
knn_grid.best_params_, knn_grid.best_score_

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 24 concurrent workers.


Fitting 5 folds for each of 9 candidates, totalling 45 fits


[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed:    0.2s finished


({'knn__n_neighbors': 8}, 0.8216666666666667)

In [62]:
best_knn = knn_grid.best_estimator_

In [63]:
knnCV_accuracy = knn_grid.best_score_

In [64]:
knn_tuned_pred = best_knn.predict(X_valid_scaled)

In [65]:
knn_score = accuracy_score(knn_tuned_pred, y_valid)

# Random Forest

In [66]:
from sklearn.ensemble import RandomForestClassifier

forest = RandomForestClassifier(n_estimators=100, n_jobs=-1, 
                                random_state=17)
print(np.mean(cross_val_score(forest, X_train, y_train, cv=5)))

0.8134115795078362


In [67]:
%%time
forest_params = {'max_depth': range(2, 20),
                 'max_features': range(4, X_train.shape[1])}

forest_grid = GridSearchCV(forest, forest_params,
                           cv=5, n_jobs=-1, verbose=True)

forest_grid.fit(X_train, y_train)

forest_grid.best_params_, forest_grid.best_score_

Fitting 5 folds for each of 144 candidates, totalling 720 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 24 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done 152 tasks      | elapsed:    1.9s
[Parallel(n_jobs=-1)]: Done 402 tasks      | elapsed:    4.8s


Wall time: 8.88 s


[Parallel(n_jobs=-1)]: Done 720 out of 720 | elapsed:    8.6s finished


({'max_depth': 6, 'max_features': 8}, 0.84)

In [68]:
forest_predict = forest_grid.best_estimator_.predict(X_valid)

In [69]:
forest_score = accuracy_score(forest_predict, y_valid)

In [70]:
forestCV_accuracy = forest_grid.best_score_

In [71]:
print('Tree CV accuracy: {}'.format(round(treeCV_accuracy*100, 2)))
print('Tree score: {}'.format(round(tree_tuned_score*100, 2)))

print('\nKNN CV accuracy: {}'.format(round(knnCV_accuracy*100,2)))
print('KNN score: {}'.format(round(knn_score*100, 2)))

print('\nForest CV accuracy: {}'.format(round(forestCV_accuracy*100,2)))
print('Forest score: {}'.format(round(forest_score*100, 2)))


Tree CV accuracy: 83.0
Tree score: 81.1

KNN CV accuracy: 82.17
KNN score: 83.51

Forest CV accuracy: 84.0
Forest score: 82.47


In [72]:
forest = RandomForestClassifier(n_estimators=100, n_jobs=-1, 
                                random_state=17)

In [73]:
%%time
forest_params = {'max_depth': range(2, 20),
                 'max_features': range(4, X_train.shape[1])}
forest_grid = GridSearchCV(forest, forest_params,
                           cv=5, n_jobs=-1, verbose=True)
forest_grid.fit(X, y)

forest_grid.best_params_, forest_grid.best_score_

Fitting 5 folds for each of 144 candidates, totalling 720 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 24 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done 152 tasks      | elapsed:    1.9s
[Parallel(n_jobs=-1)]: Done 402 tasks      | elapsed:    5.0s


Wall time: 9.33 s


[Parallel(n_jobs=-1)]: Done 720 out of 720 | elapsed:    9.1s finished


({'max_depth': 4, 'max_features': 4}, 0.8338945005611672)

In [74]:
df_test.head()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Male,FamilySize,normFare,Title_Master,Title_Miss,Title_Mr,Title_Mrs,Title_Rev
0,892,3,34.5,0,0,1,1,7.8292,0,0,1,0,0
1,893,3,47.0,1,0,0,2,3.5,0,0,0,1,0
2,894,2,62.0,0,0,1,1,9.6875,0,0,1,0,0
3,895,3,27.0,0,0,1,1,8.6625,0,0,1,0,0
4,896,3,22.0,1,1,0,3,4.095833,0,0,0,1,0


In [75]:
id_test = df_test['PassengerId']

In [76]:
id_test.head()

0    892
1    893
2    894
3    895
4    896
Name: PassengerId, dtype: int64

In [77]:
df_test = df_test.drop('PassengerId', axis = 1)

In [78]:
forest_predict = forest_grid.best_estimator_.predict(df_test)

In [79]:
forest_predict

array([0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1,
       0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0,

In [80]:
sample_sub = pd.read_csv('gender_submission.csv', 
                             index_col='PassengerId')

In [81]:
sample_sub['Survived'] = forest_predict

In [82]:
sample_sub.to_csv('forest_predict.csv')

In [83]:
X_scaled = scaler.fit_transform(X)

In [84]:
X_test_scaled = scaler.transform(df_test)

In [85]:
knn_params = {'knn__n_neighbors': range(1, 10)}
knn_grid = GridSearchCV(knn_pipe, knn_params,
                        cv=5, n_jobs=-1, verbose=True)
knn_grid.fit(X_scaled, y)
knn_grid.best_params_, knn_grid.best_score_

Fitting 5 folds for each of 9 candidates, totalling 45 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 24 concurrent workers.
[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed:    0.2s finished


({'knn__n_neighbors': 7}, 0.8260381593714927)

In [86]:
best_knn = knn_grid.best_estimator_

In [87]:
knn_tuned_pred = best_knn.predict(X_test_scaled)

In [88]:
sample_sub = pd.read_csv('gender_submission.csv', 
                             index_col='PassengerId')

In [89]:
sample_sub['Survived'] = knn_tuned_pred

In [90]:
sample_sub.to_csv('kNN_predict.csv')

In [91]:
forest_predict.shape

(418,)

In [92]:
forest_grid.best_estimator_.feature_importances_

array([0.11852573, 0.05357667, 0.0292448 , 0.01006715, 0.25647121,
       0.05527675, 0.10300724, 0.01914008, 0.04095118, 0.25512997,
       0.05017329, 0.00843593])