## Feature Engineering - titanic dataset

In [69]:
import pandas as pd
import numpy as np

train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

print(train.shape)
print(test.shape)

(891, 12)
(418, 11)


In [70]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [71]:
train['SibSp'].value_counts()

0    608
1    209
2     28
4     18
3     16
8      7
5      5
Name: SibSp, dtype: int64

In [72]:
train['Parch'].value_counts()

0    678
1    118
2     80
5      5
3      5
4      4
6      1
Name: Parch, dtype: int64

Creating new feature - FamMembers for number of family members each passenger was with (adding SibSp and Parch columns)

In [73]:
train['FamMembers'] = train['SibSp'] + train['Parch']

In [74]:
test['FamMembers'] = test['SibSp'] + test['Parch']

Creating new feature - 0 or 1 based on whether passenger was travelling alone

In [75]:
train['LoneTravel'] = train['FamMembers'].apply(lambda x: 0 if x == 0 else 1)

In [76]:
test['LoneTravel'] = test['FamMembers'].apply(lambda x: 0 if x == 0 else 1)

Filling NaN values in Cabin column with 'Unknown'

In [77]:
cabin = train[['Cabin']] #isolate Cabin column as DF

In [78]:
cabin = cabin.fillna(value='Unknown') #fill NaN values with 'Unknown'

In [79]:
cabin = cabin.reset_index()

In [80]:
cabin.head()

Unnamed: 0,index,Cabin
0,0,Unknown
1,1,C85
2,2,Unknown
3,3,C123
4,4,Unknown


In [81]:
train = train.reset_index() #creating new index column that will align with Cabin DF index column

In [82]:
# train.head()

In [83]:
train = train.merge(cabin, how='left',on='index')

In [84]:
# train.head()
# train.shape

In [85]:
cabin = test[['Cabin']]
cabin = cabin.fillna(value='Unknown')

In [86]:
cabin = cabin.reset_index()

In [87]:
cabin.head()

Unnamed: 0,index,Cabin
0,0,Unknown
1,1,Unknown
2,2,Unknown
3,3,Unknown
4,4,Unknown


In [88]:
test = test.reset_index()

In [89]:
# test.head()

In [90]:
test = test.merge(cabin, how='left',on='index')

In [91]:
# test.head()
test.shape

(418, 15)

Label Encoding Categorical Features - Name, Sex, Ticket, Embarked and **cabin**

In [92]:
train_lab = train.copy() #creating copies of train and test sets
test_lab = test.copy()

In [93]:
train_lab = train_lab.drop(['PassengerId','Survived'],axis=1)
test_lab = test_lab.drop(['PassengerId'],axis=1)

In [94]:
all_data = pd.concat([train_lab,test_lab]).reset_index(drop=True)
all_data.shape

(1309, 14)

In [95]:
cols = ('Name','Sex','Ticket','Embarked','Cabin_y')

In [96]:
from sklearn.preprocessing import LabelEncoder

label_df = all_data.copy()

for c in cols:
    if label_df[c].dtype == 'object':
        le = LabelEncoder()
        label_df[c] = le.fit_transform(label_df[c].astype(str))

In [97]:
label_df.head()

Unnamed: 0,index,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin_x,Embarked,FamMembers,LoneTravel,Cabin_y
0,0,3,155,1,22.0,1,0,720,7.25,,2,1,1,186
1,1,1,286,0,38.0,1,0,816,71.2833,C85,0,1,1,106
2,2,3,523,0,26.0,0,0,914,7.925,,2,0,0,186
3,3,1,422,0,35.0,1,0,65,53.1,C123,2,1,1,70
4,4,3,22,1,35.0,0,0,649,8.05,,2,0,0,186


In [98]:
label_df.dtypes

index           int64
Pclass          int64
Name            int64
Sex             int64
Age           float64
SibSp           int64
Parch           int64
Ticket          int64
Fare          float64
Cabin_x        object
Embarked        int64
FamMembers      int64
LoneTravel      int64
Cabin_y         int64
dtype: object

In [99]:
all_data = label_df

In [100]:
ntrain = train.shape[0]
ntest = test.shape[0]

In [101]:
train_lab = all_data[:ntrain]
test_lab = all_data[ntrain:]

Imputing Age with MICE

In [102]:
train_mat = train_lab.select_dtypes(include=['number']).as_matrix()
test_mat = test_lab.select_dtypes(include=['number']).as_matrix()

In [103]:
from fancyimpute import MICE

mice = MICE(n_imputations=100,impute_type='col')

train_imputed = mice.complete(train_mat)
test_imputed = mice.complete(test_mat)

[MICE] Completing matrix with shape (891, 13)
[MICE] Starting imputation round 1/110, elapsed time 0.002
[MICE] Starting imputation round 2/110, elapsed time 0.003
[MICE] Starting imputation round 3/110, elapsed time 0.003
[MICE] Starting imputation round 4/110, elapsed time 0.003
[MICE] Starting imputation round 5/110, elapsed time 0.003
[MICE] Starting imputation round 6/110, elapsed time 0.003
[MICE] Starting imputation round 7/110, elapsed time 0.003
[MICE] Starting imputation round 8/110, elapsed time 0.003
[MICE] Starting imputation round 9/110, elapsed time 0.003
[MICE] Starting imputation round 10/110, elapsed time 0.003
[MICE] Starting imputation round 11/110, elapsed time 0.003
[MICE] Starting imputation round 12/110, elapsed time 0.003
[MICE] Starting imputation round 13/110, elapsed time 0.003
[MICE] Starting imputation round 14/110, elapsed time 0.003
[MICE] Starting imputation round 15/110, elapsed time 0.003
[MICE] Starting imputation round 16/110, elapsed time 0.019
[MI

[MICE] Starting imputation round 56/110, elapsed time 0.078
[MICE] Starting imputation round 57/110, elapsed time 0.078
[MICE] Starting imputation round 58/110, elapsed time 0.083
[MICE] Starting imputation round 59/110, elapsed time 0.085
[MICE] Starting imputation round 60/110, elapsed time 0.085
[MICE] Starting imputation round 61/110, elapsed time 0.085
[MICE] Starting imputation round 62/110, elapsed time 0.090
[MICE] Starting imputation round 63/110, elapsed time 0.090
[MICE] Starting imputation round 64/110, elapsed time 0.090
[MICE] Starting imputation round 65/110, elapsed time 0.090
[MICE] Starting imputation round 66/110, elapsed time 0.094
[MICE] Starting imputation round 67/110, elapsed time 0.094
[MICE] Starting imputation round 68/110, elapsed time 0.094
[MICE] Starting imputation round 69/110, elapsed time 0.094
[MICE] Starting imputation round 70/110, elapsed time 0.098
[MICE] Starting imputation round 71/110, elapsed time 0.098
[MICE] Starting imputation round 72/110,

In [104]:
train_cols = train_lab[['index','Pclass','Name','Sex','Age','SibSp','Parch','Ticket','Fare','Embarked','FamMembers','LoneTravel','Cabin_y']].columns
test_cols = test_lab[['index','Pclass','Name','Sex','Age','SibSp','Parch','Ticket','Fare','Embarked','FamMembers','LoneTravel','Cabin_y']].columns

train_imputed = pd.DataFrame(train_imputed, columns=train_cols)
test_imputed = pd.DataFrame(test_imputed, columns=test_cols)

In [105]:
print(train_imputed['Age'].isnull().value_counts()) #ensuring that there are no missing age values
print(test_imputed['Age'].isnull().value_counts())

False    891
Name: Age, dtype: int64
False    418
Name: Age, dtype: int64


In [106]:
print('Pre-Imputation Age Mean:',train['Age'].mean())
print('Post-Imputation Age Mean:',train_imputed['Age'].mean())
print('Pre-Imputation SD:',train['Age'].std())
print('Post-Imputation SD:',train_imputed['Age'].std())

Pre-Imputation Age Mean: 29.69911764705882
Post-Imputation Age Mean: 29.661681888539757
Pre-Imputation SD: 14.526497332334044
Post-Imputation SD: 13.051928588584154


Delete columns not needed from training and test sets

In [107]:
train_imputed = train_imputed.drop(['index','SibSp','Parch'],axis=1)
test_imputed = test_imputed.drop(['index','SibSp','Parch'],axis=1)

Train-test split!

In [110]:
train_y = train['Survived']

In [112]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(train_imputed, train_y, test_size=.33, random_state=43)

## Random Forest Classifier

In [113]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_jobs=-1) #first try using default hyperparameters, no class weights

rf.fit(X_train,y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [114]:
predictions = rf.predict(X_train)

In [115]:
from sklearn.metrics import confusion_matrix

confusion_matrix(y_train,predictions)

array([[364,   2],
       [  8, 222]], dtype=int64)

In [116]:
from sklearn.metrics import roc_auc_score

roc_auc_score(y_train,predictions)

0.97987645521501543

In [117]:
from sklearn.metrics import average_precision_score

average_precision_score(y_train,predictions)

0.97002219767393383

Grid search w/o class weights

In [119]:
from sklearn.model_selection import GridSearchCV

param_grid = [{'n_estimators':[150,200,350,500],'max_features':[2,3,4],'max_depth':[2,3,4,5],'random_state':[43]}]
grid_search = GridSearchCV(rf,param_grid,cv=5)
grid_search.fit(X_train,y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid=[{'n_estimators': [150, 200, 350, 500], 'max_features': [2, 3, 4], 'max_depth': [2, 3, 4, 5], 'random_state': [43]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [120]:
grid_search.best_params_

{'max_depth': 4, 'max_features': 3, 'n_estimators': 350, 'random_state': 43}

In [135]:
best_rf = RandomForestClassifier(max_depth=4,max_features=3,n_estimators=350,random_state=43) #fitting model with best params
# obtained through gridsearch
best_rf.fit(X_train,y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=4, max_features=3, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=350, n_jobs=1,
            oob_score=False, random_state=43, verbose=0, warm_start=False)

In [136]:
predictions = best_rf.predict(X_train)

In [137]:
from sklearn.metrics import confusion_matrix

confusion_matrix(y_train,predictions)

array([[345,  21],
       [ 54, 176]], dtype=int64)

In [138]:
from sklearn.metrics import roc_auc_score

roc_auc_score(y_train,predictions)

0.85392017106201012

In [139]:
from sklearn.metrics import average_precision_score

average_precision_score(y_train,predictions)

0.77425002110739005

Trying the same grid search using class weights

In [127]:
from sklearn.model_selection import GridSearchCV

param_grid = [{'n_estimators':[150,200,350,500],'max_features':[2,3,4],'max_depth':[2,3,4,5],'random_state':[43],'class_weight':['balanced']}]
grid_search = GridSearchCV(rf,param_grid,cv=5)
grid_search.fit(X_train,y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid=[{'n_estimators': [150, 200, 350, 500], 'max_features': [2, 3, 4], 'max_depth': [2, 3, 4, 5], 'random_state': [43], 'class_weight': ['balanced']}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [128]:
grid_search.best_params_

{'class_weight': 'balanced',
 'max_depth': 3,
 'max_features': 4,
 'n_estimators': 150,
 'random_state': 43}

In [129]:
best_rf2 = RandomForestClassifier(class_weight='balanced',max_depth=3,max_features=4,n_estimators=150,random_state=43,n_jobs=-1)
best_rf2.fit(X_train,y_train)

RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=3, max_features=4,
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=150, n_jobs=-1, oob_score=False, random_state=43,
            verbose=0, warm_start=False)

In [130]:
predictions = best_rf2.predict(X_train)

In [131]:
from sklearn.metrics import confusion_matrix

confusion_matrix(y_train,predictions)

array([[309,  57],
       [ 39, 191]], dtype=int64)

In [133]:
from sklearn.metrics import roc_auc_score

roc_auc_score(y_train,predictions)

0.83734853884533134

In [134]:
from sklearn.metrics import average_precision_score

average_precision_score(y_train,predictions)

0.70500496531340306

Using best random forest model - without class weights - to predict on test subset

In [140]:
predictions = best_rf.predict(X_test)

In [141]:
from sklearn.metrics import confusion_matrix

confusion_matrix(y_test,predictions)

array([[168,  15],
       [ 42,  70]], dtype=int64)

In [142]:
from sklearn.metrics import roc_auc_score

roc_auc_score(y_test,predictions)

0.77151639344262302

In [143]:
from sklearn.metrics import average_precision_score

average_precision_score(y_test,predictions)

0.6570787637088733