### Random Forest Classifier

In [81]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
%matplotlib inline

In [82]:
# Load the training data; the path is relative to the notebook's working directory.
train_csv_path = 'data/train.csv'
df = pd.read_csv(train_csv_path)

#### Feature Selection - First Cut
##### Drop: PassengerId, Survived (the target, kept separately as the label), Name, Cabin, Embarked, Ticket
##### Keep: Pclass, Sex, Age, SibSp, Parch, Fare

In [83]:
# First-cut feature set: remove identifiers, free-text columns and the target.
dropped_columns = ['PassengerId', 'Survived', 'Name', 'Ticket', 'Cabin', 'Embarked']
predict_features = df.drop(columns=dropped_columns)
predict_features.describe()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare
count,891.0,714.0,891.0,891.0,891.0
mean,2.308642,29.699118,0.523008,0.381594,32.204208
std,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.42,0.0,0.0,0.0
25%,2.0,20.125,0.0,0.0,7.9104
50%,3.0,28.0,0.0,0.0,14.4542
75%,3.0,38.0,1.0,0.0,31.0
max,3.0,80.0,8.0,6.0,512.3292


In [84]:
# Peek at the first 20 rows of the selected features.
predict_features.iloc[:20]

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare
0,3,male,22.0,1,0,7.25
1,1,female,38.0,1,0,71.2833
2,3,female,26.0,0,0,7.925
3,1,female,35.0,1,0,53.1
4,3,male,35.0,0,0,8.05
5,3,male,,0,0,8.4583
6,1,male,54.0,0,0,51.8625
7,3,male,2.0,3,1,21.075
8,3,female,27.0,0,2,11.1333
9,2,female,14.0,1,0,30.0708


In [85]:
# Impute missing ages with the mean age of the passenger's own Pclass.
# groupby(...).mean() ignores NaN, so each class mean is taken over the
# passengers whose age is known.
mean_age_by_class = predict_features.groupby('Pclass')['Age'].mean()
for pclass, mean_age in mean_age_by_class.items():
    print("Mean age of P%s class: " % pclass, mean_age)

# Map every row's Pclass to its class mean and use it only where Age is missing.
# This covers whatever classes are present instead of hard-coding 1, 2 and 3.
predict_features['Age'] = predict_features['Age'].fillna(
    predict_features['Pclass'].map(mean_age_by_class)
)


Mean age of P1 class:  38.233440860215055
Mean age of P2 class:  29.87763005780347
Mean age of P3 class:  25.14061971830986


In [88]:
# One-hot encode the categorical Sex column. drop_first=True drops the first
# category so a single indicator column remains.
predict_features = pd.get_dummies(
    predict_features,
    columns=['Sex'],
    drop_first=True,
)
predict_features.head()


Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Sex_male
0,3,22.0,1,0,7.25,1
1,1,38.0,1,0,71.2833,0
2,3,26.0,0,0,7.925,0
3,1,35.0,1,0,53.1,0
4,3,35.0,0,0,8.05,1


In [65]:
# Target labels: despite the name, y_predict holds the known Survived values,
# not model predictions.
y_predict = df.loc[:, 'Survived']

In [71]:
# Split features and labels into training and validation parts; the fixed
# random_state makes the split repeatable.
split_parts = train_test_split(predict_features, y_predict, random_state=214)
train_X, val_X, train_y, val_y = split_parts
val_X.shape

(223, 6)

In [72]:
# Baseline forest with default hyper-parameters and a fixed seed.
# fit() returns the estimator itself, so construction and training can be chained.
rf_classifier = RandomForestClassifier(random_state=1).fit(train_X, train_y)
survivor_preds = rf_classifier.predict(val_X)
survivor_preds

array([1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0,
       0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1,
       0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1,
       0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1,
       0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1,
       0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0,
       1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1,
       1, 0, 1], dtype=int64)

In [73]:
# Build the validation confusion matrix once, show it, then unpack its four
# cells (row-major order: tn, fp, fn, tp for binary labels).
val_confusion = confusion_matrix(val_y, survivor_preds)
print(val_confusion)
tn, fp, fn, tp = val_confusion.ravel()
(tn, fp, fn, tp)

[[122  20]
 [ 27  54]]


(122, 20, 27, 54)

In [68]:
# F1 score of the current predictions on the validation labels.
f1_score(y_true=val_y, y_pred=survivor_preds)

0.7307692307692308

In [43]:
# Recall = TP / (TP + FN) and precision = TP / (TP + FP), using the counts
# unpacked from the most recently computed confusion matrix.
recall = tp / (tp + fn)
precision = tp / (tp + fp)
print("Recall= ", recall)
print("Precision= ", precision)

Recall=  0.6666666666666666
Precision=  0.7605633802816901


#### Try different number of trees

In [79]:
# Sweep the number of trees and report validation accuracy for each setting.
# The sweep uses its own names (sweep_*), so it does not overwrite the
# `rf_classifier`, `survivor_preds` and `tn, fp, fn, tp` globals that other
# metric cells read.
tree_options = [10,30,50,100,200,500,600,1000,2000,4000]
accuracy_by_trees = {}
for trees in tree_options:
    print("Tree count= ", trees)
    sweep_classifier = RandomForestClassifier(n_estimators=trees, random_state=1)
    sweep_classifier.fit(train_X, train_y)
    sweep_preds = sweep_classifier.predict(val_X)
    sweep_tn, sweep_fp, sweep_fn, sweep_tp = confusion_matrix(val_y, sweep_preds).ravel()
    # Accuracy = correctly classified rows / all validation rows.
    accuracy_by_trees[trees] = (sweep_tn + sweep_tp) / val_X.shape[0]
    print("Accuracy: ", accuracy_by_trees[trees])

# Record the winner explicitly instead of reading it off the printed list;
# on ties, max() keeps the first (smallest) tree count.
best_trees = max(accuracy_by_trees, key=accuracy_by_trees.get)
print("Best tree count= ", best_trees, " accuracy= ", accuracy_by_trees[best_trees])

Tree count=  10
Accuracy:  0.7892376681614349
Tree count=  30
Accuracy:  0.7982062780269058
Tree count=  50
Accuracy:  0.7937219730941704
Tree count=  100
Accuracy:  0.8161434977578476
Tree count=  200
Accuracy:  0.8116591928251121
Tree count=  500
Accuracy:  0.8071748878923767
Tree count=  600
Accuracy:  0.8116591928251121
Tree count=  1000
Accuracy:  0.8116591928251121
Tree count=  2000
Accuracy:  0.8116591928251121
Tree count=  4000
Accuracy:  0.8116591928251121


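#### Chosen tree count = 600 (note: 100 trees scored highest in the sweep above, 0.8161; accuracy is stable at 0.8117 from 600 trees upward)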

In [80]:
# Refit with 600 trees and report validation accuracy.
rf_classifier = RandomForestClassifier(n_estimators=600, random_state=1)
rf_classifier.fit(train_X, train_y)
survivor_preds = rf_classifier.predict(val_X)
tn, fp, fn, tp = confusion_matrix(val_y, survivor_preds).ravel()
correct_count = tn + tp
print("Accuracy: ", correct_count / val_X.shape[0])

Accuracy:  0.8116591928251121


##### Accuracy with missing ages imputed from the mean age of each Pclass: 0.81166
##### Accuracy with missing ages imputed from the mean of all non-missing ages: 0.80717