### Random Forest Classifier

In [39]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
%matplotlib inline

In [2]:
# Read data/train.csv (path relative to the working directory) into a DataFrame.
df = pd.read_csv(filepath_or_buffer='data/train.csv')


#### Feature Selection - First Cut
##### Drop: PassengerId, Survived (the target), Name, Cabin, Embarked, Ticket
##### Keep: Pclass, Sex, Age, SibSp, Parch, Fare

In [4]:
# First-cut feature matrix: remove the target (Survived) and the columns
# excluded from this first cut; every other column of df is kept.
predict_features = df.drop(
    columns=[
        'PassengerId',
        'Survived',
        'Name',
        'Ticket',
        'Cabin',
        'Embarked',
    ]
)
# Summary statistics of the numeric columns that remain.
predict_features.describe()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare
count,891.0,714.0,891.0,891.0,891.0
mean,2.308642,29.699118,0.523008,0.381594,32.204208
std,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.42,0.0,0.0,0.0
25%,2.0,20.125,0.0,0.0,7.9104
50%,3.0,28.0,0.0,0.0,14.4542
75%,3.0,38.0,1.0,0.0,31.0
max,3.0,80.0,8.0,6.0,512.3292


##### Replace missing ages with the mean of the recorded (non-missing) ages


In [8]:
# Mean of the recorded ages. Series.mean() skips NaN by default, so the
# missing entries do not influence the value.
age_column = predict_features['Age']
mean_age = age_column.mean()
mean_age

29.69911764705882

In [26]:
# Impute: every missing Age becomes mean_age; recorded ages are left as-is.
predict_features['Age'] = predict_features['Age'].fillna(mean_age)

In [25]:
# One-hot encode the object/category-typed columns (pandas' default choice);
# all other columns pass through unchanged.
one_hot_training_predictors = pd.get_dummies(data=predict_features)

In [22]:
# Target vector: the Survived column of the original frame.
y_predict = df.loc[:, 'Survived']

In [35]:
# Hold out a validation set using train_test_split's default split size;
# the fixed random_state keeps the partition reproducible between runs.
train_X, val_X, train_y, val_y = train_test_split(
    one_hot_training_predictors,
    y_predict,
    random_state=214,
)
# (rows, columns) of the validation feature matrix.
val_X.shape

(223, 7)

In [24]:
# Baseline forest.
# n_estimators is pinned to 10 instead of relying on the library default:
# the results recorded in this notebook for the baseline (confusion matrix
# 124/18/30/51, i.e. accuracy 175/223) are exactly the 10-tree row of the
# tree-count sweep further down, so they were produced with a default of 10.
# scikit-learn 0.22 changed that default to 100, which would silently change
# every baseline metric on a re-run with a newer version.
rf_classifier = RandomForestClassifier(n_estimators=10, random_state=1)
rf_classifier.fit(train_X, train_y)
# Predicted class labels for the held-out validation rows.
survivor_preds = rf_classifier.predict(val_X)
survivor_preds

array([1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0,
       0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1,
       0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1,
       0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1,
       0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1,
       0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1,
       0, 1, 0], dtype=int64)

In [30]:
# Compute the validation confusion matrix once and reuse it for both the
# printed table and the unpacked counts (rows = actual, columns = predicted).
val_confusion = confusion_matrix(val_y, survivor_preds)
print(val_confusion)
# Flattening the 2x2 matrix row by row yields tn, fp, fn, tp in that order.
tn, fp, fn, tp = val_confusion.ravel()
(tn, fp, fn, tp)

[[124  18]
 [ 30  51]]


(124, 18, 30, 51)

In [40]:
# F1 of the baseline predictions on the validation set.
f1_score(y_true=val_y, y_pred=survivor_preds)

0.68

In [32]:
# Recall and precision of the positive class, derived from the counts that
# were unpacked from the confusion matrix.
recall = tp / (tp + fn)        # share of actual positives that were found
precision = tp / (tp + fp)     # share of predicted positives that were right
print("Recall= ", recall)
print("Precision= ", precision)

Recall=  0.6296296296296297
Precision=  0.7391304347826086


#### Try different numbers of trees

In [43]:
# Sweep the forest size and print the validation accuracy for each setting.
#
# The loop uses its own names (sweep_*) so it does not overwrite the baseline
# rf_classifier / survivor_preds / tn, fp, fn, tp that the metric cells
# earlier in the notebook read. Without that, re-running one of those cells
# after this sweep would silently report the last (4000-tree) model instead
# of the baseline.
tree_options = [10, 30, 50, 100, 200, 500, 1000, 2000, 4000]
for trees in tree_options:
    print("Tree count= ", trees)
    # n_estimators passed by keyword so the meaning of the number is explicit.
    sweep_classifier = RandomForestClassifier(n_estimators=trees, random_state=1)
    sweep_classifier.fit(train_X, train_y)
    sweep_preds = sweep_classifier.predict(val_X)
    sweep_confusion = confusion_matrix(val_y, sweep_preds)
    # The diagonal holds the correctly classified rows (tn + tp), so
    # trace / number of validation rows is the accuracy.
    print("Accuracy: ", sweep_confusion.trace() / val_X.shape[0])

Tree count=  10
Accuracy:  0.7847533632286996
Tree count=  30
Accuracy:  0.7892376681614349
Tree count=  50
Accuracy:  0.7892376681614349
Tree count=  100
Accuracy:  0.8026905829596412
Tree count=  200
Accuracy:  0.8026905829596412
Tree count=  500
Accuracy:  0.8071748878923767
Tree count=  1000
Accuracy:  0.8071748878923767
Tree count=  2000
Accuracy:  0.8071748878923767
Tree count=  4000
Accuracy:  0.8071748878923767


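#### Best tree count = 500 (the smallest count that reaches the peak validation accuracy of 0.807; larger forests give no further gain)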

In [45]:
# Refit with the forest size picked from the sweep and report its accuracy
# on the validation set.
rf_classifier = RandomForestClassifier(n_estimators=500, random_state=1).fit(train_X, train_y)
survivor_preds = rf_classifier.predict(val_X)
tn, fp, fn, tp = confusion_matrix(val_y, survivor_preds).ravel()
n_validation_rows = val_X.shape[0]
# Accuracy = correctly classified rows / all validation rows.
print("Accuracy: ", (tn + tp) / n_validation_rows)

Accuracy:  0.8071748878923767
