# Random Forest Classifier for Titanic

* <b>PassengerID</b>: random ID assigned to passenger
* <b>Pclass</b>: Ticket class of passenger (i.e., 1st class, 2nd class, and 3rd class)
* <b>Name</b>: Name of the passenger
* <b>Sex</b>: Male or Female
* <b>Age</b>: Age of passenger
* <b>SibSp</b>: Number of siblings and spouses accompanying passenger
* <b>Parch</b>: Number of parents and children accompanying passenger
* <b>Ticket</b>: Ticket number of passenger
* <b>Fare</b>: Fare paid for ticket
* <b>Cabin</b>: Cabin number of passenger
* <b>Embarked</b>: Port from which passenger embarked
* <b>Survived</b>: 0 and 1 for died and survived, respectively

### Importing the Data and Libraries

In [28]:
# Import the libraries
import pandas as pd

# Import the ml model
from sklearn.ensemble import RandomForestClassifier

# Import GridSearchCV for finding the model with the best parameters
from sklearn.model_selection import GridSearchCV

# Import functions for measuring metrics of the ml model
from sklearn.metrics import accuracy_score, precision_score, recall_score

In [29]:
# Load the Train, Validation, and Test data sets.
# Each split is stored as two CSV files: one holding the features and one
# holding the labels, so they are read together as a (features, labels) pair.
trainF, trainL = (
    pd.read_csv('./train_features.csv'),
    pd.read_csv('./train_labels.csv'),
)

valF, valL = (
    pd.read_csv('./validation_features.csv'),
    pd.read_csv('./validation_labels.csv'),
)

testF, testL = (
    pd.read_csv('./test_features.csv'),
    pd.read_csv('./test_labels.csv'),
)


### Configuring the Hyperparameters

In [30]:
def printResults(gridSearchResults):
    """Print the best hyperparameters and a score summary per candidate.

    Args:
        gridSearchResults: A fitted search object exposing ``best_params_``
            and ``cv_results_`` (for example a fitted ``GridSearchCV``).

    The first line reports ``best_params_``. After a blank line, one line is
    printed per candidate: the mean test score, twice the standard deviation
    of the test score (both rounded to 3 decimals), and the parameters.
    """
    print('Best ML Hyperparameters: {}\n'.format(gridSearchResults.best_params_))

    cvResults = gridSearchResults.cv_results_
    candidates = zip(
        cvResults['mean_test_score'],
        cvResults['std_test_score'],
        cvResults['params'],
    )

    for meanScore, stdScore, candidateParams in candidates:
        # The spread is reported as two standard deviations.
        spread = round(stdScore * 2, 3)
        print('{} (+/-{}) for {}'.format(round(meanScore, 3), spread, candidateParams))

In [31]:
# Instantiate the model and set the hyperparameters.
# random_state is fixed so the randomness inside the forest (bootstrap
# samples, feature sub-sampling) is repeatable; without it every run of this
# cell can rank the candidates differently.
rfModel = RandomForestClassifier(random_state=42)

# Candidate values; GridSearchCV evaluates every combination of them.
hyperparameters = {
    'n_estimators': [5, 50, 100],
    'max_depth': [2, 10, 20, None],
}

# GridSearchCV will search and rank the models according to their parameters,
# scoring each candidate with 5-fold cross-validation (cv=5).
# Note:
# sklearn refers to 'hyperparameters' as 'parameters'
gridSearch = GridSearchCV(rfModel, hyperparameters, cv=5)

# .fit() runs the search on the training data.
# .values.ravel() flattens the label frame into a 1-D array
# (presumably trainL holds a single label column -- confirm).
gridSearch.fit(trainF, trainL.values.ravel())

printResults(gridSearch)

Best ML Hyperparameters: {'max_depth': 20, 'n_estimators': 5}

0.769 (+/-0.081) for {'max_depth': 2, 'n_estimators': 5}
0.788 (+/-0.075) for {'max_depth': 2, 'n_estimators': 50}
0.795 (+/-0.047) for {'max_depth': 2, 'n_estimators': 100}
0.81 (+/-0.06) for {'max_depth': 10, 'n_estimators': 5}
0.816 (+/-0.025) for {'max_depth': 10, 'n_estimators': 50}
0.827 (+/-0.041) for {'max_depth': 10, 'n_estimators': 100}
0.833 (+/-0.043) for {'max_depth': 20, 'n_estimators': 5}
0.816 (+/-0.026) for {'max_depth': 20, 'n_estimators': 50}
0.816 (+/-0.025) for {'max_depth': 20, 'n_estimators': 100}
0.79 (+/-0.03) for {'max_depth': None, 'n_estimators': 5}
0.812 (+/-0.038) for {'max_depth': None, 'n_estimators': 50}
0.81 (+/-0.037) for {'max_depth': None, 'n_estimators': 100}


### Comparison of Top 3 Models

In [32]:
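# Top 3 candidates recorded from an earlier grid-search run.
# NOTE: these scores do not appear in the grid-search output printed above;
# re-check the ranking after re-running the search.
# 0.826 (+/-0.028) for {'max_depth': None, 'n_estimators': 50}
# 0.825 (+/-0.04) for {'max_depth': 10, 'n_estimators': 5}
# 0.825 (+/-0.056) for {'max_depth': 10, 'n_estimators': 100}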

In [33]:
# Refit the three candidate configurations on the training set.
# random_state is fixed on each forest so the fitted models, and every
# metric computed from them, are the same on each run of the notebook.
rfModel1 = RandomForestClassifier(n_estimators=50, max_depth=None, random_state=42)
rfModel1.fit(trainF, trainL.values.ravel())

rfModel2 = RandomForestClassifier(n_estimators=5, max_depth=10, random_state=42)
rfModel2.fit(trainF, trainL.values.ravel())

rfModel3 = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42)
rfModel3.fit(trainF, trainL.values.ravel())

RandomForestClassifier(max_depth=10)

In [34]:
# Score each of the top 3 models on the validation set and print its
# accuracy, precision, and recall so the best of the 3 can be picked.

for rfML in (rfModel1, rfModel2, rfModel3):

    predictedLabels = rfML.predict(valF)

    # Same call shape for all three metrics: metric(true labels, predictions),
    # rounded to 3 decimals.
    accuracy, precision, recall = (
        round(metric(valL, predictedLabels), 3)
        for metric in (accuracy_score, precision_score, recall_score)
    )

    # Print the model's hyperparameters, its metrics, and a separator line
    print(
        'Max depth: {} and Estimators: {}'.format(rfML.max_depth, rfML.n_estimators),
        'Accuracy: {}, Precision: {}, Recall: {}'.format( accuracy, precision, recall ),
        '-------------------------------------------------',
        sep='\n'
    )

Max depth: None and Estimators: 50
Accuracy: 0.753, Precision: 0.71, Recall: 0.671
-------------------------------------------------
Max depth: 10 and Estimators: 5
Accuracy: 0.781, Precision: 0.766, Recall: 0.671
-------------------------------------------------
Max depth: 10 and Estimators: 100
Accuracy: 0.77, Precision: 0.742, Recall: 0.671
-------------------------------------------------


In [35]:
# Choose the best model and apply it to the test dataset
bestModel = rfModel3
predictedLabels = bestModel.predict( testF )

# Find the metrics for the model: (a) accuracy, (b) precision, and (c) recall
accuracy = round(accuracy_score(testL, predictedLabels), 3)
precision = round(precision_score(testL, predictedLabels), 3)
recall = round(recall_score(testL, predictedLabels), 3)

# Print the metrics for the chosen model.
# The hyperparameters are read from the model that made the predictions,
# not from the loop variable left over by the validation cell.
print(
    'Max depth: {} and Estimators: {}'.format(bestModel.max_depth, bestModel.n_estimators)
)
print(
    'Accuracy: {}, Precision: {}, Recall: {}'.format( accuracy, precision, recall )
)
print('-------------------------------------------------')

Max depth: 10 and Estimators: 100
Accuracy: 0.837, Precision: 0.812, Recall: 0.754
-------------------------------------------------


In [36]:
# Two hand-written passengers to score with the trained model, one row each.
# NOTE(review): Sex and Embarked are given as numeric codes -- confirm they
# match the encoding used in the training features.
featureNames = (
    'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'cabin_ind', 'Embarked',
)
passengerRows = (
    (1, 'Rose Bukater', 1, 17, 0, 2, 50, 1, 2),
    (3, 'Jack Dawson', 0, 20, 0, 0, 0, 0, 2),
)

# Transpose the rows into the column -> list-of-values mapping
newDataFeatures = {
    featureName: list(columnValues)
    for featureName, columnValues in zip(featureNames, zip(*passengerRows))
}

newDataFeaturesDf = pd.DataFrame(newDataFeatures)
newDataFeaturesDf.head()

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Fare,cabin_ind,Embarked
0,1,Rose Bukater,1,17,0,2,50,1,2
1,3,Jack Dawson,0,20,0,0,0,0,2


### Predict 'Survived' for Rose and Jack

In [37]:
# Predict 'Survived' for Rose and Jack.
# Build the model input as a new frame instead of dropping in place:
# 'Name' is for reference only, and a 'Survived' column left by an earlier
# run of this cell is removed as well, so the cell can be re-run without
# raising a KeyError or feeding an old prediction back in as a feature.
modelInput = newDataFeaturesDf.drop(columns=['Name', 'Survived'], errors='ignore')

predictedLabels = rfModel3.predict( modelInput )

# Append the label (Survived) to the feature columns
newDataFeaturesDf = modelInput.assign(Survived=predictedLabels)

In [38]:
# Display the passengers' features together with the predicted 'Survived' label
newDataFeaturesDf.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,cabin_ind,Embarked,Survived
0,1,1,17,0,2,50,1,2,1
1,3,0,20,0,0,0,0,2,0
