# Titanic: Random Forest Classifier

### 1 - Import the libraries and data files

In [None]:
import pandas as pd

# Import the class of the machine learning model
from sklearn.ensemble import RandomForestClassifier

# Import GridSearchCV for finding the optimal hyperparameters
# (i.e., the configuration of the model)
from sklearn.model_selection import GridSearchCV

# Import metrics for measuring performance of the model
from sklearn.metrics import accuracy_score, precision_score, recall_score

In [None]:
# Load each split as a (features, labels) pair. The files are read in the
# order train -> validation -> test, features before labels.
trainF, trainL = (
    pd.read_csv('./train_features.csv'),
    pd.read_csv('./train_labels.csv'),
)

valF, valL = (
    pd.read_csv('./val_features.csv'),
    pd.read_csv('./val_labels.csv'),
)

testF, testL = (
    pd.read_csv('./test_features.csv'),
    pd.read_csv('./test_labels.csv'),
)

### 2 - Find and configure the best hyperparameters

In [None]:
# ::: Definitions :::

# Parameters:
# Parameters are the values the model learns from the training data
# (for example, the weights of features). This is what the model has to find.

# Hyperparameters:
# Hyperparameters are the configurations for the model. This is what
# the data scientist must determine.

In [None]:
def printResults(gridSearchResults):
    """Print the best hyperparameters and the score of every combination.

    Args:
        gridSearchResults: A fitted search object that exposes
            ``best_params_`` and ``cv_results_`` (with the keys
            ``'mean_test_score'``, ``'std_test_score'`` and ``'params'``),
            e.g. a fitted ``GridSearchCV``.

    Returns:
        None. Everything is written to standard output: one line with the
        best hyperparameters, a blank line, then one line per combination.
    """
    print('Best ML Hyperparameters: {}\n'.format(gridSearchResults.best_params_))

    results = gridSearchResults.cv_results_
    scores_per_combination = zip(
        results['mean_test_score'],
        results['std_test_score'],
        results['params'],
    )

    for mean_score, std_score, parameters in scores_per_combination:
        # The "+/-" figure is two standard deviations of the test score.
        print(
            '{} (+/-{}) for {}'.format(
                round(mean_score, 3),
                round(std_score * 2, 3),
                parameters,
            )
        )



In [None]:
# Create the RandomForestClassifier instance to tune. A fixed random_state
# makes the randomness used while building the forest repeatable, so
# re-running this cell reports the same scores and the same best
# hyperparameters.
rf_model = RandomForestClassifier(random_state=42)

hyperparameters_to_try = {
    'n_estimators': [5, 50, 100],  # number of decision trees in the forest
    'max_depth': [2, 10, 20, None],  # maximum depth of each tree; None = no limit
}

# GridSearchCV evaluates every combination of the hyperparameters above
# with 5-fold cross-validation (cv=5) and keeps the scores of each one.
gridSearch = GridSearchCV(rf_model, hyperparameters_to_try, cv=5, verbose=0)

# .fit() trains the model on the training features and labels.
# .values.ravel() flattens the labels into a 1-D array
# (presumably the labels file holds a single column; verify against the CSV).
gridSearch.fit(trainF, trainL.values.ravel())

# Print the results with clear formatting
printResults(gridSearch)

Best ML Hyperparameters: {'max_depth': 20, 'n_estimators': 100}

0.782 (+/-0.105) for {'max_depth': 2, 'n_estimators': 5}
0.805 (+/-0.099) for {'max_depth': 2, 'n_estimators': 50}
0.788 (+/-0.069) for {'max_depth': 2, 'n_estimators': 100}
0.805 (+/-0.083) for {'max_depth': 10, 'n_estimators': 5}
0.831 (+/-0.048) for {'max_depth': 10, 'n_estimators': 50}
0.831 (+/-0.059) for {'max_depth': 10, 'n_estimators': 100}
0.807 (+/-0.07) for {'max_depth': 20, 'n_estimators': 5}
0.816 (+/-0.076) for {'max_depth': 20, 'n_estimators': 50}
0.833 (+/-0.073) for {'max_depth': 20, 'n_estimators': 100}
0.784 (+/-0.05) for {'max_depth': None, 'n_estimators': 5}
0.812 (+/-0.067) for {'max_depth': None, 'n_estimators': 50}
0.824 (+/-0.085) for {'max_depth': None, 'n_estimators': 100}


### 3 - Comparison of top 3 models

### 4 - Predict future passengers