In [18]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score

import warnings
warnings.filterwarnings('ignore',category=FutureWarning)

### Fitting a basic model using cross validation

In [3]:
train_features = pd.read_csv('./train_features.csv')
train_labels = pd.read_csv('./train_labels.csv',header=None)

In [13]:
#  a method to print mean standard deviation across 5 forest.
def print_results(results):
    print('Best params: {}\n'.format(results.best_params_))
    
    means = results.cv_results_['mean_test_score']
    stds = results.cv_results_['std_test_score']
    for mean, std, params in zip(means,stds, results.cv_results_['params']):
        print("{} (+/-{}) for {}".format(round(mean,3),round(std,3),params))

In [16]:
# Identifying best Hyperparameters using GridSearchCV
rf = RandomForestClassifier()
parameters = {
    'n_estimators': [5, 50, 100],
    'max_depth': [2,10,20,None]
}

cv = GridSearchCV(rf,parameters,cv=5)
cv.fit(train_features,train_labels.values.ravel())

print_results(cv)


Best params: {'max_depth': 10, 'n_estimators': 50}

0.788 (+/-0.062) for {'max_depth': 2, 'n_estimators': 5}
0.796 (+/-0.059) for {'max_depth': 2, 'n_estimators': 50}
0.801 (+/-0.066) for {'max_depth': 2, 'n_estimators': 100}
0.801 (+/-0.044) for {'max_depth': 10, 'n_estimators': 5}
0.828 (+/-0.024) for {'max_depth': 10, 'n_estimators': 50}
0.815 (+/-0.025) for {'max_depth': 10, 'n_estimators': 100}
0.801 (+/-0.035) for {'max_depth': 20, 'n_estimators': 5}
0.813 (+/-0.024) for {'max_depth': 20, 'n_estimators': 50}
0.815 (+/-0.014) for {'max_depth': 20, 'n_estimators': 100}
0.801 (+/-0.029) for {'max_depth': None, 'n_estimators': 5}
0.818 (+/-0.019) for {'max_depth': None, 'n_estimators': 50}
0.813 (+/-0.013) for {'max_depth': None, 'n_estimators': 100}




### Evalute results on validation set

In [20]:
tr_features = pd.read_csv('./train_features.csv')
tr_labels = pd.read_csv('./train_labels.csv',header=None)

val_features = pd.read_csv('./val_features.csv')
val_labels = pd.read_csv('./val_labels.csv',header=None)

te_features = pd.read_csv('./test_features.csv')
te_labels = pd.read_csv('./test_labels.csv',header=None)

From the above GridSearcCV results we will take best 3 hyperparameters combinations

- 0.828 (+/-0.024) for {'max_depth': 10, 'n_estimators': 50}
- 0.818 (+/-0.019) for {'max_depth': None, 'n_estimators': 50}
- 0.815 (+/-0.025) for {'max_depth': 10, 'n_estimators': 100}

and refit these models on full training set

In [23]:
rf1 = RandomForestClassifier(n_estimators=50, max_depth=10)
rf1.fit(tr_features,tr_labels.values.ravel())

rf2 = RandomForestClassifier(n_estimators=50, max_depth=None)
rf2.fit(tr_features,tr_labels.values.ravel())

rf3 = RandomForestClassifier(n_estimators=100, max_depth=10)
rf3.fit(tr_features,tr_labels.values.ravel())

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=10, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

### Evaluate the three models

In [26]:
for mdl in [rf1,rf2,rf3]:
    y_pred = mdl.predict(val_features)
    accuracy = round(accuracy_score(val_labels,y_pred),3)
    precision = round(precision_score(val_labels,y_pred),3)
    recall = round(recall_score(val_labels,y_pred),3)
    print('Max Depth: {} / # of Estimate: {} : Accuracy: {} / Precision: {} / Recall: {}'.format(mdl.max_depth,
                                                                                                mdl.n_estimators,
                                                                                                accuracy,
                                                                                                precision,
                                                                                                recall))

Max Depth: 10 / # of Estimate: 50 : Accuracy: 0.827 / Precision: 0.846 / Recall: 0.724
Max Depth: None / # of Estimate: 50 : Accuracy: 0.821 / Precision: 0.824 / Recall: 0.737
Max Depth: 10 / # of Estimate: 100 : Accuracy: 0.832 / Precision: 0.848 / Recall: 0.737


#### Observations
We can observe that the model that perform best on cross validation, actually didn't perform well on validation set, third model with 100 estimators and 10 max_depth perform best here. 

### Evaluting best model on test set

In [28]:
y_pred = rf3.predict(te_features)
accuracy = round(accuracy_score(te_labels,y_pred),3)
precision = round(precision_score(te_labels,y_pred),3)
recall = round(recall_score(te_labels,y_pred),3)
print('Max Depth: {} / # of Estimate: {} : Accuracy: {} / Precision: {} / Recall: {}'.format(rf3.max_depth, rf3.n_estimators,accuracy,precision,recall))

Max Depth: 10 / # of Estimate: 100 : Accuracy: 0.798 / Precision: 0.754 / Recall: 0.662



**Model with 100 estimators and max depth 10 is 79.8% accurate with 75.4% precision and 66.2% recall.**