# Random Forest Exercises

Create a new notebook, random_forests, and work with titanic data to do the following:

In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix


import acquire
import prepare
import env

In [2]:
df = acquire.get_titanic_data()

In [3]:
df = prepare.prep_titanic(df)

In [4]:
train, validate, test = prepare.split_data(df)

## 1. Fit the Random Forest classifier to your training sample and transform (i.e. make predictions on the training sample) setting the random_state accordingly and setting min_samples_leaf = 1 and max_depth = 10.

In [5]:
x_train = train.drop(columns=['survived', 'passenger_id', 'sex', 'embark_town'])
y_train = train.survived

x_validate = validate.drop(columns=['survived', 'passenger_id', 'sex', 'embark_town'])
y_validate = validate.survived

x_test = test.drop(columns=['survived', 'passenger_id', 'sex', 'embark_town'])
y_test = test.survived

In [6]:
rf = RandomForestClassifier(bootstrap=True,
                           class_weight=None,
                           criterion='gini',
                           min_samples_leaf=1,
                           n_estimators=100,
                           max_depth=10,
                           random_state=311)

In [7]:
rf.fit(x_train, y_train)

In [8]:
rf.score(x_train, y_train)

0.9269662921348315

In [9]:
y_pred = rf.predict(x_train)

y_predict_proba = rf.predict_proba(x_train)

## 2. Evaluate your results using the model score, confusion matrix, and classification report.

In [10]:
prepare.evaluate_clf(rf, x_train, y_train, y_pred)

(0.9269662921348315,
           Pred 0  Pred 1
 Actual 0     318       6
 Actual 1      33     177,
                     0           1  accuracy   macro avg  weighted avg
 precision    0.905983    0.967213  0.926966    0.936598      0.930062
 recall       0.981481    0.842857  0.926966    0.912169      0.926966
 f1-score     0.942222    0.900763  0.926966    0.921493      0.925918
 support    324.000000  210.000000  0.926966  534.000000    534.000000,
                 metric       score
 0             accuracy    0.926966
 1   true_positive_rate    0.842857
 2  false_positive_rate    0.018519
 3   true_negative_rate    0.981481
 4  false_negative_rate    0.157143
 5            precision    0.967213
 6               recall    0.842857
 7             f1_score    0.900763
 8          support_pos  210.000000
 9          support_neg  324.000000)

## 3. Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

In [11]:
cm = confusion_matrix(y_train, y_pred)
prepare.get_cm_metrics(cm)

Unnamed: 0,metric,score
0,accuracy,0.926966
1,true_positive_rate,0.842857
2,false_positive_rate,0.018519
3,true_negative_rate,0.981481
4,false_negative_rate,0.157143
5,precision,0.967213
6,recall,0.842857
7,f1_score,0.900763
8,support_pos,210.0
9,support_neg,324.0


## 4. Run through steps increasing your min_samples_leaf and decreasing your max_depth.

In [30]:
# min_samples_leaf = 3, max_depth = 8
rf3 = RandomForestClassifier(bootstrap=True,
                           class_weight=None,
                           criterion='gini',
                           min_samples_leaf=3,
                           n_estimators=100,
                           max_depth=8,
                           random_state=311)

In [31]:
rf3.fit(x_train, y_train)

In [14]:
rf3.score(x_train, y_train)

0.8576779026217228

In [15]:
y_pred3 = rf3.predict(x_train)

y_predict_proba3 = rf3.predict_proba(x_train)

In [16]:
prepare.evaluate_clf(rf3, x_train, y_train, y_pred)

(0.8576779026217228,
           Pred 0  Pred 1
 Actual 0     318       6
 Actual 1      33     177,
                     0           1  accuracy   macro avg  weighted avg
 precision    0.905983    0.967213  0.926966    0.936598      0.930062
 recall       0.981481    0.842857  0.926966    0.912169      0.926966
 f1-score     0.942222    0.900763  0.926966    0.921493      0.925918
 support    324.000000  210.000000  0.926966  534.000000    534.000000,
                 metric       score
 0             accuracy    0.926966
 1   true_positive_rate    0.842857
 2  false_positive_rate    0.018519
 3   true_negative_rate    0.981481
 4  false_negative_rate    0.157143
 5            precision    0.967213
 6               recall    0.842857
 7             f1_score    0.900763
 8          support_pos  210.000000
 9          support_neg  324.000000)

In [32]:
# Random forest with sample leaf at 5
rf5 = RandomForestClassifier(bootstrap=True,
                           class_weight=None,
                           criterion='gini',
                           min_samples_leaf=5,
                           n_estimators=100,
                           max_depth=8,
                           random_state=311)

In [33]:
rf5.fit(x_train, y_train)

In [23]:
rf5.score(x_train, y_train)

0.8408239700374532

In [24]:
y_pred5 = rf5.predict(x_train)

y_predict_proba5 = rf5.predict_proba(x_train)

In [25]:
prepare.evaluate_clf(rf5, x_train, y_train, y_pred)

(0.8408239700374532,
           Pred 0  Pred 1
 Actual 0     318       6
 Actual 1      33     177,
                     0           1  accuracy   macro avg  weighted avg
 precision    0.905983    0.967213  0.926966    0.936598      0.930062
 recall       0.981481    0.842857  0.926966    0.912169      0.926966
 f1-score     0.942222    0.900763  0.926966    0.921493      0.925918
 support    324.000000  210.000000  0.926966  534.000000    534.000000,
                 metric       score
 0             accuracy    0.926966
 1   true_positive_rate    0.842857
 2  false_positive_rate    0.018519
 3   true_negative_rate    0.981481
 4  false_negative_rate    0.157143
 5            precision    0.967213
 6               recall    0.842857
 7             f1_score    0.900763
 8          support_pos  210.000000
 9          support_neg  324.000000)

In [26]:
cm5 = cm3 = confusion_matrix(y_train, y_pred5)
prepare.get_cm_metrics(cm5)

Unnamed: 0,metric,score
0,accuracy,0.840824
1,true_positive_rate,0.680952
2,false_positive_rate,0.055556
3,true_negative_rate,0.944444
4,false_negative_rate,0.319048
5,precision,0.888199
6,recall,0.680952
7,f1_score,0.770889
8,support_pos,210.0
9,support_neg,324.0


In [18]:
cm3 = confusion_matrix(y_train, y_pred3)
prepare.get_cm_metrics(cm3)

Unnamed: 0,metric,score
0,accuracy,0.857678
1,true_positive_rate,0.728571
2,false_positive_rate,0.058642
3,true_negative_rate,0.941358
4,false_negative_rate,0.271429
5,precision,0.889535
6,recall,0.728571
7,f1_score,0.801047
8,support_pos,210.0
9,support_neg,324.0


In [19]:
cm = confusion_matrix(y_train, y_pred)
prepare.get_cm_metrics(cm)

Unnamed: 0,metric,score
0,accuracy,0.926966
1,true_positive_rate,0.842857
2,false_positive_rate,0.018519
3,true_negative_rate,0.981481
4,false_negative_rate,0.157143
5,precision,0.967213
6,recall,0.842857
7,f1_score,0.900763
8,support_pos,210.0
9,support_neg,324.0


## 5. What are the differences in the evaluation metrics? Which performs better on your in-sample data? Why?

After making a few models, which one has the best performance (or closest metrics) on both train and validate?

In [34]:
y_val_pred1 = rf.predict(x_validate)
y_val_pred2 = rf3.predict(x_validate)
y_val_pred3 = rf5.predict(x_validate)

In [36]:
print(classification_report(y_validate, y_val_pred1))

print(classification_report(y_validate, y_val_pred2))

print(classification_report(y_validate, y_val_pred3))

              precision    recall  f1-score   support

           0       0.81      0.84      0.83       107
           1       0.75      0.70      0.72        71

    accuracy                           0.79       178
   macro avg       0.78      0.77      0.78       178
weighted avg       0.79      0.79      0.79       178

              precision    recall  f1-score   support

           0       0.80      0.87      0.83       107
           1       0.77      0.68      0.72        71

    accuracy                           0.79       178
   macro avg       0.79      0.77      0.78       178
weighted avg       0.79      0.79      0.79       178

              precision    recall  f1-score   support

           0       0.79      0.87      0.83       107
           1       0.77      0.66      0.71        71

    accuracy                           0.79       178
   macro avg       0.78      0.77      0.77       178
weighted avg       0.79      0.79      0.78       178



All three models are equal in accuracy on validate data; model 1 is slightly more precise.