# Random Forest Exercises

Create a new notebook, random_forests, and work with titanic data to do the following:

In [5]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix


import acquire
import prepare
import env

In [2]:
df = acquire.get_titanic_data()

In [3]:
df = prepare.prep_titanic(df)

In [4]:
train, validate, test = prepare.split_data(df)

## 1. Fit the Random Forest classifier to your training sample and transform (i.e. make predictions on the training sample) setting the random_state accordingly and setting min_samples_leaf = 1 and max_depth = 10.

In [7]:
x_train = train.drop(columns=['survived', 'passenger_id', 'sex', 'embark_town'])
y_train = train.survived

x_validate = validate.drop(columns=['survived', 'passenger_id', 'sex', 'embark_town'])
y_validate = validate.survived

x_test = test.drop(columns=['survived', 'passenger_id', 'sex', 'embark_town'])
y_test = test.survived

In [32]:
rf = RandomForestClassifier(bootstrap=True,
                           class_weight=None,
                           criterion='gini',
                           min_samples_leaf=1,
                           n_estimators=100,
                           max_depth=10,
                           random_state=311)

In [53]:
rf.fit(x_train, y_train)

In [54]:
rf.score(x_train, y_train)

0.9269662921348315

In [55]:
y_pred = rf.predict(x_train)

y_predict_proba = rf.predict_proba(x_train)

## 2. Evaluate your results using the model score, confusion matrix, and classification report.

In [56]:
prepare.evaluate_clf(rf, x_train, y_train, y_pred)

(0.9269662921348315,
           Pred 0  Pred 1
 Actual 0     318       6
 Actual 1      33     177,
                     0           1  accuracy   macro avg  weighted avg
 precision    0.905983    0.967213  0.926966    0.936598      0.930062
 recall       0.981481    0.842857  0.926966    0.912169      0.926966
 f1-score     0.942222    0.900763  0.926966    0.921493      0.925918
 support    324.000000  210.000000  0.926966  534.000000    534.000000,
                 metric       score
 0             accuracy    0.926966
 1   true_positive_rate    0.842857
 2  false_positive_rate    0.018519
 3   true_negative_rate    0.981481
 4  false_negative_rate    0.157143
 5            precision    0.967213
 6               recall    0.842857
 7             f1_score    0.900763
 8          support_pos  210.000000
 9          support_neg  324.000000)

## 3. Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

In [37]:
cm = confusion_matrix(y_train, y_pred)
prepare.get_cm_metrics(cm)

Unnamed: 0,metric,score
0,accuracy,0.926966
1,true_positive_rate,0.842857
2,false_positive_rate,0.018519
3,true_negative_rate,0.981481
4,false_negative_rate,0.157143
5,precision,0.967213
6,recall,0.842857
7,f1_score,0.900763
8,support_pos,210.0
9,support_neg,324.0


In [38]:
rf.fit(x_validate, y_validate)

In [39]:
rf.score(x_validate, y_validate)

0.9550561797752809

In [41]:
y_val_pred = rf.predict(x_validate)

y_val_predict_proba = rf.predict_proba(x_validate)

In [57]:
print('Train')

print(prepare.evaluate_clf(rf, x_train, y_train, y_pred))

print('*************************************************')

print('Validate')

print(prepare.evaluate_clf(rf, x_validate, y_validate, y_val_pred))

Train
(0.9269662921348315,           Pred 0  Pred 1
Actual 0     318       6
Actual 1      33     177,                     0           1  accuracy   macro avg  weighted avg
precision    0.905983    0.967213  0.926966    0.936598      0.930062
recall       0.981481    0.842857  0.926966    0.912169      0.926966
f1-score     0.942222    0.900763  0.926966    0.921493      0.925918
support    324.000000  210.000000  0.926966  534.000000    534.000000,                 metric       score
0             accuracy    0.926966
1   true_positive_rate    0.842857
2  false_positive_rate    0.018519
3   true_negative_rate    0.981481
4  false_negative_rate    0.157143
5            precision    0.967213
6               recall    0.842857
7             f1_score    0.900763
8          support_pos  210.000000
9          support_neg  324.000000)
*************************************************
Validate
(0.7865168539325843,           Pred 0  Pred 1
Actual 0     105       2
Actual 1       6      65,     

## 4. Run through steps increasing your min_samples_leaf and decreasing your max_depth.

In [20]:
# min_samples_leaf = 3, max_depth = 8
rf = RandomForestClassifier(bootstrap=True,
                           class_weight=None,
                           criterion='gini',
                           min_samples_leaf=3,
                           n_estimators=100,
                           max_depth=8,
                           random_state=311)

In [23]:
rf.fit(x_train, y_train)

In [25]:
rf.score(x_train, y_train)

0.8576779026217228

In [29]:
y_pred = rf.predict(x_train)

y_predict_proba = rf.predict_proba(x_train)

In [30]:
prepare.evaluate_clf(rf, x_train, y_train, y_pred)

(0.8576779026217228,
           Pred 0  Pred 1
 Actual 0     305      19
 Actual 1      57     153,
                     0           1  accuracy   macro avg  weighted avg
 precision    0.842541    0.889535  0.857678    0.866038      0.861022
 recall       0.941358    0.728571  0.857678    0.834965      0.857678
 f1-score     0.889213    0.801047  0.857678    0.845130      0.854541
 support    324.000000  210.000000  0.857678  534.000000    534.000000,
                 metric       score
 0             accuracy    0.857678
 1   true_positive_rate    0.728571
 2  false_positive_rate    0.058642
 3   true_negative_rate    0.941358
 4  false_negative_rate    0.271429
 5            precision    0.889535
 6               recall    0.728571
 7             f1_score    0.801047
 8          support_pos  210.000000
 9          support_neg  324.000000)

In [31]:
cm = confusion_matrix(y_train, y_pred)
prepare.get_cm_metrics(cm)

Unnamed: 0,metric,score
0,accuracy,0.857678
1,true_positive_rate,0.728571
2,false_positive_rate,0.058642
3,true_negative_rate,0.941358
4,false_negative_rate,0.271429
5,precision,0.889535
6,recall,0.728571
7,f1_score,0.801047
8,support_pos,210.0
9,support_neg,324.0
