In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import os
from env import get_connection


from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, plot_confusion_matrix
from sklearn.ensemble import RandomForestClassifier

import acquire
import prepare

####  Note
Ensemble: A technique that combines many models to make predictions

Bootstrapping: Random sampling (with replacement) of the training dataset, using a different sample to train each tree

Aggregation: The combination of predictions from all trees to make one final prediction

Bagging: Technique that combines bootstrapping and aggregation

#### Create a new notebook, random_forests, and work with titanic data to do the following:

1. Fit the Random Forest classifier to your training sample and transform (i.e. make predictions on the training sample) setting the random_state accordingly and setting min_samples_leaf = 1 and max_depth = 10.

2. Evaluate your results using the model score, confusion matrix, and classification report.

3. Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

4. Run through steps increasing your min_samples_leaf and decreasing your max_depth.

5. What are the differences in the evaluation metrics? Which performs better on your in-sample data? Why?

After making a few models, which one has the best performance (or closest metrics) on both train and validate?

In [2]:
# Load the raw Titanic dataset through the project-local `acquire` module.
# NOTE(review): where the data actually comes from (database, cached file, ...)
# is decided inside acquire.get_titanic_data and is not visible in this notebook.
titanic_original = acquire.get_titanic_data()

In [3]:
# Clean the raw frame with the project-local prepare.prep_titanic helper.
# The result is kept under a new name so the raw frame stays available.
# NOTE(review): the exact cleaning/encoding steps live in the prepare module;
# the preview in the next cell shows only numeric and dummy-encoded columns.
titanic_clean = prepare.prep_titanic(titanic_original)

In [4]:
# Preview the first five rows of the cleaned data as a sanity check.
titanic_clean.head(n=5)

Unnamed: 0,survived,pclass,sibsp,parch,fare,alone,sex_male,embark_town_Queenstown,embark_town_Southampton
0,0,3,1,0,7.25,0,1,0,1
1,1,1,1,0,71.2833,0,0,0,0
2,1,3,0,0,7.925,1,0,0,1
3,1,1,1,0,53.1,0,0,0,1
4,0,3,0,0,8.05,1,1,0,1


In [5]:
# Split the cleaned data into train / validate / test subsets with the project helper.
# 'survived' is passed as the second argument (presumably the stratification
# target -- confirm in the prepare module).
# NOTE(review): the recorded shapes below are 133 / 134 / 624 rows, i.e. the
# variable named `train` is the SMALLEST subset and `test` the largest. Verify the
# helper's return order and split proportions; the models below are fit on only
# 133 rows as a result.
train, val, test = prepare.train_validate_test_split(titanic_clean,'survived')
train.shape, val.shape, test.shape

((133, 9), (134, 9), (624, 9))

In [6]:
# Separate the feature matrix (X) from the target column (y) for every subset.
# The target is 'survived'; everything else is used as a feature.
X_train = train.drop('survived', axis='columns')
y_train = train.loc[:, 'survived']

X_val = val.drop('survived', axis='columns')
y_val = val.loc[:, 'survived']

X_test = test.drop('survived', axis='columns')
y_test = test.loc[:, 'survived']

In [7]:
# Model 1: random forest with min_samples_leaf=1 and max_depth=10.
# The seed is defined once and passed to random_state, so the constant and the
# model cannot silently drift apart (previously `seed` was defined but unused).
seed = 42

rf = RandomForestClassifier(
    min_samples_leaf=1,
    max_depth=10,
    random_state=seed,
    # with the default bootstrap=True, a float max_samples makes each tree
    # train on a bootstrap sample of that fraction of the training rows
    max_samples=0.5,
)

In [8]:
# Train model 1 on the training features and target.
rf.fit(X=X_train, y=y_train)

In [9]:
# In-sample (train) accuracy of model 1.
train_acc = rf.score(X=X_train, y=y_train)
train_acc

0.9097744360902256

In [10]:
# Out-of-sample (validate) accuracy of model 1.
val_acc = rf.score(X=X_val, y=y_val)
val_acc

0.746268656716418

In [11]:
# Model 1 class predictions for the training subset.
train_preds = rf.predict(X=X_train)

In [12]:
# Model 1 class predictions for the validate subset.
val_preds = rf.predict(X=X_val)

In [13]:
# Per-class precision / recall / f1 / support for model 1 on train.
print(classification_report(y_true=y_train, y_pred=train_preds))

              precision    recall  f1-score   support

           0       0.92      0.94      0.93        82
           1       0.90      0.86      0.88        51

    accuracy                           0.91       133
   macro avg       0.91      0.90      0.90       133
weighted avg       0.91      0.91      0.91       133



In [14]:
# Per-class precision / recall / f1 / support for model 1 on validate.
print(classification_report(y_true=y_val, y_pred=val_preds))

              precision    recall  f1-score   support

           0       0.80      0.80      0.80        83
           1       0.67      0.67      0.67        51

    accuracy                           0.75       134
   macro avg       0.73      0.73      0.73       134
weighted avg       0.75      0.75      0.75       134



In [15]:
# Unpack the 2x2 train confusion matrix into its four counts.
# For binary labels 0/1, ravel() yields them in the order TN, FP, FN, TP.
TN, FP, FN, TP = confusion_matrix(y_true=y_train, y_pred=train_preds).ravel()
(TN, FP, FN, TP)

(77, 5, 7, 44)

In [16]:
# Accuracy by hand: correct predictions (TN + TP) over all observations.
ALL = TN + FP + FN + TP
acc = (TN + TP) / ALL
acc

0.9097744360902256

In [17]:
# Precision (positive predictive value): of everything predicted positive,
# the share that is actually positive.
precision = TP / (TP + FP)

# True positive rate (sensitivity) is NOT precision: it divides by the actual
# positives (TP + FN), which makes it the same quantity as recall.
# Previously TPR was aliased to the precision formula, which was incorrect.
TPR = TP / (TP + FN)

precision

0.8979591836734694

In [18]:
# False positive rate: share of actual negatives (TN + FP) predicted positive.
FPR = FP / (TN + FP)
FPR

0.06097560975609756

In [19]:
# True negative rate (specificity): share of actual negatives predicted negative.
TNR = TN / (FP + TN)
TNR

0.9390243902439024

In [20]:
# False negative rate: share of actual positives (FN + TP) predicted negative.
FNR = FN / (FN + TP)
# display the value, like every other metric cell (it was computed but never shown)
FNR

In [21]:
# Recall (sensitivity): share of actual positives that the model found.
recall = TP / (FN + TP)
recall

0.8627450980392157

In [22]:
# F1 score: harmonic mean of precision and recall,
# written as 2 * (precision * recall) / (precision + recall).
pr_product = precision * recall
pr_sum = precision + recall
f1_score = 2 * pr_product / pr_sum
f1_score

0.8799999999999999

In [23]:
# Support: number of actual observations in each class.
# positives = TP + FN, negatives = TN + FP
support_pos = FN + TP
support_neg = TN + FP
(support_pos, support_neg)

(51, 82)

In [24]:
# Model 2: a more constrained forest (larger min_samples_leaf, shallower max_depth)
# to compare against model 1.
# The seed is defined once and passed to random_state, so the constant and the
# model cannot silently drift apart (previously `seed` was defined but unused).
seed = 42

rf2 = RandomForestClassifier(
    min_samples_leaf=5,
    max_depth=5,
    random_state=seed,
    # with the default bootstrap=True, a float max_samples makes each tree
    # train on a bootstrap sample of that fraction of the training rows
    max_samples=0.5,
)

In [25]:
# Train model 2 on the training features and target.
rf2.fit(X=X_train, y=y_train)

In [26]:
# In-sample (train) accuracy of model 2.
train_acc2 = rf2.score(X=X_train, y=y_train)
train_acc2

0.8195488721804511

In [27]:
# Out-of-sample (validate) accuracy of model 2.
val_acc2 = rf2.score(X=X_val, y=y_val)
val_acc2

0.7686567164179104

In [28]:
# Model 2 class predictions for the training subset.
train_preds2 = rf2.predict(X=X_train)

In [29]:
# Model 2 class predictions for the validate subset.
val_preds2 = rf2.predict(X=X_val)

In [30]:
# Per-class precision / recall / f1 / support for model 2 on train.
print(classification_report(y_true=y_train, y_pred=train_preds2))

              precision    recall  f1-score   support

           0       0.82      0.91      0.86        82
           1       0.83      0.67      0.74        51

    accuracy                           0.82       133
   macro avg       0.82      0.79      0.80       133
weighted avg       0.82      0.82      0.81       133



In [31]:
# Per-class precision / recall / f1 / support for model 2 on validate.
print(classification_report(y_true=y_val, y_pred=val_preds2))

              precision    recall  f1-score   support

           0       0.77      0.89      0.83        83
           1       0.76      0.57      0.65        51

    accuracy                           0.77       134
   macro avg       0.77      0.73      0.74       134
weighted avg       0.77      0.77      0.76       134

