In [1]:
# imports

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import os
from env import get_connection


from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

import acquire
import prepare


####  Note
Ensamble: A technique that combines many models to make predictions

Bootstrapping: Random sampling of the train dataset and using different samples to train each tree

Aggregation: The combination of predictions from all trees to make oe final prediction

Bagging: Technique that combines bootstrapping and aggregation

#### Create a new notebook, random_forests, and work with titanic data to do the following:

1. Fit the Random Forest classifier to your training sample and transform (i.e. make predictions on the training sample) setting the random_state accordingly and setting min_samples_leaf = 1 and max_depth = 10.

2. Evaluate your results using the model score, confusion matrix, and classification report.

3. Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

4. Run through steps increasing your min_samples_leaf and decreasing your max_depth.

5. What are the differences in the evaluation metrics? Which performs better on your in-sample data? Why?

After making a few models, which one has the best performance (or closest metrics) on both train and validate?

In [2]:
# acquire data using function from acquire 
titanic_original = acquire.get_titanic_data()

# peek into data
titanic_original.head(3)

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0
1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0
2,2,1,3,female,26.0,0,0,7.925,S,Third,,Southampton,1


In [3]:
# clean data using function from prepare 
titanic_clean = prepare.prep_titanic(titanic_original)

In [4]:
# peek into clean data
titanic_clean.head(3)

Unnamed: 0,survived,pclass,sibsp,parch,fare,alone,sex_male,embark_town_Queenstown,embark_town_Southampton
0,0,3,1,0,7.25,0,1,0,1
1,1,1,1,0,71.2833,0,0,0,0
2,1,3,0,0,7.925,1,0,0,1


In [5]:
# split data using funciton form prepare module 
train, val, test = prepare.train_validate_test_split(titanic_clean,'survived')

# get shape of train, validate and test data
train.shape, val.shape, test.shape

((498, 9), (214, 9), (179, 9))

In [6]:
# peek into train data
train.head()

Unnamed: 0,survived,pclass,sibsp,parch,fare,alone,sex_male,embark_town_Queenstown,embark_town_Southampton
779,1,1,0,1,211.3375,0,0,0,1
159,0,3,8,2,69.55,0,1,0,1
738,0,3,0,0,7.8958,1,1,0,1
486,1,1,1,0,90.0,0,0,0,1
125,1,3,1,0,11.2417,0,1,0,0


In [7]:
# create labels
X_train = train.drop(columns = ['survived'])
y_train = train['survived']

X_val =val.drop(columns = ['survived'])
y_val = val['survived']

X_test = test.drop(columns = ['survived'])
y_test = test['survived']

In [8]:
# create model1
seed =42

rf = RandomForestClassifier(min_samples_leaf =1,max_depth=10, random_state=42, max_samples=0.5 )

In [9]:
# fit model
rf.fit(X_train, y_train)

In [10]:
# train predictions 
train_preds = rf.predict(X_train)

In [11]:
# val predictions 
val_preds = rf.predict(X_val)

#### 2. Evaluate your results using the model score, confusion matrix, and classification report.

In [12]:
# train accuracy( model accuracy)
train_acc = rf.score(X_train, y_train)
train_acc

0.8975903614457831

In [13]:
# val accuracy
val_acc = rf.score(X_val, y_val)
val_acc

0.7990654205607477

In [14]:
# confusion matrix (y_train_pred, y_train)
cm = confusion_matrix(y_train, train_preds)
cm

array([[295,  12],
       [ 39, 152]])

In [15]:
# classification report from train predictions
print(classification_report(y_train, train_preds))

              precision    recall  f1-score   support

           0       0.88      0.96      0.92       307
           1       0.93      0.80      0.86       191

    accuracy                           0.90       498
   macro avg       0.91      0.88      0.89       498
weighted avg       0.90      0.90      0.90       498



In [16]:
# classification report from val predictions
print(classification_report(y_val, val_preds))

              precision    recall  f1-score   support

           0       0.81      0.89      0.84       132
           1       0.78      0.66      0.72        82

    accuracy                           0.80       214
   macro avg       0.79      0.77      0.78       214
weighted avg       0.80      0.80      0.80       214



#### 3. Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

In [17]:
# calculate truenegative, falsepositive, falsenegative,truepositive 
TN, FP, FN, TP = cm.ravel()

In [18]:
# accuracy
ALL = TP + FP + FN + TN
acc = (TP + TN) / ALL
acc

0.8975903614457831

In [19]:
# true positive rate or precision
precision = TPR = TP / (TP + FP)
precision

0.926829268292683

In [20]:
# false positive rate
FPR = FP / (FP + TN)
FPR 

0.03908794788273615

In [21]:
# true neagative rate
TNR = TN / (TN + FP)
TNR 

0.9609120521172638

In [22]:
# false negative rate
FNR = FN / (FN + TP)

In [23]:
# recall
recall = TP / (TP + FN)
recall

0.7958115183246073

In [24]:
# f1-score
f1_score = 2 * (precision*recall) / (precision+recall)
f1_score

0.8563380281690142

In [25]:
# support
support_pos = TP + FN
support_neg = FP + TN
support_pos, support_neg

(191, 307)

#### 4. Run through steps increasing your min_samples_leaf and decreasing your max_depth.

In [26]:
# create model2
seed =42

rf2 = RandomForestClassifier(min_samples_leaf =5,max_depth=5, random_state=42, max_samples=0.5 )

In [27]:
#fit the random forest algorithm to the training data
rf2.fit(X_train, y_train)

In [28]:
# train predictions from random forest algorithm
train_preds2 = rf2.predict(X_train)

In [29]:
# val predictions from random forest algorithm
val_preds2 = rf2.predict(X_val)

In [30]:
# train accuracy (model accuracy)
train_acc2 = rf2.score(X_train, y_train)
train_acc2

0.8253012048192772

In [31]:
# val accuracy
val_acc2 = rf2.score(X_val, y_val)
val_acc2

0.8037383177570093

In [32]:
# confusion matrix (y_train_pred, y_train)
cm = confusion_matrix(y_train, train_preds)
cm

array([[295,  12],
       [ 39, 152]])

In [33]:
# classification report on train
print(classification_report(y_train, train_preds2))

              precision    recall  f1-score   support

           0       0.82      0.92      0.87       307
           1       0.84      0.67      0.75       191

    accuracy                           0.83       498
   macro avg       0.83      0.80      0.81       498
weighted avg       0.83      0.83      0.82       498



In [34]:
# classification report from val predictions
print(classification_report(y_val, val_preds2))

              precision    recall  f1-score   support

           0       0.80      0.90      0.85       132
           1       0.80      0.65      0.72        82

    accuracy                           0.80       214
   macro avg       0.80      0.77      0.78       214
weighted avg       0.80      0.80      0.80       214



#### 5. What are the differences in the evaluation metrics? Which performs better on your in-sample data? Why?

In [35]:
# accuracy (model accuracy) on train
train_acc, train_acc2

(0.8975903614457831, 0.8253012048192772)

Model 1 perfroms better because it has higher accuracy score than model 2

#### 6. After making a few models, which one has the best performance (or closest metrics) on both train and validate?

In [36]:
# accuracy (model accuracy) on validate
val_acc, val_acc2

(0.7990654205607477, 0.8037383177570093)

Model 1 perfroms better because it has higher accuracy score than model 1