In [149]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd 
import math

from pydataset import data

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression

import matplotlib.pyplot as plt
import seaborn as sns

In [60]:
import acquire

# Decision Tree Model Exercises

### Decicion Tree with the Titanic Data

In [61]:
# Aquire: 

In [64]:
titanic_df = acquire.get_titanic_data()
titanic_df.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0
1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0
2,2,1,3,female,26.0,0,0,7.925,S,Third,,Southampton,1
3,3,1,1,female,35.0,1,0,53.1,S,First,C,Southampton,0
4,4,0,3,male,35.0,0,0,8.05,S,Third,,Southampton,1


In [65]:
titanic_df.shape

(891, 13)

In [66]:
# Preoare: 

In [67]:
def prep_titanic(titanic_df):
    '''
    This function will clean the titanic data...
    '''
    titanic_df = titanic_df.drop_duplicates()
    cols_to_drop = ['deck', 'embarked', 'class', 'age']
    titanic_df = titanic_df.drop(columns=cols_to_drop)
    titanic_df.embark_town = titanic_df.embark_town.fillna(value=titanic_df.embark_town.mode())
    dummy_df = pd.get_dummies(titanic_df[['sex', 'embark_town']], dummy_na=False, drop_first=[True, True])
    titanic_df = pd.concat([titanic_df, dummy_df], axis=1)
    return titanic_df

In [68]:
titanic_df = prep_titanic(titanic_df)
titanic_df.head()

Unnamed: 0,passenger_id,survived,pclass,sex,sibsp,parch,fare,embark_town,alone,sex_male,embark_town_Queenstown,embark_town_Southampton
0,0,0,3,male,1,0,7.25,Southampton,0,1,0,1
1,1,1,1,female,1,0,71.2833,Cherbourg,0,0,0,0
2,2,1,3,female,0,0,7.925,Southampton,1,0,0,1
3,3,1,1,female,1,0,53.1,Southampton,0,0,0,1
4,4,0,3,male,0,0,8.05,Southampton,1,1,0,1


In [69]:
titanic_df = titanic_df.drop(columns=['sex', 'embark_town'])

In [70]:
titanic_df.shape

(891, 10)

In [71]:
titanic_df.head()

Unnamed: 0,passenger_id,survived,pclass,sibsp,parch,fare,alone,sex_male,embark_town_Queenstown,embark_town_Southampton
0,0,0,3,1,0,7.25,0,1,0,1
1,1,1,1,1,0,71.2833,0,0,0,0
2,2,1,3,0,0,7.925,1,0,0,1
3,3,1,1,1,0,53.1,0,0,0,1
4,4,0,3,0,0,8.05,1,1,0,1


In [72]:
# Split:

In [73]:
def split_data(titanic_df):
    '''
    Takes in a dataframe and return train, validate, test subset dataframes
    '''
    train, test = train_test_split(titanic_df, test_size = .2, random_state=123, stratify=titanic_df.survived)
    train, validate = train_test_split(train, test_size=.3, random_state=123, stratify=train.survived)
    return train, validate, test

In [74]:
train, validate, test = split_data(titanic_df)

In [75]:
train.head()

Unnamed: 0,passenger_id,survived,pclass,sibsp,parch,fare,alone,sex_male,embark_town_Queenstown,embark_town_Southampton
583,583,0,1,0,0,40.125,1,1,0,0
165,165,1,3,0,2,20.525,0,1,0,1
50,50,0,3,4,1,39.6875,0,1,0,1
259,259,1,2,0,1,26.0,0,0,0,1
306,306,1,1,0,0,110.8833,1,0,0,0


In [76]:
X_train = train.drop(columns=['survived'])
y_train = train.survived

x_validate = validate.drop(columns=['survived'])
y_validate = validate.survived

X_test = test.drop(columns=['survived'])
y_test = test.survived

In [77]:
train.shape

(498, 10)

In [78]:
validate.shape

(214, 10)

In [79]:
test.shape

(179, 10)

In [80]:
X_train.shape

(498, 9)

1. What is your baseline prediction? What is your baseline accuracy? remember: your baseline prediction for a classification problem is predicting the most prevelant class in the training dataset (the mode). When you make those predictions, what is your accuracy? This is your baseline accuracy.

In [81]:
# baseline prediction = most common value

In [82]:
baseline = y_train.mode()

In [83]:
baseline

0    0
dtype: int64

In [84]:
match_bsl_prediction = y_train == 0

In [85]:
baseline_accuracy = match_bsl_prediction.mean()

In [86]:
baseline_accuracy

0.6164658634538153

In [87]:
# basline accuracy = 62%

2. Fit the decision tree classifier to your training sample and transform (i.e. make predictions on the training sample)


In [88]:
tree1_clf = DecisionTreeClassifier(max_depth=3, random_state=123)

In [89]:
tree1_clf = tree1_clf.fit(X_train, y_train)

In [90]:
#visualize the decision tree
import graphviz
from graphviz import Graph

dot_data = export_graphviz(tree1_clf, feature_names= X_train.columns, rounded=True, filled=True, out_file=None)
graph = graphviz.Source(dot_data) 

graph.render('titanic_decision_tree', view=True)

'titanic_decision_tree.pdf'

In [93]:
y_pred = tree1_clf.predict(X_train)
y_pred[0:5]

array([0, 0, 0, 1, 1])

In [94]:
y_pred_proba = tree1_clf.predict_proba(X_train)
y_pred_proba[0:5]

array([[0.62222222, 0.37777778],
       [0.62222222, 0.37777778],
       [0.89285714, 0.10714286],
       [0.14814815, 0.85185185],
       [0.        , 1.        ]])

3. Evaluate your in-sample results using the model score, confusion matrix, and classification report.


In [95]:
# accuracy:

In [96]:
print('Accuracy of Decision Tree 1 classifier on training set: {:.2f}'
      .format(tree1_clf.score(X_train, y_train)))

Accuracy of Decision Tree 1 classifier on training set: 0.82


In [97]:
# confusion matrix:

In [98]:
confusion_matrix(y_train, y_pred)

array([[274,  33],
       [ 56, 135]])

In [99]:
y_train.value_counts()

0    307
1    191
Name: survived, dtype: int64

In [100]:
labels = sorted(y_train.unique())
print('Actual on the left, predicted on the top')
pd.DataFrame(confusion_matrix(y_train, y_pred), index=labels, columns=labels)

Actual on the left, predicted on the top


Unnamed: 0,0,1
0,274,33
1,56,135


In [101]:
# classification report: 

In [102]:

print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.83      0.89      0.86       307
           1       0.80      0.71      0.75       191

    accuracy                           0.82       498
   macro avg       0.82      0.80      0.81       498
weighted avg       0.82      0.82      0.82       498



In [103]:
# make classification report prettier in a df
class_report = classification_report(y_train, y_pred, output_dict=(True))
print("Tree1 depth")
pd.DataFrame(class_report)

Tree1 depth


Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.830303,0.803571,0.821285,0.816937,0.820051
recall,0.892508,0.706806,0.821285,0.799657,0.821285
f1-score,0.860283,0.752089,0.821285,0.806186,0.818787
support,307.0,191.0,0.821285,498.0,498.0


4. Compute: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.


- **Precision**: the higher this number is, the more you were able to pinpoint all positives correctly. If this is a low score, you predicted a lot of positives where there were none.
- **Recall**: if this score is high, you didn’t miss a lot of positives. But as it gets lower, you are not predicting the positives that are actually there.
- **f1-score**: The balanced harmonic mean of Recall and Precision, giving both metrics equal weight. The higher the F-Measure is, the better.
- **Support**: number of occurrences of each class in where y is true.

In [104]:
class_report = classification_report(y_train, y_pred, output_dict=(True))
print("Tree1 depth")
pd.DataFrame(class_report)

Tree1 depth


Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.830303,0.803571,0.821285,0.816937,0.820051
recall,0.892508,0.706806,0.821285,0.799657,0.821285
f1-score,0.860283,0.752089,0.821285,0.806186,0.818787
support,307.0,191.0,0.821285,498.0,498.0


5. Run through steps 2-4 using a different max_depth value.

In [105]:
for i in range(2, 11):
    # Make the model
    tree = DecisionTreeClassifier(max_depth=i, random_state=123)

    # Fit the model (on train and only train)
    tree = tree.fit(X_train, y_train)

    # Use the model
    # We'll evaluate the model's performance on train, first
    y_pred = tree.predict(X_train)

    # Produce the classification report on the actual y values and this model's predicted y values
    report = classification_report(y_train, y_pred, output_dict=True)
    print(f"Tree with max depth of {i}")
    print(pd.DataFrame(report))
    print()

Tree with max depth of 2
                    0           1  accuracy   macro avg  weighted avg
precision    0.820433    0.760000  0.799197    0.790217      0.797255
recall       0.863192    0.696335  0.799197    0.779764      0.799197
f1-score     0.841270    0.726776  0.799197    0.784023      0.797358
support    307.000000  191.000000  0.799197  498.000000    498.000000

Tree with max depth of 3
                    0           1  accuracy   macro avg  weighted avg
precision    0.830303    0.803571  0.821285    0.816937      0.820051
recall       0.892508    0.706806  0.821285    0.799657      0.821285
f1-score     0.860283    0.752089  0.821285    0.806186      0.818787
support    307.000000  191.000000  0.821285  498.000000    498.000000

Tree with max depth of 4
                    0           1  accuracy   macro avg  weighted avg
precision    0.815642    0.892857  0.837349    0.854250      0.845257
recall       0.951140    0.654450  0.837349    0.802795      0.837349
f1-score     

In [106]:
# to figure out which model to use, create a df with just themax_depth, train_accuracy, validate_accuracy, difference

In [107]:
metrics = []  

In [108]:
for i in range(1, 11):
    tree = DecisionTreeClassifier(max_depth=i, random_state=123)
    
    #run the model on train and only TRAIN data 
    tree = tree.fit(X_train, y_train)
    
    #use/test the model to evaluate models performance on train data first...
    in_sample_accuracy = tree.score(X_train, y_train)
    out_sample_accuracy = tree.score(x_validate, y_validate)
    
    output = {'max_depth': i, 'train_accuracy': in_sample_accuracy, 'validate_accuracy': out_sample_accuracy}
    
    metrics.append(output)
    
tree_df = pd.DataFrame(metrics)
tree_df["difference"] = tree_df.train_accuracy - tree_df.validate_accuracy

tree_df

Unnamed: 0,max_depth,train_accuracy,validate_accuracy,difference
0,1,0.799197,0.761682,0.037515
1,2,0.799197,0.761682,0.037515
2,3,0.821285,0.775701,0.045584
3,4,0.837349,0.761682,0.075667
4,5,0.849398,0.761682,0.087715
5,6,0.871486,0.738318,0.133168
6,7,0.893574,0.752336,0.141238
7,8,0.913655,0.733645,0.18001
8,9,0.935743,0.724299,0.211444
9,10,0.947791,0.728972,0.218819


In [109]:
# to avoid over-fitting, set a threshhold by looking at the difference

In [110]:
threshold = 0.10  #threshold set for amount of overfit that is tolerated

models = []
metrics = []

for i in range(1, 11):
    tree = DecisionTreeClassifier(max_depth=i, random_state=123)
    #^^^ creates the model
    
    tree = tree.fit(X_train, y_train)   #fit model to train data and only TRAIN data
    
    in_sample_accuracy = tree.score(X_train, y_train)
    out_sample_accuracy = tree.score(x_validate, y_validate)
    #^^^evaluates the models performance on train data first
    
    difference = in_sample_accuracy - out_sample_accuracy
    #^^calculates the difference in accuracy
    
    if difference > threshold:
        break
    #^^adds conditions to check the accuracy vs the threshold
    
    output = {
        'max_depth': i,
        'train_accuracy': in_sample_accuracy,
        'validate_accuracy': out_sample_accuracy,
        'difference': difference}
    #^^^formats the output for each models performance o train and validate
    
    metrics.append(output)
    
    models.append(output)
    
model_df = pd.DataFrame(metrics)
model_df["difference"] = tree_df.train_accuracy - tree_df.validate_accuracy


model_df.head()

Unnamed: 0,max_depth,train_accuracy,validate_accuracy,difference
0,1,0.799197,0.761682,0.037515
1,2,0.799197,0.761682,0.037515
2,3,0.821285,0.775701,0.045584
3,4,0.837349,0.761682,0.075667
4,5,0.849398,0.761682,0.087715


6. Which model performs better on your in-sample data?



Answer: the model with a max depth of 5 (the higher the max_depth, the higher the accuracy.

7. Which model performs best on your out-of-sample data, the validate set?

Answer: the model with a max depth of 3

### Decicion Tree with the Telco Data

1. Work through these same exercises using the Telco dataset.

2. Experiment with this model on other datasets with a higher number of output classes.

### Decicion Tree with the Other Data

# Random Forest Model Exercises

### Random Forest with the Titanic Data

1. Fit the Random Forest classifier to your training sample and transform (i.e. make predictions on the training sample) setting the random_state accordingly and setting min_samples_leaf = 1 and max_depth = 10.

In [111]:
# Do the same Aquire and Prepare steps as we did for the decision tree if in another notebook

In [112]:
# min_sample_leaf=1 / max_depth=10 - Accuracy of random forest classifier on training set: 0.97

In [113]:
# create model
rf1_clf = RandomForestClassifier(max_depth=10, min_samples_leaf=1, random_state=123)  

In [114]:
rf1_clf

RandomForestClassifier(max_depth=10, random_state=123)

In [115]:
rf1_clf.fit(X_train, y_train)

RandomForestClassifier(max_depth=10, random_state=123)

In [116]:
print(rf1_clf.feature_importances_)

[0.23385239 0.09052441 0.05149459 0.03453432 0.22859942 0.02186412
 0.30248482 0.0108785  0.02576743]


In [117]:
y_pred = rf1_clf.predict(X_train)
y_pred

array([0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0,
       1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0,
       1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0,
       0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0,
       1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1,
       0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0,
       1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0,
       1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,

In [118]:
y_pred_proba = rf1_clf.predict_proba(X_train)

In [119]:
print('Accuracy of random forest classifier on training set: {:.2f}'
     .format(rf1_clf.score(X_train, y_train)))

Accuracy of random forest classifier on training set: 0.97


In [120]:
#for i in range(5, 16):
#    # Make the model
#    forest = RandomForestClassifier(max_depth=2, min_samples_leaf=1, random_state=123)

#    # Fit the model (on train and only train)
#    forest = forest.fit(X_train, y_train)

#    # Use the model
#    # We'll evaluate the model's performance on train, first
#    y_pred = forest.predict(X_train)

#    # Produce the classification report on the actual y values and this model's predicted y values
#    report = classification_report(y_train, y_pred, output_dict=True)
#    print(f"Tree with max depth of {i}")
#    print(pd.DataFrame(report))
#    print()

2. Evaluate your results using the model score, confusion matrix, and classification report.

In [121]:
print('Accuracy of random forest classifier on training set: {:.2f}'
     .format(rf1_clf.score(X_train, y_train)))

Accuracy of random forest classifier on training set: 0.97


In [122]:
print(confusion_matrix(y_train, y_pred))

[[307   0]
 [ 15 176]]


In [123]:
pd.DataFrame(confusion_matrix(y_train, y_pred))

Unnamed: 0,0,1
0,307,0
1,15,176


In [124]:
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.95      1.00      0.98       307
           1       1.00      0.92      0.96       191

    accuracy                           0.97       498
   macro avg       0.98      0.96      0.97       498
weighted avg       0.97      0.97      0.97       498



In [125]:
class_report = classification_report(y_train, y_pred, output_dict=(True))
print("Tree1 depth")
pd.DataFrame(class_report)

Tree1 depth


Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.953416,1.0,0.96988,0.976708,0.971283
recall,1.0,0.921466,0.96988,0.960733,0.96988
f1-score,0.976153,0.959128,0.96988,0.96764,0.969623
support,307.0,191.0,0.96988,498.0,498.0


3. Print and clearly label the following:  Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

In [126]:
# If not-survived is our positive case
TP = 285
FP = 68
FN = 22
TN = 123
ALL = TP + FP + FN + TN

accuracy = (TP + TN)/ALL
print(f"Accuracy: {accuracy}")

true_positive_rate = TP/(TP+FN)
print(f"True Positive Rate: {true_positive_rate}")

false_positive_rate = FP/(FP+TN)
print(f"False Positive Rate: {false_positive_rate}")

true_negative_rate = TN/(TN+FP)
print(f"True Negative Rate: {true_negative_rate}")

false_negative_rate = FN/(FN+TP)
print(f"False Negative Rate: {false_negative_rate}")

precision = TP/(TP+FP)
print(f"Precision: {precision}")

recall = TP/(TP+FN)
print(f"Recall: {recall}")

f1_score = 2*(precision*recall)/(precision+recall)
print(f"F1 Score: {f1_score}")

support_pos = TP + FN
print(f"Support (0): {support_pos}")

support_neg = FP + TN
print(f"Support (1): {support_neg}")

Accuracy: 0.8192771084337349
True Positive Rate: 0.9283387622149837
False Positive Rate: 0.35602094240837695
True Negative Rate: 0.643979057591623
False Negative Rate: 0.07166123778501629
Precision: 0.8073654390934845
Recall: 0.9283387622149837
F1 Score: 0.8636363636363636
Support (0): 307
Support (1): 191


4. Run through steps increasing your min_samples_leaf and decreasing your max_depth. 

In [127]:
max_depth = 16

for i in range(1, max_depth):
    # Create Model
    depth = max_depth - i
    n = i
    forest = RandomForestClassifier(max_depth=depth, min_samples_leaf=n, random_state=123)

    # Fit the model (on train and only train)
    forest = forest.fit(X_train, y_train)

    # Use the model
    # We'll evaluate the model's performance on train, first
    y_pred = forest.predict(X_train)

    # Produce the classification report on the actual y values and this model's predicted y values
    report = classification_report(y_train, y_pred, output_dict=True)
    print(f"Tree with max depth of {i}")
    print(pd.DataFrame(report))
    print()

Tree with max depth of 1
               0      1  accuracy  macro avg  weighted avg
precision    1.0    1.0       1.0        1.0           1.0
recall       1.0    1.0       1.0        1.0           1.0
f1-score     1.0    1.0       1.0        1.0           1.0
support    307.0  191.0       1.0      498.0         498.0

Tree with max depth of 2
                    0           1  accuracy   macro avg  weighted avg
precision    0.912121    0.964286  0.929719    0.938203      0.932128
recall       0.980456    0.848168  0.929719    0.914312      0.929719
f1-score     0.945055    0.902507  0.929719    0.923781      0.928736
support    307.000000  191.000000  0.929719  498.000000    498.000000

Tree with max depth of 3
                    0           1  accuracy   macro avg  weighted avg
precision    0.898507    0.963190  0.919679    0.930849      0.923315
recall       0.980456    0.821990  0.919679    0.901223      0.919679
f1-score     0.937695    0.887006  0.919679    0.912350      0.91825

In [128]:
metrics = []
max_depth = 16

for i in range(1, max_depth):
    # Create model
    depth = max_depth - i
    n = i
    forest = RandomForestClassifier(max_depth=depth, min_samples_leaf=n, random_state=123)

    # Fit the model (on train and only train)
    forest = forest.fit(X_train, y_train)

    # Use the model
    # We'll evaluate the model's performance on train, first
    in_sample_accuracy = forest.score(X_train, y_train)
    
    out_of_sample_accuracy = forest.score(x_validate, y_validate)

    output = {
        "min_samples_per_leaf": n,
        "max_depth": depth,
        "train_accuracy": in_sample_accuracy,
        "validate_accuracy": out_of_sample_accuracy
    }
    
    metrics.append(output)
    
df = pd.DataFrame(metrics)
df["difference"] = df.train_accuracy - df.validate_accuracy
df

Unnamed: 0,min_samples_per_leaf,max_depth,train_accuracy,validate_accuracy,difference
0,1,15,1.0,0.771028,0.228972
1,2,14,0.929719,0.775701,0.154018
2,3,13,0.919679,0.78972,0.129959
3,4,12,0.87751,0.799065,0.078445
4,5,11,0.873494,0.78972,0.083774
5,6,10,0.863454,0.785047,0.078407
6,7,9,0.853414,0.785047,0.068367
7,8,8,0.851406,0.785047,0.066359
8,9,7,0.841365,0.775701,0.065665
9,10,6,0.833333,0.780374,0.05296


5. What are the differences in the evaluation metrics?  Which performs better on your in-sample data?  Why?

In [129]:
metrics = []
max_depth = 16

for i in range(1, max_depth):
    # Create model
    depth = max_depth - i
    n = i
    forest = RandomForestClassifier(max_depth=depth, min_samples_leaf=n, random_state=123)

    # Fit the model (on train and only train)
    forest = forest.fit(X_train, y_train)

    # Use the model
    # We'll evaluate the model's performance on train, first
    in_sample_accuracy = forest.score(X_train, y_train)
    
    out_of_sample_accuracy = forest.score(x_validate, y_validate)

    output = {
        "min_samples_per_leaf": n,
        "max_depth": depth,
        "train_accuracy": in_sample_accuracy,
        "validate_accuracy": out_of_sample_accuracy
    }
    
    metrics.append(output)
    
df = pd.DataFrame(metrics)
df["difference"] = df.train_accuracy - df.validate_accuracy
df

Unnamed: 0,min_samples_per_leaf,max_depth,train_accuracy,validate_accuracy,difference
0,1,15,1.0,0.771028,0.228972
1,2,14,0.929719,0.775701,0.154018
2,3,13,0.919679,0.78972,0.129959
3,4,12,0.87751,0.799065,0.078445
4,5,11,0.873494,0.78972,0.083774
5,6,10,0.863454,0.785047,0.078407
6,7,9,0.853414,0.785047,0.068367
7,8,8,0.851406,0.785047,0.066359
8,9,7,0.841365,0.775701,0.065665
9,10,6,0.833333,0.780374,0.05296


After making a few models, which one has the best performance (or closest metrics) on both train and validate?

In [130]:
# min_samples_per_leaf	max_depth	train_accuracy	validate_accuracy	difference
# 4	                    12	        0.877510	    0.799065	        0.078445

# KNN Model Exercises

1. Fit a K-Nearest Neighbors classifier to your training sample and transform (i.e. make predictions on the training sample)



In [131]:
# Do the same Aquire and Prepare steps as we did for the decision tree if in another notebook

In [132]:
knn = KNeighborsClassifier(n_neighbors=5, weights='uniform')

In [133]:
knn.fit(X_train, y_train)

KNeighborsClassifier()

In [134]:
y_pred = knn.predict(X_train)

In [135]:
#y_pred

In [136]:
y_pred_proba = knn.predict_proba(X_train)

2. Evaluate your results using the model score, confusion matrix, and classification report.

In [137]:
# Accuracy
print('Accuracy of KNN classifier on training set: {:.2f}'
     .format(knn.score(X_train, y_train)))

Accuracy of KNN classifier on training set: 0.74


In [138]:
# confusion matrix
print(confusion_matrix(y_train, y_pred))

[[270  37]
 [ 92  99]]


In [139]:
pd.DataFrame(confusion_matrix(y_train, y_pred))

Unnamed: 0,0,1
0,270,37
1,92,99


In [140]:
# classification report
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.75      0.88      0.81       307
           1       0.73      0.52      0.61       191

    accuracy                           0.74       498
   macro avg       0.74      0.70      0.71       498
weighted avg       0.74      0.74      0.73       498



In [141]:
pd.DataFrame(classification_report(y_train, y_pred, output_dict=True))

Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.745856,0.727941,0.740964,0.736899,0.738985
recall,0.879479,0.518325,0.740964,0.698902,0.740964
f1-score,0.807175,0.605505,0.740964,0.70634,0.729827
support,307.0,191.0,0.740964,498.0,498.0


3. Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

In [142]:
# If not-survived (0) is our positive case
TP = 270
FP = 37
FN = 92
TN = 99
ALL = TP + FP + FN + TN

accuracy = (TP + TN)/ALL
print(f"Accuracy: {accuracy}")

true_positive_rate = TP/(TP+FN)
print(f"True Positive Rate: {true_positive_rate}")

false_positive_rate = FP/(FP+TN)
print(f"False Positive Rate: {false_positive_rate}")

true_negative_rate = TN/(TN+FP)
print(f"True Negative Rate: {true_negative_rate}")

false_negative_rate = FN/(FN+TP)
print(f"False Negative Rate: {false_negative_rate}")

precision = TP/(TP+FP)
print(f"Precision: {precision}")

recall = TP/(TP+FN)
print(f"Recall: {recall}")

f1_score = 2*(precision*recall)/(precision+recall)
print(f"F1 Score: {f1_score}")

support_pos = TP + FN
print(f"Support (0): {support_pos}")

support_neg = FP + TN
print(f"Support (1): {support_neg}")

Accuracy: 0.7409638554216867
True Positive Rate: 0.7458563535911602
False Positive Rate: 0.27205882352941174
True Negative Rate: 0.7279411764705882
False Negative Rate: 0.2541436464088398
Precision: 0.8794788273615635
Recall: 0.7458563535911602
F1 Score: 0.8071748878923767
Support (0): 362
Support (1): 136


4. Run through steps 2-4 setting k to 10

In [143]:
for k in range(1, 21):
            
    # define the thing
    knn = KNeighborsClassifier(n_neighbors=k)
    
    # fit the thing (remmeber only fit on training data)
    knn = knn.fit(X_train, y_train)
    
    # predict on train
    y_pred = knn.predict(X_train)

    # Produce the classification report on the actual y values and this model's predicted y values
    report = classification_report(y_train, y_pred, output_dict=True)
    print(f"KNN with k value of {k}")
    print(pd.DataFrame(report))
    print()


KNN with k value of 1
               0      1  accuracy  macro avg  weighted avg
precision    1.0    1.0       1.0        1.0           1.0
recall       1.0    1.0       1.0        1.0           1.0
f1-score     1.0    1.0       1.0        1.0           1.0
support    307.0  191.0       1.0      498.0         498.0

KNN with k value of 2
                    0           1  accuracy   macro avg  weighted avg
precision    0.741546    1.000000  0.785141    0.870773      0.840672
recall       1.000000    0.439791  0.785141    0.719895      0.785141
f1-score     0.851595    0.610909  0.785141    0.731252      0.759284
support    307.000000  191.000000  0.785141  498.000000    498.000000

KNN with k value of 3
                    0           1  accuracy   macro avg  weighted avg
precision    0.819820    0.793939  0.811245    0.806880      0.809894
recall       0.889251    0.685864  0.811245    0.787557      0.811245
f1-score     0.853125    0.735955  0.811245    0.794540      0.808186
support

5. Run through setps 2-4 setting k to 20



In [144]:
# See loop in question above

6. What are the differences in the evaluation metrics? Which performs better on your in-sample data? Why?



Answer: k set to 10 preforms better than k set on 20 on the in-sample data because it is on the brink of overfitting

In [145]:
metrics = []

# loop through different values of k
for k in range(1, 21):
            
    # define the thing
    knn = KNeighborsClassifier(n_neighbors=k)
    
    # fit the thing (remmeber only fit on training data)
    knn.fit(X_train, y_train)
    
    # use the thing (calculate accuracy)
    train_accuracy = knn.score(X_train, y_train)
    validate_accuracy = knn.score(x_validate, y_validate)
    
    output = {
        "k": k,
        "train_accuracy": train_accuracy,
        "validate_accuracy": validate_accuracy
    }
    
    metrics.append(output)


df = pd.DataFrame(metrics)
df["difference"] = df.train_accuracy - df.validate_accuracy
df

Unnamed: 0,k,train_accuracy,validate_accuracy,difference
0,1,1.0,0.560748,0.439252
1,2,0.785141,0.607477,0.177664
2,3,0.811245,0.616822,0.194423
3,4,0.73494,0.593458,0.141482
4,5,0.740964,0.570093,0.17087
5,6,0.708835,0.593458,0.115377
6,7,0.718876,0.593458,0.125418
7,8,0.706827,0.626168,0.080659
8,9,0.722892,0.607477,0.115415
9,10,0.706827,0.598131,0.108696


7. Which model performs best on our out-of-sample data from validate?

Answer: k set to 20 preforms better than k set to 10 on the out-of-sample data because it is not overfitting to the train data

# Logistic Regression Model Exercises

1. Create a model that includes age in addition to fare and pclass. Does this model perform better than your baseline?

In [176]:
train.head()

Unnamed: 0,passenger_id,survived,pclass,sibsp,parch,fare,alone,sex_male,embark_town_Queenstown,embark_town_Southampton
583,583,0,1,0,0,40.125,1,1,0,0
165,165,1,3,0,2,20.525,0,1,0,1
50,50,0,3,4,1,39.6875,0,1,0,1
259,259,1,2,0,1,26.0,0,0,0,1
306,306,1,1,0,0,110.8833,1,0,0,0


In [193]:
# create model
logr1 = LogisticRegression(C=.5,random_state=123)

In [194]:
# fit model
logr1.fit(X_train, y_train)

LogisticRegression(C=0.5, random_state=123)

In [195]:
# accuracy
logr1.score(X_train, y_train)

0.7991967871485943

In [196]:
y_pred = logr1.predict(X_train)

In [197]:
y_pred

array([0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0,
       1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0,
       1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0,
       0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0,
       1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1,
       1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1,
       0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,

In [198]:
logr1.predict_proba(X_train)

array([[0.6455674 , 0.3544326 ],
       [0.86017107, 0.13982893],
       [0.96081307, 0.03918693],
       [0.16639037, 0.83360963],
       [0.13185881, 0.86814119],
       [0.84235119, 0.15764881],
       [0.76620198, 0.23379802],
       [0.71549661, 0.28450339],
       [0.87971155, 0.12028845],
       [0.87118627, 0.12881373],
       [0.32707984, 0.67292016],
       [0.52510663, 0.47489337],
       [0.16721068, 0.83278932],
       [0.57168496, 0.42831504],
       [0.77747448, 0.22252552],
       [0.89144132, 0.10855868],
       [0.90410016, 0.09589984],
       [0.38952557, 0.61047443],
       [0.7315491 , 0.2684509 ],
       [0.35804186, 0.64195814],
       [0.36044809, 0.63955191],
       [0.87347347, 0.12652653],
       [0.08252676, 0.91747324],
       [0.10213242, 0.89786758],
       [0.54589643, 0.45410357],
       [0.86833174, 0.13166826],
       [0.33859117, 0.66140883],
       [0.16046013, 0.83953987],
       [0.59388042, 0.40611958],
       [0.78032477, 0.21967523],
       [0.

In [199]:
logr1.classes_

array([0, 1])

In [200]:
y_pred_proba = logr1.predict_proba(X_train)
y_pred_proba = pd.DataFrame(y_pred_proba, columns = ['non-survivers', 'survivers'])
y_pred_proba.head()

Unnamed: 0,non-survivers,survivers
0,0.645567,0.354433
1,0.860171,0.139829
2,0.960813,0.039187
3,0.16639,0.83361
4,0.131859,0.868141


In [201]:
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.81      0.88      0.84       307
           1       0.78      0.66      0.72       191

    accuracy                           0.80       498
   macro avg       0.79      0.77      0.78       498
weighted avg       0.80      0.80      0.80       498



2. Include sex in your model as well. Note that you'll need to encode or create a dummy variable of this feature before including it in a model.



In [202]:
# model already included sex

3. Try out other combinations of features and models.

In [216]:
logr2 = LogisticRegression(C=1, penalty='l2', solver='lbfgs', multi_class='multinomial', random_state=123)
logr2.fit(X_train, y_train)



LogisticRegression(C=1, multi_class='multinomial', random_state=123)

In [217]:
logr2.score(X_train, y_train)

0.8052208835341366

In [222]:
logr3 = LogisticRegression(C=.1, penalty='l2', solver='lbfgs', multi_class='multinomial', random_state=123)
logr3.fit(X_train, y_train)

LogisticRegression(C=0.1, multi_class='multinomial', random_state=123)

In [223]:
logr3.score(X_train, y_train)

0.8152610441767069

In [224]:
y_pred = logr3.predict(X_train)

In [225]:
y_pred

array([0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0,
       1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0,
       0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0,
       1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0,
       0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0,
       1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1,
       1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1,
       0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [226]:
logr3.predict_proba(X_train)

array([[0.55492291, 0.44507709],
       [0.85509722, 0.14490278],
       [0.95407424, 0.04592576],
       [0.2181536 , 0.7818464 ],
       [0.13968164, 0.86031836],
       [0.7333112 , 0.2666888 ],
       [0.76131032, 0.23868968],
       [0.73662115, 0.26337885],
       [0.8060284 , 0.1939716 ],
       [0.88852172, 0.11147828],
       [0.39618983, 0.60381017],
       [0.57282375, 0.42717625],
       [0.21737501, 0.78262499],
       [0.61491927, 0.38508073],
       [0.76448398, 0.23551602],
       [0.89227686, 0.10772314],
       [0.90379826, 0.09620174],
       [0.33603276, 0.66396724],
       [0.75129234, 0.24870766],
       [0.40222449, 0.59777551],
       [0.42548905, 0.57451095],
       [0.87604768, 0.12395232],
       [0.14377442, 0.85622558],
       [0.14469227, 0.85530773],
       [0.59136837, 0.40863163],
       [0.87146699, 0.12853301],
       [0.26904534, 0.73095466],
       [0.15005561, 0.84994439],
       [0.63352492, 0.36647508],
       [0.79368403, 0.20631597],
       [0.

In [229]:
logr3.classes_

array([0, 1])

In [230]:
y_pred_proba = logr3.predict_proba(X_train)
y_pred_proba = pd.DataFrame(y_pred_proba, columns = ['non-survivers', 'survivers'])
y_pred_proba.head()

Unnamed: 0,non-survivers,survivers
0,0.554923,0.445077
1,0.855097,0.144903
2,0.954074,0.045926
3,0.218154,0.781846
4,0.139682,0.860318


In [231]:
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.82      0.89      0.86       307
           1       0.80      0.69      0.74       191

    accuracy                           0.82       498
   macro avg       0.81      0.79      0.80       498
weighted avg       0.81      0.82      0.81       498



In [236]:
logr3.score(x_validate, y_validate)

0.7616822429906542

In [237]:
train_accuracy = logr3.score(X_train, y_train)
validate_accuracy = logr3.score(x_validate, y_validate)

In [241]:
difference = train_accuracy - validate_accuracy
difference

0.05357880118605263

In [235]:
metrics = []

# loop through different values of c
for c in range(1, 10):
            
    # define the thing
    logit = LogisticRegression(C=c, random_state=123)
    
    # fit the thing (remmeber only fit on training data)
    logit.fit(X_train, y_train)
    
    # use the thing (calculate accuracy)
    train_accuracy = logit.score(X_train, y_train)
    validate_accuracy = logit.score(x_validate, y_validate)
    
    output = {
        "c": c,
        "train_accuracy": train_accuracy,
        "validate_accuracy": validate_accuracy
    }
    
    metrics.append(output)


df = pd.DataFrame(metrics)
df["difference"] = df.train_accuracy - df.validate_accuracy
df

Unnamed: 0,c,train_accuracy,validate_accuracy,difference
0,1,0.807229,0.761682,0.045547
1,2,0.801205,0.761682,0.039523
2,3,0.805221,0.761682,0.043539
3,4,0.801205,0.752336,0.048868
4,5,0.803213,0.761682,0.041531
5,6,0.799197,0.761682,0.037515
6,7,0.807229,0.771028,0.036201
7,8,0.805221,0.761682,0.043539
8,9,0.809237,0.771028,0.038209


4. Use you best 3 models to predict and evaluate on your validate sample.

5. Choose you best model from the validation performation, and evaluate it on the test dataset. How do the performance metrics compare to validate? to train?



Bonus1 How do different strategies for handling the missing values in the age column affect model performance?

Bonus2: How do different strategies for encoding sex affect model performance?



Bonus3: scikit-learn's LogisticRegression classifier is actually applying a regularization penalty to the coefficients by default. This penalty causes the magnitude of the coefficients in the resulting model to be smaller than they otherwise would be. This value can be modified with the C hyper parameter. Small values of C correspond to a larger penalty, and large values of C correspond to a smaller penalty.
Try out the following values for C and note how the coefficients and the model's performance on both the dataset it was trained on and on the validate split are affected.
C
=
.01
,
.1
,
1
,
10
,
100
,
1000


Bonus Bonus: how does scaling the data interact with your choice of C?