# KNN Model Exercises

Create a new notebook, knn_model, and work with the titanic dataset to answer the following:

In [2]:
import pandas as pd

import acquire
from env import get_db_url
import prepare

import matplotlib.pyplot as plt
import seaborn as sns


from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

In [3]:
titanic_db = acquire.get_titanic_data()

In [4]:
titanic_prep = prepare.prep_titanic(titanic_db)

In [5]:
train, validate, test = prepare.split_data(titanic_prep)

In [6]:
print(train.shape, validate.shape, test.shape)

(534, 12) (178, 12) (179, 12)


## 1. Fit a K-Nearest Neighbors classifier to your training sample and transform (i.e. make predictions on the training sample)

In [35]:
x_train = train.drop(columns=['survived', 'passenger_id', 'sex', 'embark_town'])
y_train = train.survived

x_val = validate.drop(columns=['survived', 'passenger_id', 'sex', 'embark_town'])
y_val = validate.survived

x_test = test.drop(columns=['survived', 'passenger_id', 'sex', 'embark_town'])
y_test = test.survived

knn5 = KNeighborsClassifier(n_neighbors=5, weights='uniform')
knn5.fit(x_train,y_train)

In [36]:
y_pred = knn5.predict(x_train)

In [37]:
y_pred_proba = knn5.predict_proba(x_train)

## 2. Evaluate your results using the model score, confusion matrix, and classification report.

In [10]:
print('Accuracy of KNN classifier on training set: {:.2f}'
     .format(knn5.score(x_train, y_train)))

Accuracy of KNN classifier on training set: 0.80


In [11]:
print(confusion_matrix(y_train, y_pred))

[[280  44]
 [ 61 149]]


In [12]:
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.82      0.86      0.84       324
           1       0.77      0.71      0.74       210

    accuracy                           0.80       534
   macro avg       0.80      0.79      0.79       534
weighted avg       0.80      0.80      0.80       534



## 3. Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

In [13]:
print(accuracy_score(y_train, y_pred))

0.8033707865168539


In [14]:
print(precision_score(y_train, y_pred))

0.772020725388601


In [15]:
print(recall_score(y_train, y_pred))

0.7095238095238096


In [16]:
print(f1_score(y_train, y_pred))

0.739454094292804


In [17]:
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.82      0.86      0.84       324
           1       0.77      0.71      0.74       210

    accuracy                           0.80       534
   macro avg       0.80      0.79      0.79       534
weighted avg       0.80      0.80      0.80       534



In [None]:
# Instructor Answer:
def print_cm_metrics(cm):
    tn, fp, fn, tp = cm.ravel()

    accuracy = (tp + tn)/(tn + fp + fn + tp)

    true_positive_rate = tp/(tp + fn)
    false_positive_rate = fp/(fp + tn)
    true_negative_rate = tn/(tn + fp)
    false_negative_rate = fn/(fn + tp)

    precision = tp/(tp + fp)
    recall = tp/(tp + fn)
    f1_score = 2*(precision*recall)/(precision+recall)

    support_pos = tp + fn
    support_neg = fp + tn

    dict = {
        'metric' : ['accuracy'
                    ,'true_positive_rate'
                    ,'false_positive_rate'
                    ,'true_negative_rate'
                    ,'false_negative_rate'
                    ,'precision'
                    ,'recall'
                    ,'f1_score'
                    ,'support_pos'
                    ,'support_neg']
        ,'score' : [accuracy
                    ,true_positive_rate
                    ,false_positive_rate
                    ,true_negative_rate
                    ,false_negative_rate
                    ,precision
                    ,recall
                    ,f1_score
                    ,support_pos
                    ,support_neg]
    }

    return pd.DataFrame(dict)

## 4. Run through steps 1-3 setting k to 10

In [27]:
knn10 = KNeighborsClassifier(n_neighbors=10, weights='uniform')
knn10.fit(x_train,y_train)

In [28]:
print('Accuracy of KNN classifier on training set: {:.2f}'
     .format(knn10.score(x_train, y_train)))

Accuracy of KNN classifier on training set: 0.77


In [29]:
print(confusion_matrix(y_train, y_pred))

[[280  44]
 [ 61 149]]


In [30]:
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.82      0.86      0.84       324
           1       0.77      0.71      0.74       210

    accuracy                           0.80       534
   macro avg       0.80      0.79      0.79       534
weighted avg       0.80      0.80      0.80       534



## 5. Run through steps 1-3 setting k to 20

In [31]:
knn20 = KNeighborsClassifier(n_neighbors=20, weights='uniform')
knn20.fit(x_train,y_train)

In [32]:
print('Accuracy of KNN classifier on training set: {:.2f}'
     .format(knn20.score(x_train, y_train)))

Accuracy of KNN classifier on training set: 0.76


In [33]:
print(confusion_matrix(y_train, y_pred))

[[280  44]
 [ 61 149]]


In [34]:
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.82      0.86      0.84       324
           1       0.77      0.71      0.74       210

    accuracy                           0.80       534
   macro avg       0.80      0.79      0.79       534
weighted avg       0.80      0.80      0.80       534



## 6. What are the differences in the evaluation metrics? Which performs better on your in-sample data? Why?

The first model, k=5, runs the best for train data with an accuracy of 0.80.

## 7. Which model performs best on our out-of-sample data from validate?

In [38]:
y_val_pred1 = knn5.predict(x_val)
y_val_pred2 = knn10.predict(x_val)
y_val_pred3 = knn20.predict(x_val)

In [39]:
print(classification_report(y_val, y_val_pred1))

print(classification_report(y_val, y_val_pred2))

print(classification_report(y_val, y_val_pred3))

              precision    recall  f1-score   support

           0       0.77      0.79      0.78       107
           1       0.67      0.63      0.65        71

    accuracy                           0.73       178
   macro avg       0.72      0.71      0.72       178
weighted avg       0.73      0.73      0.73       178

              precision    recall  f1-score   support

           0       0.72      0.84      0.78       107
           1       0.68      0.51      0.58        71

    accuracy                           0.71       178
   macro avg       0.70      0.67      0.68       178
weighted avg       0.70      0.71      0.70       178

              precision    recall  f1-score   support

           0       0.71      0.88      0.78       107
           1       0.71      0.45      0.55        71

    accuracy                           0.71       178
   macro avg       0.71      0.66      0.67       178
weighted avg       0.71      0.71      0.69       178



Model 1, knn=5, performs the best on the validate model.