In [1]:
# imports

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import os
from env import get_connection

from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score


import acquire
import prepare

#### Create a new notebook, knn_model, and work with the titanic dataset to answer the following:

1. Fit a K-Nearest Neighbors classifier to your training sample and transform (i.e. make predictions on the training sample)

2. Evaluate your results using the model score, confusion matrix, and classification report.

3. Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

4. Run through steps 1-3 setting k to 10

5. Run through steps 1-3 setting k to 20

6. What are the differences in the evaluation metrics? Which performs better on your in-sample data? Why?

7. Which model performs best on our out-of-sample data from validate?

In [2]:
# load the raw titanic passenger data through the project's acquire module
titanic_original = acquire.get_titanic_data()

# preview the first three raw rows
titanic_original.head(n=3)

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0
1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0
2,2,1,3,female,26.0,0,0,7.925,S,Third,,Southampton,1


In [3]:
# clean the raw frame with the prepare module's titanic prep function;
# the result is kept under a new name so the raw frame stays available
titanic_clean = prepare.prep_titanic(titanic_original)

In [4]:
# preview the first three rows of the cleaned frame
titanic_clean.head(n=3)

Unnamed: 0,survived,pclass,sibsp,parch,fare,alone,sex_male,embark_town_Queenstown,embark_town_Southampton
0,0,3,1,0,7.25,0,1,0,1
1,1,1,1,0,71.2833,0,0,0,0
2,1,3,0,0,7.925,1,0,0,1


In [5]:
# partition the cleaned data into train / validate / test samples using the
# splitter from the prepare module ('survived' is the target column)
train, val, test = prepare.train_validate_test_split(titanic_clean, 'survived')

# (rows, columns) of each of the three samples
tuple(sample.shape for sample in (train, val, test))

((498, 9), (214, 9), (179, 9))

In [6]:
# preview the first five rows of the train sample
train.head(n=5)

Unnamed: 0,survived,pclass,sibsp,parch,fare,alone,sex_male,embark_town_Queenstown,embark_town_Southampton
779,1,1,0,1,211.3375,0,0,0,1
159,0,3,8,2,69.55,0,1,0,1
738,0,3,0,0,7.8958,1,1,0,1
486,1,1,1,0,90.0,0,0,0,1
125,1,3,1,0,11.2417,0,1,0,0


In [7]:
# separate the feature columns (X) from the target column (y) in every sample
X_train, y_train = train.drop(columns='survived'), train['survived']
X_val, y_val = val.drop(columns='survived'), val['survived']
X_test, y_test = test.drop(columns='survived'), test['survived']

In [8]:
# create the K-Nearest Neighbors classifier; each prediction is a vote among
# the 5 nearest training samples
knn = KNeighborsClassifier(n_neighbors=5)

In [9]:
# fit the classifier on the training features and target only
knn.fit(X=X_train, y=y_train)

In [10]:
# in-sample accuracy: share of train rows the model classifies correctly
knn.score(X=X_train, y=y_train)

0.8313253012048193

In [11]:
# out-of-sample accuracy: share of validate rows the model classifies correctly
knn.score(X=X_val, y=y_val)

0.7523364485981309

In [12]:
# predicted survival class for every row of the training sample
train_preds = knn.predict(X=X_train)

In [13]:
# predicted survival class for every row of the validate sample
val_preds = knn.predict(X=X_val)

#### 2. Evaluate your results using the model score, confusion matrix, and classification report.

In [14]:
# model score on the training sample (mean accuracy), stored for later reference
train_acc = knn.score(X=X_train, y=y_train)
train_acc

0.8313253012048193

In [15]:
# model score on the validate sample (mean accuracy), stored for later reference
val_acc = knn.score(X=X_val, y=y_val)
val_acc

0.7523364485981309

In [16]:
# confusion matrix for the training sample:
# rows are the actual classes (y_train), columns are the predicted classes (train_preds)
cm = confusion_matrix(y_true=y_train, y_pred=train_preds)
cm

array([[271,  36],
       [ 48, 143]])

In [17]:
# text classification report (precision / recall / f1-score / support) for the train predictions
print(classification_report(y_true=y_train, y_pred=train_preds))

              precision    recall  f1-score   support

           0       0.85      0.88      0.87       307
           1       0.80      0.75      0.77       191

    accuracy                           0.83       498
   macro avg       0.82      0.82      0.82       498
weighted avg       0.83      0.83      0.83       498



In [18]:
# same train-sample report, returned as a dict and rendered as a DataFrame
report = classification_report(y_true=y_train, y_pred=train_preds, output_dict=True)
pd.DataFrame(data=report)

Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.84953,0.798883,0.831325,0.824206,0.830105
recall,0.882736,0.748691,0.831325,0.815714,0.831325
f1-score,0.865815,0.772973,0.831325,0.819394,0.830207
support,307.0,191.0,0.831325,498.0,498.0


#### 3. Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

In [19]:
# unpack the 2x2 confusion matrix row by row:
# first row = actual 0 (TN, FP), second row = actual 1 (FN, TP)
(TN, FP), (FN, TP) = cm

In [20]:
# accuracy: correct predictions (TP + TN) over all predictions
ALL = (TP + TN) + (FP + FN)
acc = (TP + TN) / ALL
acc

0.8313253012048193

In [21]:
# precision (positive predictive value): of the passengers predicted to
# survive, the share that actually survived
precision = TP / (TP + FP)

# true positive rate (sensitivity): of the passengers that actually survived,
# the share predicted to survive. This is the same quantity as recall and is
# NOT the same as precision, so it uses FN (not FP) in the denominator.
TPR = TP / (TP + FN)

precision

0.7988826815642458

In [22]:
# false positive rate: share of actual negatives (TN + FP) predicted as positive
FPR = FP / (TN + FP)
FPR

0.11726384364820847

In [23]:
# true negative rate (specificity): share of actual negatives predicted as negative
TNR = TN / (FP + TN)
TNR

0.8827361563517915

In [24]:
# false negative rate (miss rate): share of actual positives (FN + TP)
# that the model predicted as negative
FNR = FN / (FN + TP)

# display the value, as the other metric cells do
FNR

In [25]:
# recall (true positive rate): share of actual positives predicted as positive
recall = TP / (FN + TP)
recall

0.7486910994764397

In [26]:
# f1-score: harmonic mean of precision and recall
f1_score = (2 * (precision * recall)) / (precision + recall)
f1_score

0.772972972972973

In [27]:
# support: number of actual samples in each class
support_pos = FN + TP   # actual positives
support_neg = TN + FP   # actual negatives
(support_pos, support_neg)

(191, 307)

In [28]:
# NOTE(review): disabled cell — nothing here runs. It is the validate-sample
# counterpart of the train classification report; re-enable or delete.
# classification report as the dataframe from  valpredictions
# report = classification_report(y_val, val_preds, output_dict=True)
# pd.DataFrame(report)

In [29]:
# NOTE(review): disabled cell — nothing here runs. It repeats the confusion-matrix
# unpacking already done from `cm`; re-enable or delete.
# TN, FP, FN, TP = confusion_matrix(y_train, train_preds).ravel()
# TN, FP, FN, TP

In [30]:
# NOTE(review): disabled draft of a metrics-printing helper — nothing here runs.
# Before re-enabling: (1) as written it predicts with whatever global `knn` is
# currently fitted and only uses `neighbor` as a printed label, so it never fits
# a model with that k; (2) defining `report` would rebind the name that holds
# the classification-report dict.
# def report(neighbor):
    
#     train_preds = knn.predict(X_train)
    
#     val_preds = knn.predict(X_val)
    
#     TN, FP, FN, TP = confusion_matrix(y_train, train_preds).ravel()
    
#     # accuracy
#     ALL = TP + FP + FN + TN
#     accuracy = (TP + TN) / ALL

#     # true positive rate, also called recall
#     TPR = recall = (TP/ (TP + FN))


#     # false positive rate
#     FPR = FP / (FP + TN)

#     # true negative rate
#     TNR = TN / (TN + FP)

#     # false negative rate
#     FNR = FN / (FN + TP)

#     # precision
#     precision = TP / (TP + FP)

#     # f1-score
#     f1_score = 2 * (precision*recall) / (precision+recall)

#     # support
#     support_positive = TP + FN
#     support_negative = FP + TN
#     print('Neighbor: ', neighbor)
#     print()
#     print('Accuracy: ', round((accuracy * 100),2))
#     print('True Positive rate: ', round((TPR * 100),2))
#     print('False Positive rate: ', round((FPR * 100),2))
#     print('True Negative rate: ', round((TNR * 100),2))
#     print('False Negative rate: ', round((FNR* 100),2))
#     print('Precision: ', round((precision * 100),2))
#     print('Recall: ', round((recall * 100),2))
#     print('f1-score: ', round((f1_score * 100),2))
#     print('Support Positive: ', support_positive )
#     print('Support Negative: ', support_negative )
#     print()
#     print()

#### 4. Run through steps 1-3 setting k to 10

In [31]:
# Refit KNN with k = 10 and k = 20 and collect, for each k, the train/validate
# accuracy plus the classification reports for both samples.
# Note: `train_acc` and `val_acc` are rebound here from the single k=5 scores
# to lists holding one score per k.
num_neighbors = []
train_acc = []
val_acc = []
train_report = []
val_report = []
for i in (10, 20):
    num_neighbors.append(i)

    # fit a fresh classifier for this neighbor count
    knn = KNeighborsClassifier(n_neighbors=i)
    knn.fit(X_train, y_train)

    # accuracy on the in-sample (train) and out-of-sample (validate) data
    train_acc.append(knn.score(X_train, y_train))
    val_acc.append(knn.score(X_val, y_val))

    # predictions feeding the per-class reports
    train_preds = knn.predict(X_train)
    val_preds = knn.predict(X_val)

    # precision / recall / f1-score / support per class, as DataFrames
    report_train = classification_report(y_train, train_preds, output_dict=True)
    report_train_df = pd.DataFrame(report_train)
    report_val = classification_report(y_val, val_preds, output_dict=True)
    report_val_df = pd.DataFrame(report_val)

    # each report list alternates a text label with the matching report frame
    train_report.extend([f'In-sample, Train, Neighbor: {i}', report_train_df])
    val_report.extend([f'Out-sample,Val, Neighbor: {i}', report_val_df])

In [32]:
# show the collected in-sample (train) reports: a label followed by its report frame, per k
train_report

['In-sample, Train, Neighbor: 10',
                     0           1  accuracy   macro avg  weighted avg
 precision    0.804878    0.747059  0.785141    0.775968      0.782702
 recall       0.859935    0.664921  0.785141    0.762428      0.785141
 f1-score     0.831496    0.703601  0.785141    0.767549      0.782444
 support    307.000000  191.000000  0.785141  498.000000    498.000000,
 'In-sample, Train, Neighbor: 20',
                     0           1  accuracy   macro avg  weighted avg
 precision    0.737127    0.728682   0.73494    0.732905      0.733888
 recall       0.885993    0.492147   0.73494    0.689070      0.734940
 f1-score     0.804734    0.587500   0.73494    0.696117      0.721417
 support    307.000000  191.000000   0.73494  498.000000    498.000000]

In [33]:
# show the collected out-of-sample (validate) reports: a label followed by its report frame, per k
val_report

['Out-sample,Val, Neighbor: 10',
                     0          1  accuracy   macro avg  weighted avg
 precision    0.776224   0.704225  0.752336    0.740225      0.748636
 recall       0.840909   0.609756  0.752336    0.725333      0.752336
 f1-score     0.807273   0.653595  0.752336    0.730434      0.748387
 support    132.000000  82.000000  0.752336  214.000000    214.000000,
 'Out-sample,Val, Neighbor: 20',
                     0          1  accuracy   macro avg  weighted avg
 precision    0.717949   0.655172  0.700935    0.686561      0.693894
 recall       0.848485   0.463415  0.700935    0.655950      0.700935
 f1-score     0.777778   0.542857  0.700935    0.660317      0.687761
 support    132.000000  82.000000  0.700935  214.000000    214.000000]

In [34]:
# one row per k: neighbor count next to its train and validate accuracy
knn_metrics = pd.DataFrame(
    dict(neighbors=num_neighbors, train_acc=train_acc, val_acc=val_acc)
)

knn_metrics

Unnamed: 0,neighbors,train_acc,val_acc
0,10,0.785141,0.752336
1,20,0.73494,0.700935


### 6. What are the differences in the evaluation metrics? Which performs better on your in-sample data? Why?

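Train accuracy falls as k grows: 0.831 for k = 5, 0.785 for k = 10, and 0.735 for k = 20. Between k = 10 and k = 20, the KNN model with 10 neighbors performs better on the train data because it has the higher accuracy score; the k = 5 model scores highest in-sample overall, since a smaller k fits the training sample more closely.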

#### 7. Which model performs best on our out-of-sample data from validate?

The KNN models with 5 and 10 neighbors perform best on the validate (out-of-sample) data, tying at an accuracy of 0.752, while the model with 20 neighbors scores lower at 0.701.