In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier as knc
from sklearn.metrics import classification_report, accuracy_score , recall_score , precision_score, f1_score

from adam_prepare import titanic_pipeline

# Seed constant, presumably intended for reproducibility.
# NOTE(review): no cell visible in this notebook passes `seed` to anything —
# confirm whether titanic_pipeline() should receive it or whether it can go.
seed = 55

### Create a new notebook, knn_model, and work with the titanic dataset to answer the following:

#### 1. Fit a K-Nearest Neighbors classifier to your training sample and transform (i.e. make predictions on the training sample)

In [2]:
# titanic_pipeline() comes from the project-local adam_prepare module; its return
# value is unpacked into the train / val / test splits.
# NOTE(review): `test` is not used by any cell visible in this notebook.
train, val, test = titanic_pipeline()
# Preview the first rows of the training split.
train.head()

Unnamed: 0,survived,age,sibsp,parch,fare,alone,sex_male,class_First,class_Second,class_Third,embark_town_Cherbourg,embark_town_Queenstown,embark_town_Southampton
748,0,19.0,1,0,53.1,0,True,True,False,False,False,False,True
45,0,29.0,0,0,8.05,1,True,False,False,True,False,False,True
28,1,29.0,0,0,7.8792,1,False,False,False,True,False,True,False
633,0,29.0,0,0,0.0,1,True,True,False,False,False,False,True
403,0,28.0,1,0,15.85,0,True,False,False,True,False,False,True


In [3]:
# Separate the features (x_*) from the label (y_*) for the train and validate splits.
target = 'survived'
x_train, y_train = train.drop(columns=[target]), train[target]
x_val, y_val = val.drop(columns=[target]), val[target]

In [4]:
# Baseline model: KNeighborsClassifier (imported as `knc`) built with no
# arguments, i.e. the library defaults, and fit on the features as loaded
# (the scaling step only happens in a later cell).
knn = knc()
knn.fit(x_train, y_train)

#### 2. Evaluate your results using the model score, confusion matrix, and classification report.

In [5]:
# In-sample predictions from the baseline model. `pred_train` is reassigned by
# later cells for each subsequent model, so it always reflects the most
# recently executed predict cell.
pred_train = knn.predict(x_train)

In [6]:
# Model score of the baseline model, displayed as a (train, validate) tuple.
(
    knn.score(x_train, y_train),
    knn.score(x_val, y_val),
)

(0.8073836276083467, 0.7014925373134329)

In [7]:
# Per-class precision / recall / f1-score / support for the in-sample predictions.
print(classification_report(y_train, pred_train))

              precision    recall  f1-score   support

           0       0.82      0.88      0.85       384
           1       0.78      0.69      0.73       239

    accuracy                           0.81       623
   macro avg       0.80      0.78      0.79       623
weighted avg       0.81      0.81      0.80       623



In [8]:
# Confusion matrix as a cross-tabulation: rows are the actual `survived`
# values, columns are the predicted values.
pd.crosstab(y_train, pred_train)

col_0,0,1
survived,Unnamed: 1_level_1,Unnamed: 2_level_1
0,339,45
1,75,164


#### 3. Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

In [9]:
def theometrics(TP, TN, FP, FN):
    """Summarise a binary confusion matrix as a table of evaluation metrics.

    Parameters
    ----------
    TP, TN, FP, FN : int
        Counts of true positives, true negatives, false positives and false
        negatives. Mind the argument order: TN is the second argument.

    Returns
    -------
    pandas.DataFrame
        Two columns, 'Metric' and 'Value', with one row each for Accuracy,
        Recall, True Positive Rate, False Positive Rate, True Negative Rate,
        False Negative Rate, Precision, F1-Score and Support (TP + FN, the
        number of actual positives). Any ratio whose denominator is zero is
        reported as NaN instead of raising ZeroDivisionError.
    """
    def _ratio(numerator, denominator):
        # An undefined ratio (e.g. precision when nothing was predicted
        # positive) becomes NaN so the rest of the table is still produced.
        return numerator / denominator if denominator else float('nan')

    accuracy = _ratio(TP + TN, TP + TN + FP + FN)
    recall = _ratio(TP, TP + FN)
    # The true positive rate is the same quantity as recall.
    true_positive_rate = recall
    false_positive_rate = _ratio(FP, FP + TN)
    true_negative_rate = _ratio(TN, TN + FP)
    false_negative_rate = _ratio(FN, FN + TP)
    precision = _ratio(TP, TP + FP)
    # Local is named `f1` so it does not shadow the f1_score imported from
    # sklearn.metrics at the top of the notebook.
    f1 = _ratio(2 * (precision * recall), precision + recall)
    support = TP + FN

    metrics_table = pd.DataFrame({
        'Metric': ['Accuracy', 'Recall', 'True Positive Rate', 'False Positive Rate', 'True Negative Rate', 'False Negative Rate', 'Precision', 'F1-Score', 'Support'],
        'Value': [accuracy, recall, true_positive_rate, false_positive_rate, true_negative_rate, false_negative_rate, precision, f1, support],
    })
    return metrics_table

True Positive == 164
False Positive == 45
False Negative == 75
True Negative == 339

In [10]:
# Count the confusion-matrix cells from the current predictions instead of
# hand-copying them from the crosstab output, so the metrics cannot go stale
# if the data or the model changes. "Positive" means survived == 1.
actual_pos = (y_train == 1).to_numpy()
predicted_pos = (pred_train == 1)
theometrics(
    TP=int((actual_pos & predicted_pos).sum()),
    TN=int((~actual_pos & ~predicted_pos).sum()),
    FP=int((~actual_pos & predicted_pos).sum()),
    FN=int((actual_pos & ~predicted_pos).sum()),
)

Unnamed: 0,Metric,Value
0,Accuracy,0.807384
1,Recall,0.686192
2,True Positive Rate,0.686192
3,False Positive Rate,0.117188
4,True Negative Rate,0.882812
5,False Negative Rate,0.313808
6,Precision,0.784689
7,F1-Score,0.732143
8,Support,239.0


### Below I am running the defaults again but this time I am using the scaled data.
### All data below will be using the scaled data 
### To further improve results we can drop some of the less impactful columns, as determined by our random forest.

In [11]:
# Min-max scale the age and fare columns. The scaler is fit on the training
# split only and then applied unchanged to validate. Note that this overwrites
# those columns of x_train / x_val in place, so every later cell sees the
# scaled values.
scaled_cols = ['age', 'fare']
mms = MinMaxScaler()
mms.fit(x_train[scaled_cols])

x_train[scaled_cols] = mms.transform(x_train[scaled_cols])
x_val[scaled_cols] = mms.transform(x_val[scaled_cols])

In [12]:
# Same default-argument classifier as the baseline, now fit on the training
# features after the age / fare scaling step.
knn_scaled = knc()
knn_scaled.fit(x_train, y_train)

In [13]:
# In-sample predictions from the scaled-feature model; this replaces the
# `pred_train` produced by the baseline model.
pred_train = knn_scaled.predict(x_train)

In [14]:
# Model score of the scaled-feature model, displayed as a (train, validate) tuple.
(
    knn_scaled.score(x_train, y_train),
    knn_scaled.score(x_val, y_val),
)

(0.8619582664526485, 0.8208955223880597)

In [15]:
# Per-class precision / recall / f1-score / support for the scaled-feature
# model's in-sample predictions.
print(classification_report(y_train, pred_train))

              precision    recall  f1-score   support

           0       0.87      0.92      0.89       384
           1       0.85      0.77      0.81       239

    accuracy                           0.86       623
   macro avg       0.86      0.85      0.85       623
weighted avg       0.86      0.86      0.86       623



In [16]:
# Confusion matrix for the scaled-feature model: rows are the actual
# `survived` values, columns are the predicted values.
pd.crosstab(y_train, pred_train)

col_0,0,1
survived,Unnamed: 1_level_1,Unnamed: 2_level_1
0,352,32
1,54,185


In [17]:
# Count the confusion-matrix cells from the current predictions instead of
# hand-copying them from the crosstab output, so the metrics cannot go stale
# if the data or the model changes. "Positive" means survived == 1.
actual_pos = (y_train == 1).to_numpy()
predicted_pos = (pred_train == 1)
theometrics(
    TP=int((actual_pos & predicted_pos).sum()),
    TN=int((~actual_pos & ~predicted_pos).sum()),
    FP=int((~actual_pos & predicted_pos).sum()),
    FN=int((actual_pos & ~predicted_pos).sum()),
)

Unnamed: 0,Metric,Value
0,Accuracy,0.861958
1,Recall,0.774059
2,True Positive Rate,0.774059
3,False Positive Rate,0.083333
4,True Negative Rate,0.916667
5,False Negative Rate,0.225941
6,Precision,0.852535
7,F1-Score,0.811404
8,Support,239.0


#### 4. Run through steps 1-3 setting k to 10

In [18]:
# KNN with n_neighbors=10, fit on the (already scaled) training features.
knn10 = knc(n_neighbors=10)
knn10.fit(x_train, y_train)

In [19]:
# In-sample predictions from the k=10 model; this replaces the previous
# model's `pred_train`.
pred_train = knn10.predict(x_train)

In [20]:
# Model score of the k=10 model, displayed as a (train, validate) tuple.
(
    knn10.score(x_train, y_train),
    knn10.score(x_val, y_val),
)

(0.8186195826645265, 0.8134328358208955)

In [21]:
# Per-class precision / recall / f1-score / support for the k=10 model's
# in-sample predictions.
print(classification_report(y_train, pred_train))

              precision    recall  f1-score   support

           0       0.82      0.91      0.86       384
           1       0.82      0.67      0.74       239

    accuracy                           0.82       623
   macro avg       0.82      0.79      0.80       623
weighted avg       0.82      0.82      0.81       623



In [22]:
# Confusion matrix for the k=10 model: rows are the actual `survived` values,
# columns are the predicted values.
pd.crosstab(y_train, pred_train)

col_0,0,1
survived,Unnamed: 1_level_1,Unnamed: 2_level_1
0,349,35
1,78,161


In [23]:
# Count the confusion-matrix cells from the current predictions instead of
# hand-copying them from the crosstab output, so the metrics cannot go stale
# if the data or the model changes. "Positive" means survived == 1.
actual_pos = (y_train == 1).to_numpy()
predicted_pos = (pred_train == 1)
theometrics(
    TP=int((actual_pos & predicted_pos).sum()),
    TN=int((~actual_pos & ~predicted_pos).sum()),
    FP=int((~actual_pos & predicted_pos).sum()),
    FN=int((actual_pos & ~predicted_pos).sum()),
)

Unnamed: 0,Metric,Value
0,Accuracy,0.81862
1,Recall,0.67364
2,True Positive Rate,0.67364
3,False Positive Rate,0.091146
4,True Negative Rate,0.908854
5,False Negative Rate,0.32636
6,Precision,0.821429
7,F1-Score,0.74023
8,Support,239.0


#### 5. Run through steps 1-3 setting k to 20

In [24]:
# KNN with n_neighbors=20, fit on the (already scaled) training features.
knn20 = knc(n_neighbors=20)
knn20.fit(x_train, y_train)

In [25]:
# In-sample predictions from the k=20 model; this replaces the previous
# model's `pred_train`.
pred_train = knn20.predict(x_train)

In [26]:
# Model score of the k=20 model, displayed as a (train, validate) tuple.
(
    knn20.score(x_train, y_train),
    knn20.score(x_val, y_val),
)

(0.8154093097913323, 0.7985074626865671)

In [27]:
# Per-class precision / recall / f1-score / support for the k=20 model's
# in-sample predictions.
print(classification_report(y_train, pred_train))

              precision    recall  f1-score   support

           0       0.81      0.91      0.86       384
           1       0.83      0.66      0.73       239

    accuracy                           0.82       623
   macro avg       0.82      0.79      0.80       623
weighted avg       0.82      0.82      0.81       623



In [28]:
# Confusion matrix for the k=20 model: rows are the actual `survived` values,
# columns are the predicted values.
pd.crosstab(y_train, pred_train)

col_0,0,1
survived,Unnamed: 1_level_1,Unnamed: 2_level_1
0,351,33
1,82,157


In [29]:
# Count the confusion-matrix cells from the current predictions instead of
# hand-copying them from the crosstab output, so the metrics cannot go stale
# if the data or the model changes. "Positive" means survived == 1.
actual_pos = (y_train == 1).to_numpy()
predicted_pos = (pred_train == 1)
theometrics(
    TP=int((actual_pos & predicted_pos).sum()),
    TN=int((~actual_pos & ~predicted_pos).sum()),
    FP=int((~actual_pos & predicted_pos).sum()),
    FN=int((actual_pos & ~predicted_pos).sum()),
)

Unnamed: 0,Metric,Value
0,Accuracy,0.815409
1,Recall,0.656904
2,True Positive Rate,0.656904
3,False Positive Rate,0.085938
4,True Negative Rate,0.914062
5,False Negative Rate,0.343096
6,Precision,0.826316
7,F1-Score,0.731935
8,Support,239.0


#### 6. What are the differences in the evaluation metrics? Which performs better on your in-sample data? Why?

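The KNN model with k = 5 (the default) on the scaled features performed best on both in-sample and out-of-sample data; in-sample accuracy was 0.862, versus 0.819 for k = 10 and 0.815 for k = 20. I think this is because, as more neighbors are asked to vote, more distant and less similar passengers influence each prediction, which starts to introduce more noise.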

#### 7. Which model performs best on our out-of-sample data from validate?

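The KNN model with k = 5 (the default) on the scaled features also performed best on the out-of-sample validate data, with an accuracy of 0.821 versus 0.813 for k = 10 and 0.799 for k = 20. As with the in-sample data, I think this is because, as more neighbors are asked to vote, more distant and less similar passengers influence each prediction, which starts to introduce more noise.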