In [33]:
# importing required libraries
import pandas as pd # read the dataset
import seaborn as sns # visualize
import numpy as np # create multi-dimensional array
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix,classification_report
from sklearn.metrics import ConfusionMatrixDisplay

**Loading the data**

In [20]:
train = pd.read_csv('/titanic-train.csv')
train.shape

(891, 11)

In [21]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,male,35.0,0,0,373450,8.05,,S


In [22]:
test = pd.read_csv('/titanic-test.csv')
test.shape

(418, 11)

In [23]:
test

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,?,3,male,34.5,0,0,330911,7.8292,,Q
1,893,?,3,female,47.0,1,0,363272,7.0000,,S
2,894,?,2,male,62.0,0,0,240276,9.6875,,Q
3,895,?,3,male,27.0,0,0,315154,8.6625,,S
4,896,?,3,female,22.0,1,1,3101298,12.2875,,S
...,...,...,...,...,...,...,...,...,...,...,...
413,1305,?,3,male,,0,0,A.5. 3236,8.0500,,S
414,1306,?,1,female,39.0,0,0,PC 17758,108.9000,C105,C
415,1307,?,3,male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S
416,1308,?,3,male,,0,0,359309,8.0500,,S


In [24]:
# converting cabin to binary attribute using dummy variable
train['Cabin'] = train["Cabin"].apply(lambda x: 0 if type(x) == float else 1)
test['Cabin'] = test["Cabin"].apply(lambda x: 0 if type(x) == float else 1)

# converting sex cateogorical variable to dummy variable
train['Sex'] = train["Sex"].replace({'male':0,'female':1})
test['Sex'] = test["Sex"].replace({'male':0,'female':1})

In [25]:
full_data = [train,test]
full_data

[     PassengerId  Survived  Pclass  Sex   Age  SibSp  Parch            Ticket  \
 0              1         0       3    0  22.0      1      0         A/5 21171   
 1              2         1       1    1  38.0      1      0          PC 17599   
 2              3         1       3    1  26.0      0      0  STON/O2. 3101282   
 3              4         1       1    1  35.0      1      0            113803   
 4              5         0       3    0  35.0      0      0            373450   
 ..           ...       ...     ...  ...   ...    ...    ...               ...   
 886          887         0       2    0  27.0      0      0            211536   
 887          888         1       1    1  19.0      0      0            112053   
 888          889         0       3    1   NaN      1      2        W./C. 6607   
 889          890         1       1    0  26.0      0      0            111369   
 890          891         0       3    0  32.0      0      0            370376   
 
         Fare 

In [26]:
# Remove all NULLS in the Age column
for dataset in full_data:
    age_avg = dataset['Age'].mean()
    age_std = dataset['Age'].std()
    age_null_count = dataset['Age'].isnull().sum()
    age_null_random_list = np.random.randint(age_avg - age_std, age_avg + age_std, size=age_null_count)
    # Next line has been improved to avoid warning
    dataset.loc[np.isnan(dataset['Age']), 'Age'] = age_null_random_list
    dataset['Age'] = dataset['Age'].astype(int)

In [27]:
X_train = train[['Sex','Pclass','Age','SibSp','Parch','Cabin']]
Y_train = train['Survived']
X_test = test[['Sex','Pclass','Age','SibSp','Parch','Cabin']]

**Exercise 01: K Nearest Neighbors using Titanic data**

Create a kNN model Report accuracy when number of neighbors = {1, 3, 5, 7}. Pick the best K value and then change the weighted distance metric. Report if distance weighting helps or not.

In [28]:
def knn_model(n_neighbors, X_train, Y_train):
    neigh = KNeighborsClassifier(n_neighbors=n_neighbors)
    neigh.fit(X_train, Y_train)
    neigh_pred = neigh.predict(X_train)
    return neigh_pred

In [34]:
def model_evaluation(test_results, gold_results):
    tab = pd.crosstab(test_results,gold_results,rownames=['Actual'],colnames=['Predicted'])
    print(tab)

    matrix = confusion_matrix(test_results,gold_results)
    print('Confusion matrix : \n',matrix)

    # classification report for precision, recall f1-score and accuracy
    matrix = classification_report(test_results,gold_results)
    print('Classification report : \n',matrix)

In [35]:
neigh_pred = knn_model(1,X_train,Y_train) # 1 neighbor
neigh_pred

array([0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1,
       1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1,
       1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1,
       1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1,
       1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0,
       0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0,
       1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0,
       0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1,
       0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1,

In [36]:
model_evaluation(Y_train,neigh_pred)

Predicted    0    1
Actual             
0          483   66
1           40  302
Confusion matrix : 
 [[483  66]
 [ 40 302]]
Classification report : 
               precision    recall  f1-score   support

           0       0.92      0.88      0.90       549
           1       0.82      0.88      0.85       342

    accuracy                           0.88       891
   macro avg       0.87      0.88      0.88       891
weighted avg       0.88      0.88      0.88       891



*Accuracy of K Nearest Neighbor with K=1 is 0.88*



In [37]:
neigh_pred = knn_model(3,X_train,Y_train) # 3 neighbor
model_evaluation(Y_train,neigh_pred)

Predicted    0    1
Actual             
0          500   49
1           79  263
Confusion matrix : 
 [[500  49]
 [ 79 263]]
Classification report : 
               precision    recall  f1-score   support

           0       0.86      0.91      0.89       549
           1       0.84      0.77      0.80       342

    accuracy                           0.86       891
   macro avg       0.85      0.84      0.85       891
weighted avg       0.86      0.86      0.85       891



*Accuracy of K Nearest Neighbor with K=3 is 0.86*


In [38]:
neigh_pred = knn_model(5,X_train,Y_train) # 5 neighbors
model_evaluation(Y_train,neigh_pred)

Predicted    0    1
Actual             
0          502   47
1           96  246
Confusion matrix : 
 [[502  47]
 [ 96 246]]
Classification report : 
               precision    recall  f1-score   support

           0       0.84      0.91      0.88       549
           1       0.84      0.72      0.77       342

    accuracy                           0.84       891
   macro avg       0.84      0.82      0.83       891
weighted avg       0.84      0.84      0.84       891



Accuracy of K Nearest Neighbor with K=5 is 0.84

In [39]:
neigh_pred = knn_model(7,X_train,Y_train) # 7 neighbors
model_evaluation(Y_train,neigh_pred)

Predicted    0    1
Actual             
0          497   52
1          108  234
Confusion matrix : 
 [[497  52]
 [108 234]]
Classification report : 
               precision    recall  f1-score   support

           0       0.82      0.91      0.86       549
           1       0.82      0.68      0.75       342

    accuracy                           0.82       891
   macro avg       0.82      0.79      0.80       891
weighted avg       0.82      0.82      0.82       891



Accuracy of K Nearest Neighbor with K=7 is 0.82

In [51]:
## choosing the model with one nearest neighbour. It has the highest accuracy of 0.88
# but the accuracy of the model increased for k=3 with weights='distance' parameters

# change the weighted distance metric
knn_clf = KNeighborsClassifier(n_neighbors=3, weights='distance')
knn_clf.fit(X_train, Y_train)
knn_pred = knn_clf.predict(X_train)

In [52]:
model_evaluation(Y_train, knn_pred)

Predicted    0    1
Actual             
0          534   15
1           57  285
Confusion matrix : 
 [[534  15]
 [ 57 285]]
Classification report : 
               precision    recall  f1-score   support

           0       0.90      0.97      0.94       549
           1       0.95      0.83      0.89       342

    accuracy                           0.92       891
   macro avg       0.93      0.90      0.91       891
weighted avg       0.92      0.92      0.92       891

