# Importing All the Libraries Required

In [48]:
import math

import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

# Importing the Dataset

In [49]:
# Load the coursework dataset from a CSV path relative to the working
# directory, then display the resulting frame.
dataset_path = 'Titanic_coursework_entire_dataset_23-24.csv'
titanic_ds = pd.read_csv(dataset_path)

titanic_ds

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,Survival
0,1,3.0,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,S,0
1,2,1.0,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C,1
2,3,3.0,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,S,1
3,4,1.0,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,S,1
4,5,3.0,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,S,0
...,...,...,...,...,...,...,...,...,...,...,...
885,886,2.0,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,S,0
886,887,1.0,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,S,1
887,888,3.0,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,S,0
888,889,1.0,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C,1


In [50]:
# Count the missing values in each column (isnull is an alias of isna).
titanic_ds.isnull().sum()

PassengerId      0
Pclass           2
Name             0
Sex              0
Age            176
SibSp            0
Parch            0
Ticket           0
Fare             0
Embarked         2
Survival         0
dtype: int64

# Restructuring the Dataset

In [51]:
# Encode the string-valued columns as integer codes so they can be used as
# numeric model inputs. Each column gets its own LabelEncoder so that the
# fitted mapping for that column remains available afterwards.

le_name = LabelEncoder()
le_sex = LabelEncoder()
le_ticket = LabelEncoder()
le_embarked = LabelEncoder()

titanic_ds['name_encoded'] = le_name.fit_transform(titanic_ds['Name'])
titanic_ds['sex_encoded'] = le_sex.fit_transform(titanic_ds['Sex'])
titanic_ds['ticket_encoded'] = le_ticket.fit_transform(titanic_ds['Ticket'])

# 'Embarked' contains missing values. Fitting LabelEncoder on it directly
# either fails on the mixed str/float values or (newer scikit-learn) turns NaN
# into an extra class of its own, so the missing entries would never be
# imputed. Fill them with the most frequent port before encoding; the raw
# 'Embarked' column itself is left untouched.
most_common_port = titanic_ds['Embarked'].mode().iloc[0]
embarked_filled = titanic_ds['Embarked'].fillna(most_common_port)
titanic_ds['embarked_encoded'] = le_embarked.fit_transform(embarked_filled)

In [52]:
# Replace the missing values in the listed columns with the column mean,
# truncated to an integer.

nan_values = ['Pclass', 'Age', 'embarked_encoded']

for column in nan_values:
    # skipna=True ignores the missing entries; int() truncates toward zero.
    fill_value = int(titanic_ds[column].mean(skipna=True))
    # Series.fillna is used instead of replace(np.NaN, ...): the np.NaN alias
    # was removed in NumPy 2.0 and raises AttributeError there.
    titanic_ds[column] = titanic_ds[column].fillna(fill_value)

# Specifying the Feature and Target Values

In [53]:
# Features: every column except the target, the passenger id and the raw
# string columns (their *_encoded counterparts are kept).
non_feature_columns = ['Survival', 'PassengerId', 'Name', 'Sex', 'Ticket', 'Embarked']
x = titanic_ds.drop(columns=non_feature_columns)

# Target: the survival label.
y = titanic_ds['Survival']

x.head(10)

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,name_encoded,sex_encoded,ticket_encoded,embarked_encoded
0,3.0,22.0,1,0,7.25,108,1,522,2
1,1.0,38.0,1,0,71.2833,190,0,595,0
2,3.0,26.0,0,0,7.925,353,0,668,2
3,1.0,35.0,1,0,53.1,272,0,49,2
4,3.0,35.0,0,0,8.05,15,1,471,2
5,3.0,29.0,0,0,8.4583,554,1,275,1
6,1.0,54.0,0,0,51.8625,515,1,85,2
7,3.0,2.0,3,1,21.075,623,1,394,2
8,3.0,27.0,0,2,11.1333,412,0,344,2
9,2.0,14.0,1,0,30.0708,575,0,132,0


# Splitting the Data

In [65]:
# Hold out 27% of the rows for testing; the fixed random_state makes the
# split repeatable across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.27,
    random_state=42,
)

# Feature Scaling

In [66]:
# Standardise the features. The scaler is fitted on the training split only,
# and that same fitted scaler is then applied to both splits.
sc_x = StandardScaler().fit(x_train)
x_train = sc_x.transform(x_train)
x_test = sc_x.transform(x_test)

In [67]:
# Square root of the number of test samples. Presumably a rule-of-thumb
# starting point for n_neighbors (the model below uses 15) - confirm intent.
# `math` is imported in the notebook's import cell rather than here.
math.sqrt(len(y_test))

15.524174696260024

# Model Training and Evaluation

In [84]:
# k-nearest-neighbours model with 15 neighbours and Euclidean distance,
# fitted on the scaled training split.
classifier = KNeighborsClassifier(
    n_neighbors=15,
    p=2,
    metric='euclidean',
)
classifier.fit(x_train, y_train)

In [85]:
# Predict the survival label for every row of the scaled test split and
# display the predictions.
y_pred = classifier.predict(X=x_test)
y_pred

array([0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0,
       1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0,
       0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0,
       1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0,
       0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0,
       0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0],
      dtype=int64)

In [86]:
# Score the predictions against the held-out labels, then report each metric.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)

print("Accuracy: ", accuracy)
print("Precision:", precision)
print("Recall:", recall)

Accuracy:  0.8381742738589212
Precision: 0.8409090909090909
Recall: 0.7474747474747475


# Hyperparameter Tuning

In [93]:
# Second k-nearest-neighbours model: same Euclidean distance, but with 19
# neighbours instead of 15, fitted on the same scaled training split.
knn2 = KNeighborsClassifier(
    n_neighbors=19,
    p=2,
    metric='euclidean',
)
knn2.fit(x_train, y_train)

In [94]:
# Predict with the 19-neighbour model; this overwrites the earlier y_pred.
y_pred = knn2.predict(X=x_test)

In [95]:
# Report the same three metrics for the current y_pred, one line per metric.
metric_reports = (
    ("Accuracy: ", accuracy_score),
    ("Precision:", precision_score),
    ("Recall:", recall_score),
)
for label, metric_fn in metric_reports:
    print(label, metric_fn(y_test, y_pred))

Accuracy:  0.8423236514522822
Precision: 0.8588235294117647
Recall: 0.7373737373737373
