In [4]:
#Step 1: Let's begin by importing the necessary packages

from scipy.spatial.distance import euclidean as euc
import numpy as np
np.random.seed(0)
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

#Step 2: Setting up a class for our K-Nearest-Neighbour algorithm and the relevant formulas

class KNN():
    """Minimal brute-force K-Nearest-Neighbour classifier.

    ``fit`` stores the training data; ``predict`` labels each query point by
    majority vote among its ``k`` closest training points, using the
    Euclidean distance. Labels may be any values ``np.unique`` can sort
    (integers of either sign, strings, ...).
    """

    def fit(self, X_train, y_train):
        """Store the training data (X_train) and its corresponding labels (y_train).

        Raises:
            ValueError: if X_train and y_train do not have the same length.
        """
        if len(X_train) != len(y_train):
            raise ValueError(
                "X_train and y_train must have the same length "
                "(got {} and {}).".format(len(X_train), len(y_train))
            )
        self.X_train = X_train
        self.y_train = y_train

    def _get_distances(self, x):
        """Return a list of ``(index, distance)`` pairs, one per training point.

        ``index`` is the position of the training point in ``self.X_train`` and
        ``distance`` is its Euclidean distance to the unlabelled point ``x``.
        """
        # The index travels with the distance so the label can be looked up
        # after the pairs have been sorted.
        return [(ind, euc(x, val)) for ind, val in enumerate(self.X_train)]

    def _get_k_nearest(self, dists, k):
        """Return the ``k`` pairs of ``dists`` with the smallest distances.

        The pairs are sorted in ascending order of distance; ``sorted`` is
        stable, so equidistant points keep their training-set order. If ``k``
        exceeds ``len(dists)`` every pair is returned.
        """
        sorted_dists = sorted(dists, key=lambda pair: pair[1])  # sort on the distance column
        return sorted_dists[:k]

    def _get_label_prediction(self, k_nearest):
        """Return the most common label amongst the given neighbours.

        ``k_nearest`` is a sequence of ``(index, distance)`` pairs whose indices
        refer to ``self.y_train``. When several labels are tied for the highest
        count, the smallest label (in sorted order) is returned.

        Raises:
            ValueError: if ``k_nearest`` is empty.
        """
        labels = [self.y_train[i] for i, _ in k_nearest]
        if not labels:
            raise ValueError("Cannot predict a label from an empty set of neighbours.")
        # np.unique returns the distinct labels in sorted order together with
        # their counts, so this works for any sortable label type, whereas
        # np.bincount only accepts non-negative integers.
        values, counts = np.unique(labels, return_counts=True)
        return values[np.argmax(counts)]  # argmax keeps the first (smallest) label on ties

    def predict(self, X_test, k=3):
        """Predict a label for every unlabelled point in X_test.

        Args:
            X_test: iterable of unlabelled points.
            k: number of nearest neighbours that vote on each label (>= 1).

        Returns:
            A list with one predicted label per point in X_test.

        Raises:
            ValueError: if ``k`` is smaller than 1.
        """
        if k < 1:
            raise ValueError("k must be at least 1 (got {}).".format(k))
        preds = []
        for point in X_test:
            dists = self._get_distances(point)  # distance to every labelled point
            k_nearest = self._get_k_nearest(dists, k)  # the k closest labelled points
            preds.append(self._get_label_prediction(k_nearest))  # majority vote
        return preds

#Step 3: Instantiating the KNN class. Its methods (fit, predict and the private helpers) are already
#available on the instance, so nothing else needs to be attached here.

knn = KNN()

#Step 4: Importing and cleaning the data

# Load the raw table and drop the columns that are not used as features.
raw_df = pd.read_csv('titanic.csv')
df = raw_df.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1)

# Encode 'Sex' as 0/1 and fill missing ages with the median age.
df['Sex'] = df['Sex'].map({'female': 0, 'male': 1})
df['Age'] = df['Age'].fillna(value=df['Age'].median())

# Discard any rows that still contain missing values.
df = df.dropna()

# One-hot encode the remaining non-numeric columns.
one_hot_df = pd.get_dummies(df)

# Separate the target column from the features.
labels = np.array(one_hot_df['Survived'])

# Force a float feature matrix: depending on the pandas version the dummy
# columns may be boolean, and converting a frame that mixes bool with numeric
# columns without an explicit dtype would produce an object array.
one_hot_df = np.asarray(one_hot_df.drop('Survived', axis=1), dtype=float)

#Step 5: Splitting our data into train and test

# Hold out 25% of the rows for testing; the rest is used for training.
split = train_test_split(one_hot_df, labels, test_size=0.25)
X_train, X_test, y_train, y_test = split

#Step 6: Fitting our KNN classifier on the training split and scoring it on the test split

knn.fit(X_train, y_train)
preds = knn.predict(X_test, k=3)
test_accuracy = accuracy_score(y_test, preds)
print("Testing Accuracy: {}".format(test_accuracy))

Testing Accuracy: 0.6860986547085202
