In [None]:
import pandas as pd
import numpy as np

In [None]:
#Load the dataset on colab
df = pd.read_csv("https://github.com/andvise/DataAnalyticsDatasets/blob/8e8f6475f49d2a587e4f5c76cdf0b011b22c6ac1/titanic.csv?raw=true")

In [None]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,Siblings/Spouses Aboard,Parents/Children Aboard,Fare
0,1,0,3,Mr. Owen Harris Braund,male,22.0,1,0.0,7.25
1,2,1,1,Mrs. John Bradley (Florence Briggs Thayer) Cum...,female,38.0,1,0.0,71.2833
2,3,1,3,Miss. Laina Heikkinen,female,26.0,0,0.0,7.925
3,4,1,1,Mrs. Jacques Heath (Lily May Peel) Futrelle,female,35.0,1,,53.1
4,5,0,3,Mr. William Henry Allen,male,35.0,0,0.0,8.05


In [None]:
#Display the attributes' name and their data type
df.dtypes

PassengerId                  int64
Survived                     int64
Pclass                       int64
Name                        object
Sex                         object
Age                        float64
Siblings/Spouses Aboard      int64
Parents/Children Aboard    float64
Fare                       float64
dtype: object

In [None]:
#Delete the columns PassengerId and Name
df.drop(['PassengerId', 'Name'],axis=1, inplace=True)

In [None]:
#Replace all missing values with 0
df = df.fillna(0)

In [None]:
#Transform the Sex column into a numerical one
df['Sex'] = df['Sex'].astype('category').cat.codes

In [None]:
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Siblings/Spouses Aboard,Parents/Children Aboard,Fare
0,0,3,1,22.0,1,0.0,7.25
1,1,1,0,38.0,1,0.0,71.2833
2,1,3,0,26.0,0,0.0,7.925
3,1,1,0,35.0,1,0.0,53.1
4,0,3,1,35.0,0,0.0,8.05


In [None]:
#Use Survived as the target label and the rest of the data frame as features
y = df['Survived']
x = df.loc[:, df.columns != 'Survived']
#x=df.drop('Survived', axis=1)

In [None]:
#Divide your dataset in 80% for training and 20% for test
dat = np.random.rand(len(df)) < 0.8
x_train = x[dat]
x_test = x[~dat]
y_train = y[dat]
y_test = y[~dat]

In [None]:
#Scale the columns using min-max scalers
x_train -= x_train.min()
x_train /= x_train.max()

x_test -= x_test.min()
x_test /= x_test.max()

In [None]:
#Print the shape of the train and test set
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)

(691, 6)
(196, 6)
(691,)
(196,)


K-NN IMPLEMENTATION

In [None]:
import pandas as pd
import numpy as np
#Load the dataset on colab
df = pd.read_csv("https://github.com/andvise/DataAnalyticsDatasets/blob/8e8f6475f49d2a587e4f5c76cdf0b011b22c6ac1/titanic.csv?raw=true")
df.dtypes
df.drop(['PassengerId', 'Name'],axis=1, inplace=True)
df = df.fillna(0)
df['Sex'] = df['Sex'].astype('category').cat.codes
y = df['Survived']
x = df.loc[:, df.columns != 'Survived']
dat = np.random.rand(len(df)) < 0.8

x_train = x[dat]
x_test = x[~dat]
y_train = y[dat]
y_test = y[~dat]

x_train -= x_train.min()
x_train /= x_train.max()

x_test -= x_test.min()
x_test /= x_test.max()

def Euclidean_distance(x_train, x_test):
    summ = 0
    for i in range(6):
        sub = (x_test.iloc[i] - x_train[i])**2
        summ += sub
    return summ

def Get_Neighbors(x_train, x_test, num):
    lst = []
    for i in range(len(x_test)):
        distance = []
        neighbors = []
        data = []
        for j in range(len(x_train)):
            dist = Euclidean_distance(x_test.iloc[i], x_train.iloc[j])
            distance.append((dist, j))
        distance.sort()
        for t in range(num):
            neighbors.append(distance[t][1])
        #return neighbors
        for n in neighbors:
            data.append(y_train.iloc[n])
        ans = max(set(data), key=data.count)
        lst.append(ans)
    return lst

def confusion_matrix(lst,y_test):
    matrix=np.zeros((2,2)) 
    for m in range(len(lst)): #confusion matrix is for 2 classes: 1,0
        #1=positive, 0=negative
        if int(lst[m])==0 and int(y_test.iloc[m])==0:
            matrix[0,0]+=1 #True Positives
        elif int(lst[m])==1 and int(y_test.iloc[m])==0:
            matrix[0,1]+=1 #False Negatives
        elif int(lst[m])==0 and int(y_test.iloc[m])==1:
            matrix[1,0]+=1 #False Positives
        elif int(lst[m])==1 and int(y_test.iloc[m])==1:
            matrix[1,1]+=1 #True Negatives
    precision=matrix[0,0]/(matrix[0,0]+matrix[0,1])
    print("Precision:",precision)
    recall=matrix[0,0]/(matrix[0,0]+matrix[1,0])
    print("Recall:",recall)
    specificity=matrix[1,1]/(matrix[0,1]+matrix[1,1])
    print("Specificity:",specificity)
    negative_pred_value=matrix[1,1]/(matrix[1,0]+matrix[1,1])
    print("Negative Predicted Value:",negative_pred_value)
    f1=2*((precision*recall)/(precision+recall))
    print("F1 score:",f1)
    return matrix

#predicted = Get_Neighbors(x_train, x_test, 3)

def KNN(x_test, y_test,k):
    predictions = Get_Neighbors(x_train, x_test, k)
    cm = confusion_matrix(predictions, y_test)
   # print('ConfusionMatrix', cm)
    accuracy = (predictions == y_test).sum() / len(y_test)
    #print('Accuracy', accuracy)
    return cm,accuracy

#Confusion Matrix is showing, 
#True positive(0th row, 0th column) means predicted positive and true value is also same. 
#False Negative(0th row, 1st column) means predicted negative and it's false
#False Positive(1st row, 0th column) means predicted positve but true value is negative\false.
#True negative(1st row, 1st column) means predicted negative and it's true.

HYPERPARMETERS SEARCH

In [None]:
k_best = [1, 3, 5, 7, 9, 11]
pred_best = []
def K_best(x_test, y_test, K):
    for s in K:
        predictions_best= Get_Neighbors(x_train, x_test, s)
        cm, acc = KNN(x_test, y_test,s)
        print(s, '  : Accuracy', acc)
        print('confusion_matrix for k:', s, cm)
print(K_best(x_test, y_test, k_best))   

Precision: 0.8924731182795699
Recall: 0.7614678899082569
Specificity: 0.803921568627451
Negative Predicted Value: 0.6119402985074627
F1 score: 0.8217821782178217
1   : Accuracy 0.775
confusion_matrix for k: 1 [[83. 10.]
 [26. 41.]]
Precision: 0.9032258064516129
Recall: 0.8
Specificity: 0.8363636363636363
Negative Predicted Value: 0.6865671641791045
F1 score: 0.8484848484848486
3   : Accuracy 0.8125
confusion_matrix for k: 3 [[84.  9.]
 [21. 46.]]
Precision: 0.9247311827956989
Recall: 0.8113207547169812
Specificity: 0.8703703703703703
Negative Predicted Value: 0.7014925373134329
F1 score: 0.8643216080402009
5   : Accuracy 0.83125
confusion_matrix for k: 5 [[86.  7.]
 [20. 47.]]
Precision: 0.9032258064516129
Recall: 0.8
Specificity: 0.8363636363636363
Negative Predicted Value: 0.6865671641791045
F1 score: 0.8484848484848486
7   : Accuracy 0.8125
confusion_matrix for k: 7 [[84.  9.]
 [21. 46.]]
Precision: 0.8817204301075269
Recall: 0.803921568627451
Specificity: 0.8103448275862069
Negativ

For K=5, accuracy is better than other K's.



Weighted KNN

In [None]:
def weighted_KNN(xtrain,xtest,k):
    xtest = np.array(x_test)
    predict=[]
    survived =[]
    for i in range(len(x_test)):
        distance = []
        for j in range(len(x_train)):
            dist = Euclidean_distance(x_test.iloc[i], x_train.iloc[j])
            distance.append(dist)

            survived.append(int(y_train.iloc[j])) 
        
        distance.sort()
        sur_list =[]
        weight = []
        for i in distance[:k]:
            sur_list.append(survived[distance.index(i)])
            weight.append(1/i**2)
        wt_1 =0
        wt_0 =0
            
        for i in range(len(sur_list)):
            if survived[i] ==1:
                wt_1 +=weight[i]
            else:
                wt_0+=weight[i]
        if wt_1>wt_0:
            predict.append(1)
            
        else:
            predict.append(0)
            
    return predict

def acc_weighted_KNN():
  np.diag(yp_weighted)


In [None]:
yp_weighted = weighted_KNN(x_train, x_test, 3)
cm_wknn = confusion_matrix(yp_weighted, y_test)
print('Weighted K-NN Accuracy: ', np.sum(np.diag(cm_wknn))/np.sum(cm_wknn))

#No, it's does not perform better than normal K-NN.

Precision: 0.45161290322580644
Recall: 0.5454545454545454
Specificity: 0.3855421686746988
Negative Predicted Value: 0.47761194029850745
F1 score: 0.4941176470588235
Weighted K-NN Accuracy:  0.4625


Took reference from:
https://machinelearningmastery.com/tutorial-to-implement-k-nearest-neighbors-in-python-from-scratch/