# K-NN (K-Nearest Neighbor)

In [1]:
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn import neighbors
from warnings import filterwarnings
# NOTE(review): this installs a blanket "ignore" filter, so every warning
# category (including deprecation warnings from pandas / scikit-learn) is
# hidden for the rest of the session — consider narrowing it to one category.
filterwarnings('ignore')

In [2]:
# Read the training CSV from the working directory, then preview its first rows.
dataset = pd.read_csv(filepath_or_buffer='train.csv')
dataset.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
# Show the column labels of the loaded frame.
dataset.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [4]:
# Show the frame's (rows, columns) dimensions.
dataset.shape

(889, 12)

In [5]:
# Preprocessing: count the missing values in each column.
dataset.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         0
dtype: int64

In [6]:
# Drop Cabin: the null counts above show 687 missing values out of 889 rows.
# The result is rebound instead of using inplace=True (the style pandas
# recommends); `columns=` already selects the axis, so axis=1 is not needed.
dataset = dataset.drop(columns=['Cabin'])

In [7]:
# Preview the frame to confirm that Cabin is no longer present.
dataset.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,S


In [8]:
# Re-check the per-column missing-value counts after dropping Cabin.
dataset.isnull().sum()

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       0
dtype: int64

In [9]:
# Dropping columns treated as irrelevant for modelling (PassengerId, Name, Ticket).
# The result is rebound instead of using inplace=True; `columns=` already
# selects the axis, so the extra axis=1 argument is not needed.
dataset = dataset.drop(columns=['PassengerId', 'Name', 'Ticket'])

In [10]:
# Preview the columns that remain after the drop.
dataset.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S


In [11]:
 # Conversion of variables
# Encode each categorical column as integers with its OWN LabelEncoder.
# Re-fitting a single shared encoder would overwrite the Sex mapping when
# Embarked is fitted; keeping one encoder per column lets either column be
# decoded later with encoders[column].inverse_transform(...).
encoders = {}
for column in ('Sex', 'Embarked'):
    le = preprocessing.LabelEncoder()
    dataset[column] = le.fit_transform(dataset[column])
    encoders[column] = le
# After the loop `le` is the encoder fitted on Embarked, as it was before.

In [12]:
# Preview the frame after Sex and Embarked were replaced by integer codes.
dataset.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,1,22.0,1,0,7.25,2
1,1,1,0,38.0,1,0,71.2833,0
2,1,3,0,26.0,0,0,7.925,2
3,1,1,0,35.0,1,0,53.1,2
4,0,3,1,35.0,0,0,8.05,2


In [13]:
# Separate the target (Pclass) from the feature columns, then hold out 30% of
# the rows for testing; random_state=0 keeps the split reproducible.
# NOTE(review): Pclass is the target here and Survived stays among the
# features — confirm this is the intended prediction task.
X = dataset.drop(columns=['Pclass'])
y = dataset['Pclass']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=0, test_size=0.3
)

In [14]:
# Preview the training features.
X_train.head()

Unnamed: 0,Survived,Sex,Age,SibSp,Parch,Fare,Embarked
350,0,1,45.0,0,0,35.0,2
124,1,1,12.0,1,0,11.2417,0
577,0,0,45.0,1,0,14.4583,0
422,0,0,28.0,1,1,14.4,2
118,0,0,2.0,4,2,31.275,2


In [15]:
# (rows, columns) of the training features.
X_train.shape

(622, 7)

In [16]:
# Preview the held-out test features.
X_test.head()

Unnamed: 0,Survived,Sex,Age,SibSp,Parch,Fare,Embarked
14,0,0,14.0,0,0,7.8542,2
158,0,1,45.0,8,2,69.55,2
762,1,0,36.0,1,2,120.0,2
740,0,1,36.0,1,0,78.85,2
482,1,0,63.0,0,0,9.5875,2


In [17]:
# (rows, columns) of the test features.
X_test.shape

(267, 7)

In [18]:
# Applying K-NN using function for iterations
def KNN(k):
    """Fit a k-nearest-neighbours classifier and report how it does on the test split.

    Reads the notebook-level ``X_train``, ``y_train``, ``X_test`` and ``y_test``.
    Prints the confusion matrix and the accuracy obtained for this ``k``.

    Parameters
    ----------
    k : int
        Number of neighbours, passed to ``KNeighborsClassifier(n_neighbors=k)``.

    Returns
    -------
    float
        Accuracy on the test split, so callers can compare values of ``k`` in
        code instead of reading the printed output.
    """
    knn=neighbors.KNeighborsClassifier(n_neighbors=k)
    # fit() alone is enough here: the test split is scored once, below, through
    # accuracy_score, so no separate .score() pass is needed.
    knn.fit(X_train,y_train)
    ypred=knn.predict(X_test)
    accuracy=accuracy_score(y_test,ypred,normalize=True)
    print("\nConfusion matrix when k=",k,"is :\n",confusion_matrix(y_test,ypred))
    print("Accuracy score when k =",k,"is :",accuracy)
    return accuracy


# Sweep k over 1..267 and keep every accuracy, so the best k is computed
# rather than picked out of the printed log by eye.
# NOTE(review): k is compared on the test split itself; cross-validating on
# X_train would avoid tuning against the data used for the final score.
r=range(1,268)
accuracy_by_k={}
for i in r:
    accuracy_by_k[i]=KNN(i)

# max() returns the first maximal key, i.e. the smallest k among ties.
best_k=max(accuracy_by_k,key=accuracy_by_k.get)
print("\nBest k =",best_k,"with accuracy",accuracy_by_k[best_k])


Confusion matrix when k= 1 is :
 [[ 62   6   2]
 [  3  37   9]
 [  1   8 139]]
Accuracy score when k = 1 is : 0.8913857677902621

Confusion matrix when k= 2 is :
 [[ 67   2   1]
 [  8  37   4]
 [  4  17 127]]
Accuracy score when k = 2 is : 0.8651685393258427

Confusion matrix when k= 3 is :
 [[ 63   5   2]
 [  7  28  14]
 [  3   5 140]]
Accuracy score when k = 3 is : 0.8651685393258427

Confusion matrix when k= 4 is :
 [[ 61   7   2]
 [  8  30  11]
 [  4  15 129]]
Accuracy score when k = 4 is : 0.8239700374531835

Confusion matrix when k= 5 is :
 [[ 58  10   2]
 [  7  28  14]
 [  3   9 136]]
Accuracy score when k = 5 is : 0.8314606741573034

Confusion matrix when k= 6 is :
 [[ 64   5   1]
 [  7  32  10]
 [  3  15 130]]
Accuracy score when k = 6 is : 0.846441947565543

Confusion matrix when k= 7 is :
 [[ 59  10   1]
 [  7  28  14]
 [  3   7 138]]
Accuracy score when k = 7 is : 0.8426966292134831

Confusion matrix when k= 8 is :
 [[ 61   8   1]
 [  7  34   8]
 [  3  13 132]]
Accuracy sc

Accuracy score when k = 68 is : 0.7602996254681648

Confusion matrix when k= 69 is :
 [[ 56   7   7]
 [  5  11  33]
 [  5   6 137]]
Accuracy score when k = 69 is : 0.7640449438202247

Confusion matrix when k= 70 is :
 [[ 57   6   7]
 [  5  11  33]
 [  5   7 136]]
Accuracy score when k = 70 is : 0.7640449438202247

Confusion matrix when k= 71 is :
 [[ 57   6   7]
 [  5  11  33]
 [  5   7 136]]
Accuracy score when k = 71 is : 0.7640449438202247

Confusion matrix when k= 72 is :
 [[ 56   7   7]
 [  5  11  33]
 [  5   8 135]]
Accuracy score when k = 72 is : 0.7565543071161048

Confusion matrix when k= 73 is :
 [[ 56   7   7]
 [  5  11  33]
 [  5   6 137]]
Accuracy score when k = 73 is : 0.7640449438202247

Confusion matrix when k= 74 is :
 [[ 57   6   7]
 [  5  11  33]
 [  5   6 137]]
Accuracy score when k = 74 is : 0.7677902621722846

Confusion matrix when k= 75 is :
 [[ 56   7   7]
 [  5  11  33]
 [  5   6 137]]
Accuracy score when k = 75 is : 0.7640449438202247

Confusion matrix when k=


Confusion matrix when k= 134 is :
 [[ 52  13   5]
 [  5  11  33]
 [  5   4 139]]
Accuracy score when k = 134 is : 0.7565543071161048

Confusion matrix when k= 135 is :
 [[ 52  13   5]
 [  5  11  33]
 [  5   3 140]]
Accuracy score when k = 135 is : 0.7602996254681648

Confusion matrix when k= 136 is :
 [[ 52  13   5]
 [  5  11  33]
 [  5   3 140]]
Accuracy score when k = 136 is : 0.7602996254681648

Confusion matrix when k= 137 is :
 [[ 52  13   5]
 [  5  11  33]
 [  5   3 140]]
Accuracy score when k = 137 is : 0.7602996254681648

Confusion matrix when k= 138 is :
 [[ 52  13   5]
 [  5  11  33]
 [  5   3 140]]
Accuracy score when k = 138 is : 0.7602996254681648

Confusion matrix when k= 139 is :
 [[ 52  13   5]
 [  5  11  33]
 [  5   3 140]]
Accuracy score when k = 139 is : 0.7602996254681648

Confusion matrix when k= 140 is :
 [[ 52  13   5]
 [  5  11  33]
 [  5   3 140]]
Accuracy score when k = 140 is : 0.7602996254681648

Confusion matrix when k= 141 is :
 [[ 52  13   5]
 [  5  11  

Accuracy score when k = 199 is : 0.7191011235955056

Confusion matrix when k= 200 is :
 [[ 48   6  16]
 [  5   1  43]
 [  5   0 143]]
Accuracy score when k = 200 is : 0.7191011235955056

Confusion matrix when k= 201 is :
 [[ 48   6  16]
 [  5   2  42]
 [  5   0 143]]
Accuracy score when k = 201 is : 0.7228464419475655

Confusion matrix when k= 202 is :
 [[ 48   6  16]
 [  4   2  43]
 [  5   0 143]]
Accuracy score when k = 202 is : 0.7228464419475655

Confusion matrix when k= 203 is :
 [[ 48   6  16]
 [  4   2  43]
 [  5   0 143]]
Accuracy score when k = 203 is : 0.7228464419475655

Confusion matrix when k= 204 is :
 [[ 48   6  16]
 [  4   2  43]
 [  5   0 143]]
Accuracy score when k = 204 is : 0.7228464419475655

Confusion matrix when k= 205 is :
 [[ 48   6  16]
 [  4   2  43]
 [  5   0 143]]
Accuracy score when k = 205 is : 0.7228464419475655

Confusion matrix when k= 206 is :
 [[ 48   6  16]
 [  4   2  43]
 [  5   0 143]]
Accuracy score when k = 206 is : 0.7228464419475655

Confusion


Confusion matrix when k= 264 is :
 [[ 48   0  22]
 [  4   1  44]
 [  5   0 143]]
Accuracy score when k = 264 is : 0.7191011235955056

Confusion matrix when k= 265 is :
 [[ 48   0  22]
 [  4   1  44]
 [  5   0 143]]
Accuracy score when k = 265 is : 0.7191011235955056

Confusion matrix when k= 266 is :
 [[ 47   0  23]
 [  4   1  44]
 [  5   0 143]]
Accuracy score when k = 266 is : 0.7153558052434457

Confusion matrix when k= 267 is :
 [[ 47   1  22]
 [  4   1  44]
 [  5   0 143]]
Accuracy score when k = 267 is : 0.7153558052434457


# Inference
The accuracy is highest at k=1 (≈0.8914) and, with some fluctuation between neighbouring values of k, generally decreases as k increases.

The accuracy is lowest at k=267 (≈0.7154); k=266 gives the same value.

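Hence, we use k=1, which gives the most accurate predictions on this test split. Note that k was chosen by comparing accuracy on the test set itself, so this figure is an optimistic estimate; selecting k by cross-validation on the training set would give a more reliable choice.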