# Important Library

In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score , confusion_matrix
from sklearn import neighbors

# Loading Data Set

In [5]:
data = pd.read_csv("train.csv")

In [7]:
data.head(2)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C


In [8]:
data.shape

(891, 12)

# Converting categorical variable into numerical

In [13]:
le = LabelEncoder()

In [14]:
data['Sex'] = le.fit_transform(data['Sex'])

In [15]:
data['Embarked'] = le.fit_transform(data['Embarked'].astype('str'))

In [16]:
data.head(2)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",1,29.0,1,0,A/5 21171,7.25,,2
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,29.0,1,0,PC 17599,71.2833,C85,0


# Handling Missing value

In [10]:
data['Age'].mean()

29.69911764705882

In [11]:
age_new_var = np.where(data['Age'],29,data['Age'])

In [12]:
data['Age'] = age_new_var

In [17]:
data['Embarked'] = data['Embarked'].fillna(method='bfill')

In [18]:
data.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         0
dtype: int64

In [50]:
data = data.drop(['PassengerId','Name','Ticket','Cabin'],axis=1)

In [51]:
data.columns

Index(['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare',
       'Embarked'],
      dtype='object')

In [52]:
X = data.drop('Pclass',axis=1)
y = data['Pclass']

# Spliting data into train and test

In [21]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=0)

In [23]:
print(X_train.shape,X_test.shape,y_train.shape,y_test.shape)

(623, 7) (268, 7) (623,) (268,)


# KNN Classifier Algorithm

In [32]:
knn = neighbors.KNeighborsClassifier(n_neighbors=3)

In [48]:
knn.fit(X_train,y_train).score(X_test,y_test)

0.9029850746268657

In [34]:
y_pred = knn.predict(X_test)

In [35]:
confusion_matrix(y_pred,y_test)

array([[ 56,   7,   0],
       [  1,  37,   7],
       [  3,   8, 149]], dtype=int64)

In [37]:
X_test.shape[0]

268

# KNN Classifier General Function

In [45]:
def KNN(X,y):
    X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=0)
    max_accuracy =0
    k = 0
    for i in range(2,X_test.shape[0]):
        knn = neighbors.KNeighborsClassifier(n_neighbors=i)
        accuracy = knn.fit(X_train,y_train).score(X_test,y_test)
        #y_pred = knn.predict(X_test)
        if accuracy > max_accuracy:
            y_pred = knn.predict(X_test)
            max_accuracy = accuracy
            k = i
    
    print("Value of k is : ",k)
    print("Maximum Accuracy : ",max_accuracy)
    print("Confusion Matrix \n",confusion_matrix(y_pred,y_test))
    
  

In [None]:
#DV -> Pclass
# IDV -> Remaining

In [46]:
KNN(X,y)

Value of k is :  3
Maximum Accuracy :  0.9029850746268657
Confusion Matrix 
 [[ 56   7   0]
 [  1  37   7]
 [  3   8 149]]


In [47]:
(56+37+149)/268

0.9029850746268657

In [53]:
# DV -> Survived
# IDV -> Remaining
X = data.drop('Survived',axis=1)
y = data['Survived']

In [54]:
KNN(X,y)

Value of k is :  31
Maximum Accuracy :  0.7649253731343284
Confusion Matrix 
 [[148  43]
 [ 20  57]]


In [55]:
(148+57)/268

0.7649253731343284