#Data Pre-Processing

## Importing data

In [258]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import scipy.stats as stats
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, accuracy_score
data_raw=pd.read_csv('/content/titanic_numbers.csv')

## Clean the data

In [259]:
data_raw

Unnamed: 0.1,Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked,Title
0,0,1,0,3,"Braund, Mr. Owen Harris",0,22.000000,1,0,7.2500,3,7
1,1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.000000,1,0,71.2833,1,4
2,2,3,1,3,"Heikkinen, Miss. Laina",1,26.000000,0,0,7.9250,3,5
3,3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.000000,1,0,53.1000,3,4
4,4,5,0,3,"Allen, Mr. William Henry",0,35.000000,0,0,8.0500,3,7
...,...,...,...,...,...,...,...,...,...,...,...,...
886,886,887,0,2,"Montvila, Rev. Juozas",0,27.000000,0,0,13.0000,3,2
887,887,888,1,1,"Graham, Miss. Margaret Edith",1,19.000000,0,0,30.0000,3,5
888,888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",1,21.777778,1,2,23.4500,3,5
889,889,890,1,1,"Behr, Mr. Karl Howell",0,26.000000,0,0,30.0000,1,7


In [260]:

#dividing embarked categorical places into columns
data = data_raw.copy()

data['Embarked Unknown'] = data.index*0
data['Embarked Cherbourg'] = data.index*0
data['Embarked Queenstown'] = data.index*0
data['Embarked Southampton'] = data.index*0
embarked = data.columns.get_loc("Embarked")
embarked_U = data.columns.get_loc("Embarked Unknown")
embarked_C = data.columns.get_loc("Embarked Cherbourg")
embarked_Q = data.columns.get_loc("Embarked Queenstown")
embarked_S = data.columns.get_loc("Embarked Southampton")

for id,row in data.iterrows():
  if data.iloc[id,embarked] == 0:
    data.iloc[id,embarked_U] =1
  elif data.iloc[id,embarked] == 1:
    data.iloc[id,embarked_C] =1
  elif data.iloc[id,embarked] == 2:
    data.iloc[id,embarked_Q] =1
  elif data.iloc[id,embarked] == 3:
    data.iloc[id,embarked_S] =1
  else:
    print("Error")

data.drop(["Embarked","Embarked Unknown","PassengerId","Unnamed: 0"], axis=1,inplace=True)

data

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Title,Embarked Cherbourg,Embarked Queenstown,Embarked Southampton
0,0,3,"Braund, Mr. Owen Harris",0,22.000000,1,0,7.2500,7,0,0,1
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.000000,1,0,71.2833,4,1,0,0
2,1,3,"Heikkinen, Miss. Laina",1,26.000000,0,0,7.9250,5,0,0,1
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.000000,1,0,53.1000,4,0,0,1
4,0,3,"Allen, Mr. William Henry",0,35.000000,0,0,8.0500,7,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,"Montvila, Rev. Juozas",0,27.000000,0,0,13.0000,2,0,0,1
887,1,1,"Graham, Miss. Margaret Edith",1,19.000000,0,0,30.0000,5,0,0,1
888,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",1,21.777778,1,2,23.4500,5,0,0,1
889,1,1,"Behr, Mr. Karl Howell",0,26.000000,0,0,30.0000,7,1,0,0


## Split into training & test sets

In [261]:
X = data.drop(["Name","Survived"],axis=1).values
Y = data.iloc[:, 0].values
print(X)
print(Y)

[[ 3.          0.         22.         ...  0.          0.
   1.        ]
 [ 1.          1.         38.         ...  1.          0.
   0.        ]
 [ 3.          1.         26.         ...  0.          0.
   1.        ]
 ...
 [ 3.          1.         21.77777778 ...  0.          0.
   1.        ]
 [ 1.          0.         26.         ...  1.          0.
   0.        ]
 [ 3.          0.         32.         ...  0.          1.
   0.        ]]
[0 1 1 1 0 0 0 0 1 1 1 1 0 0 0 1 0 1 0 1 0 1 1 1 0 1 0 0 1 0 0 1 1 0 0 0 1
 0 0 1 0 0 0 1 1 0 0 1 0 0 0 0 1 1 0 1 1 0 1 0 0 1 0 0 0 1 1 0 1 0 0 0 0 0
 1 0 0 0 1 1 0 1 1 0 1 1 0 0 1 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 1 1 0 1 0
 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 1 1 0 0 0 0 1 0 0 1 0 0 0 0 1 1 0 0 0 1 0
 0 0 0 1 0 0 0 0 1 0 0 0 0 1 0 0 0 1 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 1
 0 1 1 0 0 1 0 1 1 1 1 0 0 1 0 0 0 0 0 1 0 0 1 1 1 0 1 0 0 0 1 1 0 1 0 1 0
 0 0 1 0 1 0 0 0 1 0 0 1 0 0 0 1 0 0 0 1 0 0 0 0 0 1 1 0 0 0 0 0 0 1 1 1 1
 1 0 1 0 0 0 0 0 1 1 1 0 1 1 0 

In [262]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.20, random_state = 0)

In [263]:
print('---X_train---'*20)
print(X_train)
print('---X_test---'*20)
print(X_test)
print('---Y_train---'*20)
print(Y_train)
print('---Y_test---'*20)
print(Y_test)


---X_train------X_train------X_train------X_train------X_train------X_train------X_train------X_train------X_train------X_train------X_train------X_train------X_train------X_train------X_train------X_train------X_train------X_train------X_train------X_train---
[[ 3.          1.         35.64285714 ...  1.          0.
   0.        ]
 [ 2.          0.         31.         ...  0.          0.
   1.        ]
 [ 2.          0.         31.         ...  1.          0.
   0.        ]
 ...
 [ 3.          0.         30.72664459 ...  0.          1.
   0.        ]
 [ 3.          1.         36.         ...  0.          0.
   1.        ]
 [ 2.          0.         60.         ...  0.          0.
   1.        ]]
---X_test------X_test------X_test------X_test------X_test------X_test------X_test------X_test------X_test------X_test------X_test------X_test------X_test------X_test------X_test------X_test------X_test------X_test------X_test------X_test---
[[ 3.          0.         30.72664459 ...  1.         

## Feature Scaling

In [264]:
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [265]:
print('---X_train---'*20)
print(X_train)
print('---X_test---'*20)
print(X_test)

---X_train------X_train------X_train------X_train------X_train------X_train------X_train------X_train------X_train------X_train------X_train------X_train------X_train------X_train------X_train------X_train------X_train------X_train------X_train------X_train---
[[ 0.81925059  1.37207547  0.45396916 ...  2.12588331 -0.31426968
  -1.62827579]
 [-0.38096838 -0.72882288  0.10279677 ... -0.4703927  -0.31426968
   0.61414657]
 [-0.38096838 -0.72882288  0.10279677 ...  2.12588331 -0.31426968
  -1.62827579]
 ...
 [ 0.81925059 -0.72882288  0.08212095 ... -0.4703927   3.18198052
  -1.62827579]
 [ 0.81925059  1.37207547  0.48098242 ... -0.4703927  -0.31426968
   0.61414657]
 [-0.38096838 -0.72882288  2.29627354 ... -0.4703927  -0.31426968
   0.61414657]]
---X_test------X_test------X_test------X_test------X_test------X_test------X_test------X_test------X_test------X_test------X_test------X_test------X_test------X_test------X_test------X_test------X_test------X_test------X_test------X_test---
[[ 0.8

#Modelling

##Build the model

In [266]:
classifier = KNeighborsClassifier()

##Train the model

In [267]:
classifier.fit(X_train, Y_train)

KNeighborsClassifier()

##Make predictions

In [268]:
# 	Pclass,	Sex,	Age,	SibSp,	Parch,	Fare,	Title,	Embarked Cherbourg,	Embarked Queenstown,	Embarked Southampton
eu = [3,0,34,0,0,512.3292,6,0,1,0]
prediction = classifier.predict(sc.transform([eu]))
print(prediction)

[1]


In [269]:
prediction = classifier.predict(X_test)
print(prediction)

[0 0 0 1 1 0 1 1 0 0 0 1 0 1 1 1 0 0 0 1 0 1 0 0 0 1 1 1 1 0 0 1 0 0 0 1 0
 0 0 0 0 1 0 0 1 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 1 1 0 1 0 1 1 1 0 0 0
 0 1 1 1 0 0 0 1 1 0 0 1 1 0 1 0 0 0 1 1 0 0 1 0 0 0 0 0 0 0 1 1 1 0 0 1 0
 1 1 1 0 1 0 1 0 1 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 1 0 1 1 1 0 1
 1 0 0 0 0 0 1 0 1 0 1 1 0 0 1 0 1 0 0 0 0 0 0 1 0 0 0 0 1 0 0]


#Evaluation

##Calculate performance metrics

In [270]:
print(np.concatenate((prediction.reshape(len(prediction),1), Y_test.reshape(len(Y_test),1)),axis= 1))

[[0 0]
 [0 0]
 [0 0]
 [1 1]
 [1 1]
 [0 1]
 [1 1]
 [1 1]
 [0 1]
 [0 1]
 [0 0]
 [1 1]
 [0 0]
 [1 1]
 [1 1]
 [1 0]
 [0 0]
 [0 0]
 [0 0]
 [1 1]
 [0 0]
 [1 1]
 [0 0]
 [0 0]
 [0 0]
 [1 1]
 [1 0]
 [1 1]
 [1 1]
 [0 0]
 [0 0]
 [1 1]
 [0 0]
 [0 1]
 [0 0]
 [1 1]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 1]
 [1 0]
 [0 0]
 [0 0]
 [1 1]
 [0 0]
 [0 0]
 [1 1]
 [0 0]
 [0 0]
 [0 1]
 [0 1]
 [1 1]
 [0 0]
 [0 1]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 1]
 [0 0]
 [0 0]
 [1 1]
 [1 0]
 [1 1]
 [0 0]
 [1 1]
 [0 0]
 [1 1]
 [1 1]
 [1 1]
 [0 1]
 [0 0]
 [0 0]
 [0 0]
 [1 1]
 [1 0]
 [1 0]
 [0 0]
 [0 0]
 [0 0]
 [1 1]
 [1 0]
 [0 0]
 [0 0]
 [1 1]
 [1 1]
 [0 1]
 [1 1]
 [0 0]
 [0 0]
 [0 0]
 [1 1]
 [1 1]
 [0 0]
 [0 0]
 [1 1]
 [0 0]
 [0 0]
 [0 1]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [1 0]
 [1 1]
 [1 1]
 [0 0]
 [0 0]
 [1 1]
 [0 0]
 [1 1]
 [1 1]
 [1 0]
 [0 1]
 [1 1]
 [0 1]
 [1 1]
 [0 0]
 [1 1]
 [1 1]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 1]
 [0 0]
 [0 0]
 [1 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [1 0]
 [0 0]
 [0 0]
 [0 0]
 [1 1]
 [0 0]

In [271]:
cm = confusion_matrix(Y_test, prediction)
print(cm)

print(f'True Positive:{cm[1][1]}  True Negative:{cm[1][0]}  False Positive:{cm[0][1]}  False Negative:{cm[0][0]}')
right = cm[1][1]+cm[0][0]
wrong = cm[1][0]+cm[0][1]
print(f'Right:{right}  Wrong:{wrong}')


acc = accuracy_score(Y_test, prediction)
print(acc)

percent_of_right = 100*right/(right+wrong)
print(f'Chance of success: {percent_of_right}')


[[94 16]
 [20 49]]
True Positive:49  True Negative:20  False Positive:16  False Negative:94
Right:143  Wrong:36
0.7988826815642458
Chance of success: 79.88826815642459


##Make a verdict

This model has 79.9% of accuracy