#### Import Libraries

In [42]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import math

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

#### Load the data and preprocessing

In [2]:
# Read the prepared Titanic CSV into a DataFrame and preview the first rows.
df = pd.read_csv(filepath_or_buffer='titanic_edit.csv')
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title_Name,New_age
0,0,1,0,3,"Braund, Mr. Owen Harris",male,1,0,A/5 21171,7.25,,S,Mr,22.0
1,1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,1,0,PC 17599,71.2833,C85,C,Mrs,38.0
2,2,3,1,3,"Heikkinen, Miss. Laina",female,0,0,STON/O2. 3101282,7.925,,S,Miss,26.0
3,3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,1,0,113803,53.1,C123,S,Mrs,35.0
4,4,5,0,3,"Allen, Mr. William Henry",male,0,0,373450,8.05,,S,Mr,35.0


In [3]:
# List the column labels of the loaded frame.
df.columns

Index(['Unnamed: 0', 'PassengerId', 'Survived', 'Pclass', 'Name', 'Sex',
       'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked', 'Title_Name',
       'New_age'],
      dtype='object')

In [4]:
# Remove the columns that are not used as model inputs; `df` itself is left
# untouched and the reduced copy is stored in `data`.
data = df.drop(
    columns=[
        'Unnamed: 0',
        'Name',
        'Ticket',
        'Cabin',
        'New_age',
        'Title_Name',
        'PassengerId',
    ]
)
data.head()

Unnamed: 0,Survived,Pclass,Sex,SibSp,Parch,Fare,Embarked
0,0,3,male,1,0,7.25,S
1,1,1,female,1,0,71.2833,C
2,1,3,female,0,0,7.925,S
3,1,1,female,1,0,53.1,S
4,0,3,male,0,0,8.05,S


In [5]:
# Show the distinct raw labels of the two categorical columns before encoding.
for column in ('Sex', 'Embarked'):
    print('Unique value of {}'.format(column))
    print(data[column].unique())
print('C = Cherbourg, Q = Queenstown, S = Southampton')

Unique value of Sex
['male' 'female']
Unique value of Embarked
['S' 'C' 'Q']
C = Cherbourg, Q = Queenstown, S = Southampton


In [6]:
# Encode the two categorical columns as integers.
#
# * Each column gets its own LabelEncoder, so the fitted Sex mapping is not
#   discarded when the Embarked column is fitted afterwards.
# * Columns are addressed by name instead of by position, so the cell keeps
#   working if the column order of `data` changes.
# * Whole-column assignment (`data[col] = ...`) replaces the column, so it
#   takes the encoder's integer dtype; `data.iloc[:, i] = ...` sets values in
#   place on recent pandas versions and would leave the column as `object`.

# Encode the Sex column
sex_encoder = LabelEncoder()
data['Sex'] = sex_encoder.fit_transform(data['Sex'].values)

# Encode the Embarked column
# `le` ends up fitted on Embarked, exactly as it did before this change.
le = LabelEncoder()
data['Embarked'] = le.fit_transform(data['Embarked'].values)

In [7]:
# Preview the first rows of `data` to inspect the Sex and Embarked values.
data.head()

Unnamed: 0,Survived,Pclass,Sex,SibSp,Parch,Fare,Embarked
0,0,3,1,1,0,7.25,2
1,1,1,0,1,0,71.2833,0
2,1,3,0,0,0,7.925,2
3,1,1,0,1,0,53.1,2
4,0,3,1,0,0,8.05,2


In [8]:
# Show the integer codes now stored in the two encoded columns, together with
# a legend for each.
print('Unique values of Sex')
print(data.Sex.unique())
print('1: male, 0: female\n')

print('Unique values of Embarked')
print(data.Embarked.unique())
# LabelEncoder numbers the classes in sorted order (C, Q, S -> 0, 1, 2).
# The previous legend listed them in the wrong order; the encoded preview
# confirms that an 'S' row becomes 2 and a 'C' row becomes 0.
print('0 = Cherbourg, 1 = Queenstown, 2 = Southampton')

Unique values of Sex
[1 0]
1: male, 0: female

Unique values of Embarked
[2 0 1]
2 = Cherbourg, 0 = Queenstown, 1 = Southampton


In [9]:
# Count the missing entries in every column (all zeros means nothing to impute).
data.isnull().sum()

Survived    0
Pclass      0
Sex         0
SibSp       0
Parch       0
Fare        0
Embarked    0
dtype: int64

In [10]:
# Inspect each column's dtype to confirm that no non-numeric columns remain.
data.dtypes

Survived      int64
Pclass        int64
Sex           int32
SibSp         int64
Parch         int64
Fare        float64
Embarked      int32
dtype: object

All columns are now of a numeric type.

In [52]:
# Separate the target from the predictors: the first column of `data` is the
# target (y); every remaining column is a feature (x).
y = data.iloc[:, 0].to_numpy()
x = data.iloc[:, 1:].to_numpy()

In [53]:
# Hold out 20% of the rows for testing; the fixed random_state makes the
# split repeatable.
xtrain, xtest, ytrain, ytest = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=0,
)

In [54]:
# Standardise the features. The scaler is fitted on the training rows only and
# then applied, with those same statistics, to both the training and test rows.
sc = StandardScaler().fit(xtrain)
xtrain = sc.transform(xtrain)
xtest = sc.transform(xtest)

#### Apply Machine Learning Algorithms

##### Logistic Regression

In [14]:
# Logistic regression: fit on the training split, score on the held-out split.
log = LogisticRegression(random_state=0).fit(xtrain, ytrain)
ypred = log.predict(xtest)
accuracy = accuracy_score(y_true=ytest, y_pred=ypred)
print('Accuracy of the model:{}'.format(accuracy))

Accuracy of the model:0.7247191011235955


##### K-Nearest Neighbors

In [36]:
# K-nearest neighbours with 5 neighbours and Euclidean distance.
knn = KNeighborsClassifier(metric='euclidean', n_neighbors=5).fit(xtrain, ytrain)
ypred = knn.predict(xtest)
accuracy = accuracy_score(y_true=ytest, y_pred=ypred)
print('Accuracy of the model:{}'.format(accuracy))

Accuracy of the model:0.7247191011235955


In [30]:
# Sweep the number of neighbours (1..10, then 15 and 20) and report the
# test accuracy obtained with each value.
k_val = list(range(1, 11)) + [15, 20]

for k in k_val:
    knn = KNeighborsClassifier(metric='euclidean', n_neighbors=k).fit(xtrain, ytrain)
    ypred = knn.predict(xtest)
    accuracy = accuracy_score(y_true=ytest, y_pred=ypred)
    print('Prediction performance of KNN with K of {} is {}'.format(k, accuracy))

Prediction performance of KNN with K of 1 is 0.702247191011236
Prediction performance of KNN with K of 2 is 0.7078651685393258
Prediction performance of KNN with K of 3 is 0.7247191011235955
Prediction performance of KNN with K of 4 is 0.7134831460674157
Prediction performance of KNN with K of 5 is 0.7247191011235955
Prediction performance of KNN with K of 6 is 0.7078651685393258
Prediction performance of KNN with K of 7 is 0.702247191011236
Prediction performance of KNN with K of 8 is 0.7078651685393258
Prediction performance of KNN with K of 9 is 0.7303370786516854
Prediction performance of KNN with K of 10 is 0.7191011235955056
Prediction performance of KNN with K of 15 is 0.7359550561797753
Prediction performance of KNN with K of 20 is 0.7359550561797753


In [32]:
# Square root of the number of test samples, used as a rule-of-thumb k.
# NOTE(review): the sqrt(n) heuristic is usually quoted for the number of
# training samples; this uses the test-set size — confirm which was intended.
math.sqrt(ytest.shape[0])

13.341664064126334

In [37]:
# K-nearest neighbours with k = 13 and Euclidean distance.
knn = KNeighborsClassifier(metric='euclidean', n_neighbors=13).fit(xtrain, ytrain)
ypred = knn.predict(xtest)
accuracy = accuracy_score(y_true=ytest, y_pred=ypred)
print('Accuracy of the model:{}'.format(accuracy))

Accuracy of the model:0.7359550561797753


The accuracy of the KNN algorithm is not high enough. Trying different values of k and evaluating the model each time, we notice that the accuracy changes very little.

##### Support Vector Machine

In [16]:
# Support vector classifier with a linear kernel.
svm = SVC(kernel='linear', random_state=0).fit(xtrain, ytrain)
ypred = svm.predict(xtest)
accuracy = accuracy_score(y_true=ytest, y_pred=ypred)
print('Accuracy of the model:{}'.format(accuracy))

Accuracy of the model:0.7191011235955056


In [39]:
# Support vector classifier with an RBF kernel (gamma = 0.1, C = 1).
svm = SVC(kernel='rbf', C=1, gamma=0.1, random_state=0).fit(xtrain, ytrain)
ypred = svm.predict(xtest)
accuracy = accuracy_score(y_true=ytest, y_pred=ypred)
print('Accuracy of the model:{}'.format(accuracy))

Accuracy of the model:0.7359550561797753


In [41]:
# Support vector classifier with a degree-2 polynomial kernel (C = 1).
svm = SVC(kernel='poly', degree=2, C=1, random_state=0).fit(xtrain, ytrain)
ypred = svm.predict(xtest)
accuracy = accuracy_score(y_true=ytest, y_pred=ypred)
print('Accuracy of the model:{}'.format(accuracy))

Accuracy of the model:0.7359550561797753


##### Decision Tree

In [46]:
# Decision tree using the entropy criterion, with no depth limit set here.
dt = DecisionTreeClassifier(random_state=0, criterion='entropy').fit(xtrain, ytrain)
ypred = dt.predict(xtest)
accuracy = accuracy_score(y_true=ytest, y_pred=ypred)
print('Accuracy of the model:{}'.format(accuracy))

Accuracy of the model:0.7584269662921348


In [48]:
# Sweep the maximum tree depth (1..10, then 15 and 20) and report the test
# accuracy obtained with each limit.
depth = list(range(1, 11)) + [15, 20]

for d in depth:
    dt = DecisionTreeClassifier(max_depth=d, random_state=0, criterion='entropy').fit(xtrain, ytrain)
    ypred = dt.predict(xtest)
    accuracy = accuracy_score(y_true=ytest, y_pred=ypred)
    print('Accuracy with depth of {} is {}'.format(d, accuracy))

Accuracy with depth of 1 is 0.7191011235955056
Accuracy with depth of 2 is 0.7191011235955056
Accuracy with depth of 3 is 0.7303370786516854
Accuracy with depth of 4 is 0.7303370786516854
Accuracy with depth of 5 is 0.7303370786516854
Accuracy with depth of 6 is 0.7191011235955056
Accuracy with depth of 7 is 0.7415730337078652
Accuracy with depth of 8 is 0.7303370786516854
Accuracy with depth of 9 is 0.7415730337078652
Accuracy with depth of 10 is 0.7471910112359551
Accuracy with depth of 15 is 0.7584269662921348
Accuracy with depth of 20 is 0.7584269662921348


##### Random Forest

In [18]:
# Random forest with 10 trees and the entropy criterion.
rf = RandomForestClassifier(criterion='entropy', n_estimators=10, random_state=0).fit(xtrain, ytrain)
ypred = rf.predict(xtest)
accuracy = accuracy_score(y_true=ytest, y_pred=ypred)
print('Accuracy of the model:{}'.format(accuracy))

Accuracy of the model:0.7528089887640449


In [61]:
# Random forest with 100 trees and the entropy criterion.
rf = RandomForestClassifier(criterion='entropy', n_estimators=100, random_state=0).fit(xtrain, ytrain)
ypred = rf.predict(xtest)
accuracy = accuracy_score(y_true=ytest, y_pred=ypred)
print('Accuracy of the model:{}'.format(accuracy))

Accuracy of the model:0.7528089887640449
