In [67]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import pyplot

import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyClassifier
from sklearn.neighbors import KNeighborsClassifier

## Dataset Source and Description

Titanic dataset includes data about which passengers survived the Titanic shipwreck

Source: https://www.kaggle.com/competitions/titanic/data


### Dataset Dictionary

<table>
<tbody>
<tr><th><b>Variable</b></th><th><b>Definition</b></th><th><b>Key</b></th></tr>
<tr>
<td>survival</td>
<td>Survival</td>
<td>0 = No, 1 = Yes</td>
</tr>
<tr>
<td>pclass</td>
<td>Ticket class</td>
<td>1 = 1st, 2 = 2nd, 3 = 3rd</td>
</tr>
<tr>
<td>sex</td>
<td>Sex</td>
<td></td>
</tr>
<tr>
<td>Age</td>
<td>Age in years</td>
<td></td>
</tr>
<tr>
<td>sibsp</td>
<td># of siblings / spouses aboard the Titanic</td>
<td></td>
</tr>
<tr>
<td>parch</td>
<td># of parents / children aboard the Titanic</td>
<td></td>
</tr>
<tr>
<td>ticket</td>
<td>Ticket number</td>
<td></td>
</tr>
<tr>
<td>fare</td>
<td>Passenger fare</td>
<td></td>
</tr>
<tr>
<td>cabin</td>
<td>Cabin number</td>
<td></td>
</tr>
<tr>
<td>embarked</td>
<td>Port of Embarkation</td>
<td>C = Cherbourg, Q = Queenstown, S = Southampton</td>
</tr>
</tbody>
</table>

1st = Upper
2nd = Middle
3rd = Lower

Age: Age is fractional if less than 1. If the age is estimated, is it in the form of xx.5

Sibsp: The dataset defines family relations in this way:

	Sibling = brother, sister, stepbrother, stepsister
	Spouse = husband, wife (mistresses and fiancés were ignored)

Parch: The dataset defines family relations in this way:

	Parent = mother, father
	Child = daughter, son, stepdaughter, stepson
Some children travelled only with a nanny, therefore parch=0 for them.



## Problem

To predict weather a passenger is survided or not


## Prepare Data

In [68]:
#Load the CSV into a Pandas Dataframe
titanic_data = pd.read_csv('./train.csv')

In [69]:
titanic_data.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


In [70]:
titanic_data.shape

(891, 12)

In [71]:
titanic_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


Note, that **there are missing values at the Age, Cabin and Embarked columns**.

### Sum up the total missing values

In [72]:
titanic_data.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

### Data Cleaning

Since most of Cabin column values are missing we will omit this column along with the Ticket, PassengerId and Name columns that we will not use for this initial investigation. 

We will make a new Dataframe in case we want to access the initial one again.

In [73]:
#Drop the unneeded columns
titanic_data_clean=titanic_data.drop(['Cabin','Ticket','Name','PassengerId'],axis=1)

In [74]:
titanic_data_clean.head(3)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S


### Deal with missing values

In [75]:
titanic_data_clean.isna().sum()

Survived      0
Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Fare          0
Embarked      2
dtype: int64

#### Deal with Age NaNs

Let's get some insight about Age values

In [76]:
titanic_data_clean.Age.describe()

count    714.000000
mean      29.699118
std       14.526497
min        0.420000
25%       20.125000
50%       28.000000
75%       38.000000
max       80.000000
Name: Age, dtype: float64

In [77]:
titanic_data_clean.Age.median()

28.0

We can replace Age NaN values with median value

In [78]:
age_median = titanic_data_clean['Age'].median()

titanic_data_clean['Age'] = titanic_data_clean['Age'].fillna(age_median)

#### Deal with missing values for Embarked 

In [79]:
titanic_data_clean.isna().sum()

Survived    0
Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Embarked    2
dtype: int64

We have only 2 missing values in Embarked column, so we can just delete these 2 rows

In [80]:
# drop all rows with NaNs for embarked
titanic_data_clean.dropna(axis=0, inplace=True)

In [81]:
titanic_data_clean.isna().sum()

Survived    0
Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Embarked    0
dtype: int64

### Checking for outliers

In [82]:
titanic_data_clean.boxplot(column=['Fare', 'Age', 'SibSp', 'Parch'], figsize=(9,9))

<AxesSubplot: >

In [83]:
titanic_data_clean.Fare.describe()

count    889.000000
mean      32.096681
std       49.697504
min        0.000000
25%        7.895800
50%       14.454200
75%       31.000000
max      512.329200
Name: Fare, dtype: float64

Find values in Fare which are > 300 and remove respective rows

In [84]:
# 
rows = titanic_data_clean[titanic_data_clean.Fare > 300]
rows


# titanic_data_clean = titanic_data_clean.drop(rows.index, axis=0)
titanic_data_clean.drop(rows.index, axis=0, inplace=True)

In [85]:
titanic_data_clean = titanic_data_clean.replace({'male':1,'female':0})

In [86]:
titanic_data_clean = titanic_data_clean.replace({'S':2,'C':1,'Q':0})

## Select features

In [87]:

# variables = titanic_data_clean.columns[1:]

# sns.set(style="ticks", color_codes=True)
# g = sns.pairplot(titanic_data_clean, 
#                  hue = 'Survived',
#                  vars=['Sex','Age','Fare'],                 
#                  markers=['o','s'])

In [88]:
# sns.pairplot(titanic_data_clean, hue="Survived", diag_kind="hist")
# plt.show()

In [89]:
titanic_data_clean.head(3)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,1,22.0,1,0,7.25,2
1,1,1,0,38.0,1,0,71.2833,1
2,1,3,0,26.0,0,0,7.925,2


In [90]:
df_ready = titanic_data_clean
df_ready.head(3)


Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,1,22.0,1,0,7.25,2
1,1,1,0,38.0,1,0,71.2833,1
2,1,3,0,26.0,0,0,7.925,2


### Convert categorical data to numerical
'male' => 1

'female' => 0


In [91]:
df_ready = df_ready.replace({'male':1,'female':0}, inplace=False)

In [92]:
df_ready.head(3)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,1,22.0,1,0,7.25,2
1,1,1,0,38.0,1,0,71.2833,1
2,1,3,0,26.0,0,0,7.925,2


In [93]:
# independent variables
# X = df_ready.drop(['Survived','Parch'], axis=1)
X = df_ready[['Pclass','Sex','Fare','Embarked']]
X.head(3)

Unnamed: 0,Pclass,Sex,Fare,Embarked
0,3,1,7.25,2
1,1,0,71.2833,1
2,3,0,7.925,2


In [94]:
# dependent variable (labels, target)
y= df_ready['Survived']
y.head(3)

0    0
1    1
2    1
Name: Survived, dtype: int64

## Train

In [95]:
X.shape

(886, 4)

In [96]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=None)

In [97]:
X_train.shape

(620, 4)

In [98]:
X_test.shape

(266, 4)

### Choose algorithm

We will use a [KNN clasifier](https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html)

In [99]:
scores = []

for k in range(1,11):
	knn = KNeighborsClassifier(n_neighbors=k) 
	knn.fit(X_train,y_train)

	y_predicted = knn.predict(X_test)
	scores.append(knn.score(X_test, y_test))

scores


[0.7443609022556391,
 0.7255639097744361,
 0.7669172932330827,
 0.7556390977443609,
 0.7518796992481203,
 0.7218045112781954,
 0.7443609022556391,
 0.7255639097744361,
 0.7255639097744361,
 0.7218045112781954]

In [80]:
knn.score(X_test, y_test)

  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


0.6917293233082706