# Titanic dataset -  “what sorts of people were more likely to survive?” 

## Data Preprocessing

### Import Python libraries

In [33]:
# Importing Python Libraries for preprocessing
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt

### Load dataset

In [34]:
# Relative paths to the training and test CSV files
train_file, test_file = "dataset/train.csv", "dataset/test.csv"

In [37]:
# `train_data` must exist before it can be previewed, so it is loaded in this
# cell; this keeps the notebook runnable top-to-bottom on a fresh kernel.
train_data = pd.read_csv(train_file)
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


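**The dataset contains 12 columns. After dropping the identifier and free-text columns and one-hot encoding the categorical ones, 9 columns (the `Survived` target plus 8 model inputs) are used for building our model.**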

In [35]:
# Read the training CSV into a pandas DataFrame, then display summary
# statistics for it
train_data = pd.read_csv(filepath_or_buffer=train_file)
train_data.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [36]:
# Read the test CSV into a pandas DataFrame, then display summary
# statistics for it
test_data = pd.read_csv(filepath_or_buffer=test_file)
test_data.describe()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare
count,418.0,418.0,332.0,418.0,418.0,417.0
mean,1100.5,2.26555,30.27259,0.447368,0.392344,35.627188
std,120.810458,0.841838,14.181209,0.89676,0.981429,55.907576
min,892.0,1.0,0.17,0.0,0.0,0.0
25%,996.25,1.0,21.0,0.0,0.0,7.8958
50%,1100.5,3.0,27.0,0.0,0.0,14.4542
75%,1204.75,3.0,39.0,1.0,0.0,31.5
max,1309.0,3.0,76.0,8.0,9.0,512.3292


### Data Transformation

In [38]:
# One-hot encode the categorical Sex and Embarked features.
# drop_first=True omits the first level of each feature; that level is then
# represented by zeros in the feature's remaining indicator columns.
# NOTE(review): by default get_dummies creates no indicator for NaN, so a row
# with a missing Sex/Embarked value would be encoded as all zeros, i.e. the
# same as the dropped level -- confirm these columns have no missing values.
cat_variables = train_data.loc[:, ['Sex', 'Embarked']]
cat_dummies = pd.get_dummies(data=cat_variables, drop_first=True)
cat_dummies.head()  # preview of the encoded features stored in cat_dummies

Unnamed: 0,Sex_male,Embarked_Q,Embarked_S
0,1,0,1
1,0,0,0
2,0,0,1
3,0,0,1
4,1,0,1


In [39]:
# Drop the columns that are not fed to the model directly. Sex and Embarked
# are removed because their one-hot encodings are held in cat_dummies.
# errors='ignore' keeps this cell re-runnable: train_data is reassigned here,
# so on a second run the columns are already gone and a plain drop would
# raise KeyError.
train_data = train_data.drop(
    columns=['Sex', 'Embarked', 'Ticket', 'Cabin', 'Name', 'PassengerId'],
    errors='ignore',
)
# Concatenate the remaining columns with the encoded features, column-wise
num_data = pd.concat([train_data, cat_dummies], axis=1)
num_data.head()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Sex_male,Embarked_Q,Embarked_S
0,0,3,22.0,1,0,7.25,1,0,1
1,1,1,38.0,1,0,71.2833,0,0,0
2,1,3,26.0,0,0,7.925,0,0,1
3,1,1,35.0,1,0,53.1,0,0,1
4,0,3,35.0,0,0,8.05,1,0,1


**New dataset containing only numerical values**

### Check for class imbalance

In [40]:
# Count the non-null entries of every column within each Survived class
num_data.groupby("Survived").count()

Unnamed: 0_level_0,Pclass,Age,SibSp,Parch,Fare,Sex_male,Embarked_Q,Embarked_S
Survived,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,549,424,549,549,549,549,549,549
1,342,290,342,342,342,342,342,342


**The data set contains about 62% individuals who died in the sinking and about 38% who survived, showing a class imbalance.**

**By dropping the instances where the individual didn't survive and is missing their age, while imputing the age for instances where the individual survived but is missing it, we can reduce the class imbalance: 44.6% of the remaining instances are survivors and 55.4% are non-survivors.**

### Handling Missing Data

In [41]:
# Number of missing (NaN) entries in each column of num_data
missing_values = num_data.isna().sum()

In [42]:
# Display the per-column count of missing values computed from num_data
missing_values

Survived        0
Pclass          0
Age           177
SibSp           0
Parch           0
Fare            0
Sex_male        0
Embarked_Q      0
Embarked_S      0
dtype: int64

#### First, Scaling the dataset

In [45]:
from sklearn.preprocessing import MinMaxScaler

In [44]:
# Summary statistics of num_data before scaling, for comparison with the
# scaled frame
num_data.describe()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Sex_male,Embarked_Q,Embarked_S
count,891.0,891.0,714.0,891.0,891.0,891.0,891.0,891.0,891.0
mean,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208,0.647587,0.08642,0.722783
std,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429,0.47799,0.281141,0.447876
min,0.0,1.0,0.42,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,2.0,20.125,0.0,0.0,7.9104,0.0,0.0,0.0
50%,0.0,3.0,28.0,0.0,0.0,14.4542,1.0,0.0,1.0
75%,1.0,3.0,38.0,1.0,0.0,31.0,1.0,0.0,1.0
max,1.0,3.0,80.0,8.0,6.0,512.3292,1.0,1.0,1.0


**DataFrame before scaling**

In [46]:
# Min-max scale every column of num_data and wrap the result back into a
# DataFrame. Both the column labels AND the index of num_data are carried
# over: without index= the rows would be silently relabelled 0..n-1 whenever
# num_data does not have a default integer index, breaking any later
# index-based alignment with the unscaled data.
scaler = MinMaxScaler()
scaled_data = pd.DataFrame(
    scaler.fit_transform(num_data),
    index=num_data.index,
    columns=num_data.columns,
)
scaled_data.describe()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Sex_male,Embarked_Q,Embarked_S
count,891.0,891.0,714.0,891.0,891.0,891.0,891.0,891.0,891.0
mean,0.383838,0.654321,0.367921,0.065376,0.063599,0.062858,0.647587,0.08642,0.722783
std,0.486592,0.418036,0.18254,0.137843,0.134343,0.096995,0.47799,0.281141,0.447876
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.5,0.247612,0.0,0.0,0.01544,0.0,0.0,0.0
50%,0.0,1.0,0.346569,0.0,0.0,0.028213,1.0,0.0,1.0
75%,1.0,1.0,0.472229,0.125,0.0,0.060508,1.0,0.0,1.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


**DataFrame after scaling**

In [47]:
# Partition the scaled rows by outcome: Survived == 1 versus Survived == 0
survived = scaled_data[scaled_data['Survived'] == 1]
dead = scaled_data[scaled_data['Survived'] == 0]

In [48]:
survived # DataFrame containing only the passengers who survived; rows with a missing Age here are intended to be filled by KNN imputation

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Sex_male,Embarked_Q,Embarked_S
1,1.0,0.0,0.472229,0.125,0.000000,0.139136,0.0,0.0,0.0
2,1.0,1.0,0.321438,0.000,0.000000,0.015469,0.0,0.0,1.0
3,1.0,0.0,0.434531,0.125,0.000000,0.103644,0.0,0.0,1.0
8,1.0,1.0,0.334004,0.000,0.333333,0.021731,0.0,0.0,1.0
9,1.0,0.5,0.170646,0.125,0.000000,0.058694,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...
875,1.0,1.0,0.183212,0.000,0.000000,0.014102,0.0,0.0,0.0
879,1.0,0.0,0.698417,0.000,0.166667,0.162314,0.0,0.0,0.0
880,1.0,0.5,0.308872,0.000,0.166667,0.050749,0.0,0.0,1.0
887,1.0,0.0,0.233476,0.000,0.000000,0.058556,0.0,0.0,1.0


In [49]:
dead # DataFrame containing only the passengers who did not survive; rows with a missing Age here are intended to be dropped

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Sex_male,Embarked_Q,Embarked_S
0,0.0,1.0,0.271174,0.125,0.000000,0.014151,1.0,0.0,1.0
4,0.0,1.0,0.434531,0.000,0.000000,0.015713,1.0,0.0,1.0
5,0.0,1.0,,0.000,0.000000,0.016510,1.0,1.0,0.0
6,0.0,0.0,0.673285,0.000,0.000000,0.101229,1.0,0.0,1.0
7,0.0,1.0,0.019854,0.375,0.166667,0.041136,1.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...
884,0.0,1.0,0.308872,0.000,0.000000,0.013761,1.0,0.0,1.0
885,0.0,1.0,0.484795,0.000,0.833333,0.056848,0.0,1.0,0.0
886,0.0,0.5,0.334004,0.000,0.000000,0.025374,1.0,0.0,1.0
888,0.0,1.0,,0.125,0.333333,0.045771,0.0,0.0,1.0
