# Titanic dataset -  “what sorts of people were more likely to survive?” 

## Data Preprocessing

### Import Python libraries

In [1]:
# Importing Python Libraries for preprocessing
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_validate

### Load dataset

In [2]:
# Relative paths of the training and test CSV files
train_file, test_file = "dataset/train.csv", "dataset/test.csv"

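**The raw dataset has 12 columns; after dropping the identifier and free-text columns and one-hot encoding the categorical ones, 9 columns (8 predictors plus the `Survived` target) are used to build our model.**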

In [3]:
# Read the training CSV into a pandas DataFrame and show summary
# statistics of its numeric columns
train_data = pd.read_csv(filepath_or_buffer=train_file)
train_data.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [4]:
# Read the test CSV into a pandas DataFrame and show summary
# statistics of its numeric columns
test_data = pd.read_csv(filepath_or_buffer=test_file)
test_data.describe()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare
count,418.0,418.0,332.0,418.0,418.0,417.0
mean,1100.5,2.26555,30.27259,0.447368,0.392344,35.627188
std,120.810458,0.841838,14.181209,0.89676,0.981429,55.907576
min,892.0,1.0,0.17,0.0,0.0,0.0
25%,996.25,1.0,21.0,0.0,0.0,7.8958
50%,1100.5,3.0,27.0,0.0,0.0,14.4542
75%,1204.75,3.0,39.0,1.0,0.0,31.5
max,1309.0,3.0,76.0,8.0,9.0,512.3292


### Data Transformation

In [5]:
# One-hot encode the Sex and Embarked features.
# drop_first=True removes the first level of each feature, so that level is
# represented by all of the feature's dummy columns being zero.
# NOTE(review): get_dummies does not create a column for missing values by
# default, so a row with a missing category is also encoded as all zeros --
# confirm that this is acceptable for Embarked.
cat_variables = train_data.loc[:, ['Sex', 'Embarked']]
cat_dummies = pd.get_dummies(data=cat_variables, drop_first=True)
cat_dummies.head()  # preview of the encoded features stored in cat_dummies

Unnamed: 0,Sex_male,Embarked_Q,Embarked_S
0,1,0,1
1,0,0,0
2,0,0,1
3,0,0,1
4,1,0,1


In [6]:
# Columns removed before modelling: Sex and Embarked are replaced by their
# one-hot encoded versions in cat_dummies; the others are treated as less
# relevant features.
dropped_columns = ['Sex', 'Embarked', 'Ticket', 'Cabin', 'Name', 'PassengerId']

# Build the numeric dataset without overwriting train_data. Rebinding
# train_data to its reduced version made this cell (and the encoding cell
# above, which reads train_data['Sex'] / ['Embarked']) fail with a KeyError
# when re-run.
num_data = pd.concat([train_data.drop(columns=dropped_columns), cat_dummies], axis=1)
num_data.head()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Sex_male,Embarked_Q,Embarked_S
0,0,3,22.0,1,0,7.25,1,0,1
1,1,1,38.0,1,0,71.2833,0,0,0
2,1,3,26.0,0,0,7.925,0,0,1
3,1,1,35.0,1,0,53.1,0,0,1
4,0,3,35.0,0,0,8.05,1,0,1


**New dataset containing only numerical values**

### Check for class imbalance

In [7]:
# Non-null counts of every column per outcome (0 = did not survive, 1 = survived)
num_data.groupby("Survived").count()

Unnamed: 0_level_0,Pclass,Age,SibSp,Parch,Fare,Sex_male,Embarked_Q,Embarked_S
Survived,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,549,424,549,549,549,549,549,549
1,342,290,342,342,342,342,342,342


**The data set contains about 62% passengers who died in the sinking and about 38% who survived, showing a class imbalance.**

**By dropping the instances where the individual didn't survive and is missing their age, while imputing the age for instances where the individual survived but is missing it, we can reduce the class imbalance, leaving 44.6% of instances where the person survived and 55.4% where the person didn't survive.**

### Handling Missing Data

In [8]:
# Number of missing values in each column
num_data.isna().sum()

Survived        0
Pclass          0
Age           177
SibSp           0
Parch           0
Fare            0
Sex_male        0
Embarked_Q      0
Embarked_S      0
dtype: int64

### First, split dataset

<b>We split the dataset into train and test sets before performing scaling and imputation because we don't want information from the test set to leak into the training set.</b>

In [9]:
# Dependent variable, kept as a one-column DataFrame
y = num_data[['Survived']]
# Independent variables: every remaining column
X = num_data.drop(columns=['Survived'])

In [10]:
# Hold out 30% of the rows for testing; stratifying on y keeps the class
# proportions the same in both parts, and random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=2,
)

#### Scaling the dataset

In [11]:
from sklearn.preprocessing import MinMaxScaler

In [12]:
# Put the label column back next to the features of each split.
# NOTE: this rebinds train_data / test_data, which earlier held the contents
# of the two CSV files.
train_data = pd.concat((X_train, y_train), axis="columns")
test_data = pd.concat((X_test, y_test), axis="columns")

**DataFrame before scaling**

In [13]:
# Summary statistics of the training split before scaling, for comparison
# with the scaled version
train_data.describe()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Sex_male,Embarked_Q,Embarked_S,Survived
count,623.0,501.0,623.0,623.0,623.0,623.0,623.0,623.0,623.0
mean,2.29695,30.089501,0.499197,0.390048,33.290094,0.659711,0.089888,0.70305,0.383628
std,0.843039,14.873609,1.031458,0.852666,53.379025,0.474187,0.286251,0.457282,0.48666
min,1.0,0.42,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1.5,20.0,0.0,0.0,7.8958,0.0,0.0,0.0,0.0
50%,3.0,28.0,0.0,0.0,14.5,1.0,0.0,1.0,0.0
75%,3.0,39.0,1.0,0.0,31.275,1.0,0.0,1.0,1.0
max,3.0,80.0,8.0,6.0,512.3292,1.0,1.0,1.0,1.0


**DataFrame after scaling**

In [21]:
# Fit the scaler on the training split only and re-use the learned min/max for
# the test split. Calling fit_transform on the test split would re-fit the
# scaler on test data, leaking test-set statistics and putting the two splits
# on different scales.
scaler = MinMaxScaler()
# The scaler returns plain arrays, so both frames are rebuilt with the original
# column names and a fresh 0..n-1 index.
scaled_train = pd.DataFrame(scaler.fit_transform(train_data), columns=train_data.columns)
scaled_test = pd.DataFrame(scaler.transform(test_data), columns=test_data.columns)
scaled_train.describe()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Sex_male,Embarked_Q,Embarked_S,Survived
count,623.0,501.0,623.0,623.0,623.0,623.0,623.0,623.0,623.0
mean,0.648475,0.372826,0.0624,0.065008,0.064978,0.659711,0.089888,0.70305,0.383628
std,0.421519,0.186901,0.128932,0.142111,0.104189,0.474187,0.286251,0.457282,0.48666
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.25,0.246042,0.0,0.0,0.015412,0.0,0.0,0.0,0.0
50%,1.0,0.346569,0.0,0.0,0.028302,1.0,0.0,1.0,0.0
75%,1.0,0.484795,0.125,0.0,0.061045,1.0,0.0,1.0,1.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [22]:
# Partition the scaled training rows by outcome: survivors (1) and non-survivors (0)
survived_train = scaled_train[scaled_train['Survived'].eq(1)]
dead_train = scaled_train[scaled_train['Survived'].eq(0)]

In [23]:
# Partition the scaled test rows by outcome: survivors (1) and non-survivors (0)
survived_test = scaled_test[scaled_test['Survived'].eq(1)]
dead_test = scaled_test[scaled_test['Survived'].eq(0)]

In [24]:
from sklearn.impute import KNNImputer

### Handling missing values in Train Set

In [26]:
# Fill the missing Age values of the surviving passengers with a KNN imputer
# (default settings). fit_transform returns a plain array, so the result is
# wrapped back into a DataFrame with the original column names.
imputer = KNNImputer()
survived_train_imputed = imputer.fit_transform(survived_train)
new_survived_train = pd.DataFrame(data=survived_train_imputed, columns=survived_train.columns)

In [27]:
# Discard every non-survivor row that still has a missing value (the missing Age entries)
new_dead_train = dead_train.dropna(axis="index", how="any")

In [28]:
# Stack the imputed survivor rows on top of the complete non-survivor rows.
# new_survived_train carries a fresh 0..k-1 index while new_dead_train keeps
# its labels from scaled_train, so a plain concat would produce duplicate
# index labels; ignore_index=True gives the result a clean, unique index.
combined_train = pd.concat([new_survived_train, new_dead_train], axis=0, ignore_index=True)
combined_train

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Sex_male,Embarked_Q,Embarked_S,Survived
0,0.0,0.635587,0.125,0.000000,0.152164,0.0,0.0,1.0,1.0
1,0.5,0.019854,0.125,0.166667,0.050749,1.0,0.0,1.0,1.0
2,0.0,0.484795,0.125,0.166667,0.216430,0.0,0.0,0.0,1.0
3,1.0,0.208344,0.500,0.333333,0.015469,0.0,0.0,1.0,1.0
4,0.0,0.447097,0.125,0.333333,0.234224,1.0,0.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...
615,1.0,0.509927,0.250,0.000000,0.027538,1.0,0.0,1.0,0.0
616,1.0,0.359135,0.125,0.000000,0.013752,1.0,0.0,1.0,0.0
617,1.0,0.434531,0.000,0.000000,0.013761,1.0,0.0,1.0,0.0
618,1.0,0.007288,0.500,0.166667,0.077465,1.0,0.0,1.0,0.0


In [29]:
# Confirm that no missing values remain in the combined training set
combined_train.isna().sum()

Pclass        0
Age           0
SibSp         0
Parch         0
Fare          0
Sex_male      0
Embarked_Q    0
Embarked_S    0
Survived      0
dtype: int64

### Handling missing values in Test Set

In [30]:
# Fill the missing values of the surviving test passengers with the imputer
# that was already fitted on the training survivors. Using transform (not
# fit_transform) keeps the test split out of the fitting step. The column
# names are taken from the frame that is actually being transformed.
new_survived_test = pd.DataFrame(imputer.transform(survived_test), columns=survived_test.columns)

In [31]:
# Drop the non-survivor test rows that still have a missing value
new_dead_test = dead_test.dropna()
# Stack the survivor and non-survivor rows. new_survived_test carries a fresh
# 0..k-1 index while new_dead_test keeps its labels from scaled_test, so
# ignore_index=True is used to avoid duplicate index labels in the result.
combined_test = pd.concat([new_survived_test, new_dead_test], axis=0, ignore_index=True)

In [32]:
# Confirm that no missing values remain in the combined test set
combined_test.isna().sum()

Pclass        0
Age           0
SibSp         0
Parch         0
Fare          0
Sex_male      0
Embarked_Q    0
Embarked_S    0
Survived      0
dtype: int64

In [33]:
# Separate the label from the predictors again, now on the cleaned splits.
# The labels are kept as one-column DataFrames.
y_train = combined_train[['Survived']]
X_train = combined_train.drop(columns=['Survived'])

y_test = combined_test[['Survived']]
X_test = combined_test.drop(columns=['Survived'])