### 1. Import Dependencies

In [31]:
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

### 2. Basic Data Processing

In [32]:
# Read the raw Titanic passenger table from disk and preview its first rows.
raw_csv = 'data/raw/Titanic-Dataset.csv'
df = pd.read_csv(raw_csv)
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


- Identifiers = PassengerId
- Free-text columns = Name, Ticket, Cabin

In [33]:
# Remove the identifier and the free-text fields listed above, then preview
# the first ten rows of what is left.
dropped_columns = ['PassengerId', 'Name', 'Ticket', 'Cabin']
df = df.drop(columns=dropped_columns)
df.head(10)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S
5,0,3,male,,0,0,8.4583,Q
6,0,1,male,54.0,0,0,51.8625,S
7,0,3,male,2.0,3,1,21.075,S
8,1,3,female,27.0,0,2,11.1333,S
9,1,2,female,14.0,1,0,30.0708,C


In [34]:
# Count the missing values in every remaining column.
df.isna().sum()

Survived      0
Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Fare          0
Embarked      2
dtype: int64

### 3. Handling Null Values

#### 3.1 Imputation

##### 3.1.1 Age

In [35]:
# Candidate fill values for the missing ages: the median and the mean,
# with the mean rounded to two decimals.
age_column = df['Age']
median = age_column.median()
mean = np.round(age_column.mean(), 2)
mean, median

(np.float64(29.7), 28.0)

In [36]:
# Fill every missing Age with the rounded mean computed in the previous cell.
df['Age'] = df['Age'].fillna(mean)

In [37]:
# Re-count missing values after the Age imputation.
df.isna().sum()

Survived    0
Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Embarked    2
dtype: int64

##### 3.1.2 Embarked

In [38]:
# Most frequent embarkation port; .mode() returns a Series, so take its
# first entry.
mode = df['Embarked'].mode().iloc[0]
mode


'S'

In [39]:
# Fill the missing Embarked entries with the most frequent port.
df['Embarked'] = df['Embarked'].fillna(mode)

In [40]:
# Final missing-value count after both imputations.
df.isna().sum()

Survived    0
Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Embarked    0
dtype: int64

### 4. Encode categorical variables

#### 4.1 One Hot Encoding for nominal variables

In [41]:
# One-hot encode the nominal columns: build one indicator column per
# category, then swap the original text columns for their indicators in a
# single concat (Sex_* first, Embarked_* after).
norminal_variables = ['Sex', 'Embarked']
sex_dummies = pd.get_dummies(df['Sex'], prefix='Sex')
embarked_dummies = pd.get_dummies(df['Embarked'], prefix='Embarked')

df = pd.concat(
    [df.drop(columns=norminal_variables), sex_dummies, embarked_dummies],
    axis=1,
)

df

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,0,3,22.0,1,0,7.2500,False,True,False,False,True
1,1,1,38.0,1,0,71.2833,True,False,True,False,False
2,1,3,26.0,0,0,7.9250,True,False,False,False,True
3,1,1,35.0,1,0,53.1000,True,False,False,False,True
4,0,3,35.0,0,0,8.0500,False,True,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,27.0,0,0,13.0000,False,True,False,False,True
887,1,1,19.0,0,0,30.0000,True,False,False,False,True
888,0,3,29.7,1,2,23.4500,True,False,False,False,True
889,1,1,26.0,0,0,30.0000,False,True,True,False,False


#### 4.2 Label Encoding for Ordinal variables

- Pclass is already stored as integer codes, so it does not need to be encoded again.

### 5. Standardization

In [42]:
# Columns that get standardized in the next cell.
scalling_columns = [
    'Age',
    'SibSp',
    'Parch',
    'Fare',
]

In [43]:
# Z-score all selected columns in one vectorized step: subtract each
# column's mean and divide by its standard deviation (pandas' .std()).
# NOTE(review): the mean and std are computed on the whole frame, before the
# train/test split performed later in this notebook - confirm that is intended.
numeric_block = df[scalling_columns]
df[scalling_columns] = (numeric_block - numeric_block.mean()) / numeric_block.std()

In [44]:
# Preview the first five rows after standardization.
df.head(n=5)

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,0,3,-0.592162,0.43255,-0.473408,-0.502163,False,True,False,False,True
1,1,1,0.638417,0.43255,-0.473408,0.786404,True,False,True,False,False
2,1,3,-0.284517,-0.474279,-0.473408,-0.48858,True,False,False,False,True
3,1,1,0.407683,0.43255,-0.473408,0.420494,True,False,False,False,True
4,0,3,0.407683,-0.474279,-0.473408,-0.486064,False,True,False,False,True


In [45]:
# Persist the fully preprocessed frame without the row index. The output
# folder is created first so the cell also works on a fresh checkout where
# data/processed/ does not exist yet.
processed_path = Path('data/processed/titanicDataset_Final.csv')
processed_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(processed_path, index=False)

### 6. Splitting dataset

In [46]:
# Target vector first, then every remaining column as the feature matrix.
Y = df['Survived']
X = df.drop(columns='Survived')

In [49]:
# Hold out 20% of the rows for testing; the fixed random_state makes the
# split repeatable across runs.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [54]:
# Create the output folder up front so the cell runs on a fresh checkout.
artifacts_dir = Path('artifacts')
artifacts_dir.mkdir(parents=True, exist_ok=True)

# The feature frames mix integer, float and boolean columns, which pandas
# converts to an object-dtype array. np.savez would pickle such an array and
# np.load would then refuse to read it unless allow_pickle=True is passed.
# Casting to float64 keeps the files plain numeric. Arrays are still passed
# positionally, so each file keeps the default 'arr_0' key.
np.savez(artifacts_dir / 'X_train.npz', X_train.to_numpy(dtype=np.float64))
np.savez(artifacts_dir / 'Y_train.npz', Y_train.to_numpy())
np.savez(artifacts_dir / 'X_test.npz', X_test.to_numpy(dtype=np.float64))
np.savez(artifacts_dir / 'Y_test.npz', Y_test.to_numpy())