In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

We have cleaned the dataset in another notebook. You can see the details [here](https://github.com/galiph/ExploratoryDataAnalysis/blob/master/01_Titanic_EDA.ipynb). 

In [21]:
titanic_df = pd.read_csv('./data/titanic_train.csv')

#Converting Sex
sex_conv = lambda x : 1 if x=='male' else 0 
titanic_df['Is_male'] = titanic_df['Sex'].apply(sex_conv)

# Adding Family column
titanic_df['FamilySize'] = titanic_df['SibSp'] + titanic_df['Parch']
titanic_df.drop('SibSp',axis=1,inplace=True)
titanic_df.drop('Parch',axis=1,inplace=True)

# Adding "Title" column
titanic_df['Title']=0
for i in titanic_df:
    titanic_df['Title']=titanic_df['Name'].str.extract('([A-Za-z]+)\.', expand=False)  
    
# Imputing Age  
titanic_df['Age'] = titanic_df.groupby('Title')['Age'].transform(lambda x: x.fillna(np.round(x.mean())))

# Map of aggregated titles:
titles_dict = {'Capt': 'Other','Major': 'Other', 'Jonkheer': 'Other','Don': 'Other',
               'Sir': 'Other','Dr': 'Other','Rev': 'Other','Countess': 'Other',
               'Dona': 'Other','Mme': 'Mrs','Mlle': 'Miss','Ms': 'Miss',
               'Mr': 'Mr','Mrs': 'Mrs','Miss': 'Miss', 'Master': 'Master',
               'Lady': 'Other'}
titanic_df['Title'] = titanic_df['Title'].map(titles_dict)

# Dropping some columns
titanic_df.drop(['Ticket', 'Cabin', "Name"], axis=1, inplace = True)

# Converting Age
titanic_df['Age'] =pd.cut(titanic_df['Age'], bins=[1, 12, 50, 200], labels=['Child','Adult','Elder'])

In [23]:
titanic_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,Fare,Embarked,Is_male,FamilySize,Title
0,1,0,3,male,Adult,7.25,S,1,1,Mr
1,2,1,1,female,Adult,71.2833,C,0,1,Mrs
2,3,1,3,female,Adult,7.925,S,0,0,Miss
3,4,1,1,female,Adult,53.1,S,0,1,Mrs
4,5,0,3,male,Adult,8.05,S,1,0,Mr


We don't need "PassengerId" feature for now. So let'drop it.

In [24]:
# drop PassengerId
titanic_df.drop('PassengerId', axis=1, inplace=True)

In [25]:
titanic_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 9 columns):
Survived      891 non-null int64
Pclass        891 non-null int64
Sex           891 non-null object
Age           877 non-null category
Fare          891 non-null float64
Embarked      889 non-null object
Is_male       891 non-null int64
FamilySize    891 non-null int64
Title         889 non-null object
dtypes: category(1), float64(1), int64(4), object(3)
memory usage: 56.7+ KB


Let's convert the features to categoricel except "Fare" and "Family"

In [26]:
titanic_df.Pclass = pd.Categorical(titanic_df.Pclass)
titanic_df.Embarked = pd.Categorical(titanic_df.Embarked)
titanic_df.Title= pd.Categorical(titanic_df.Title)

Now transforming categorical feature to dummy variables

In [27]:
titanic_df = pd.get_dummies(titanic_df, drop_first=1)  
titanic_df.head()

Unnamed: 0,Survived,Fare,Is_male,FamilySize,Pclass_2,Pclass_3,Sex_male,Age_Adult,Age_Elder,Embarked_Q,Embarked_S,Title_Miss,Title_Mr,Title_Mrs,Title_Other
0,0,7.25,1,1,0,1,1,1,0,0,1,0,1,0,0
1,1,71.2833,0,1,0,0,0,1,0,0,0,0,0,1,0
2,1,7.925,0,0,0,1,0,1,0,0,1,1,0,0,0
3,1,53.1,0,1,0,0,0,1,0,0,1,0,0,1,0
4,0,8.05,1,0,0,1,1,1,0,0,1,0,1,0,0


In [33]:
# Get training and test sets
from sklearn.model_selection import train_test_split
X = titanic_df.drop(["Survived"], axis = 1)
y = titanic_df.Survived
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=40)

Let' have Box-Cox transformation

In [35]:
from scipy.stats import boxcox
X_train_boxcox = X_train.copy()
X_train_boxcox['Fare'] = boxcox(X_train_boxcox['Fare'] + 1)[0]
X_test_boxcox = X_test.copy()
X_test_boxcox['Fare'] = boxcox(X_test_boxcox['Fare'] + 1)[0]

In [36]:
X.head()

Unnamed: 0,Fare,Is_male,FamilySize,Pclass_2,Pclass_3,Sex_male,Age_Adult,Age_Elder,Embarked_Q,Embarked_S,Title_Miss,Title_Mr,Title_Mrs,Title_Other
0,7.25,1,1,0,1,1,1,0,0,1,0,1,0,0
1,71.2833,0,1,0,0,0,1,0,0,0,0,0,1,0
2,7.925,0,0,0,1,0,1,0,0,1,1,0,0,0
3,53.1,0,1,0,0,0,1,0,0,1,0,0,1,0
4,8.05,1,0,0,1,1,1,0,0,1,0,1,0,0


In [38]:
X_train_boxcox.head()

Unnamed: 0,Fare,Is_male,FamilySize,Pclass_2,Pclass_3,Sex_male,Age_Adult,Age_Elder,Embarked_Q,Embarked_S,Title_Miss,Title_Mr,Title_Mrs,Title_Other
661,1.92822,1,0,0,1,1,1,0,0,0,0,1,0,0
393,3.895994,0,1,0,0,0,1,0,0,0,1,0,0,0
266,3.176069,1,5,0,1,1,1,0,0,1,0,1,0,0
144,2.271567,1,0,1,0,1,1,0,0,1,0,1,0,0
446,2.661845,0,1,1,0,0,1,0,0,1,1,0,0,0


Polynomial Expansion

In [40]:
# Rescale data
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
X_train_boxcox_scaled = scaler.fit_transform(X_train_boxcox)
X_test_boxcox_scaled = scaler.transform(X_test_boxcox)

In [45]:
# Get polynomial features
from sklearn.preprocessing import PolynomialFeatures

poly = PolynomialFeatures(degree=2).fit(X_train_transformed)
X_train_poly = poly.transform(X_train_boxcox_scaled)
X_test_poly = poly.transform(X_test_boxcox_scaled)