## Feature Engineering - Titanic dataset

In [57]:
import pandas as pd
import numpy as np

# Read the train and test splits from CSV files in the working directory.
train, test = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

# Print (rows, columns) of each split, one per line.
print(train.shape, test.shape, sep='\n')

(891, 12)
(418, 11)


In [58]:
# Preview the first rows of the training split.
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [59]:
# How often each SibSp value occurs in the training split.
train['SibSp'].value_counts()

0    608
1    209
2     28
4     18
3     16
8      7
5      5
Name: SibSp, dtype: int64

In [60]:
# How often each Parch value occurs in the training split.
train['Parch'].value_counts()

0    678
1    118
2     80
5      5
3      5
4      4
6      1
Name: Parch, dtype: int64

Creating a new feature, FamMembers: the number of family members travelling with each passenger (the sum of the SibSp and Parch columns)

In [61]:
# Family size aboard for each training passenger: SibSp plus Parch, row by row.
train['FamMembers'] = train['SibSp'].add(train['Parch'])

In [62]:
# Family size aboard for each test passenger: SibSp plus Parch, row by row.
test['FamMembers'] = test['SibSp'].add(test['Parch'])

Creating a new feature, LoneTravel: 0 if the passenger was travelling alone (FamMembers is 0), 1 otherwise

In [63]:
# Binary flag derived from FamMembers, computed with a vectorised comparison
# rather than a per-row lambda.
# NOTE: the encoding is 0 = travelling alone (FamMembers == 0) and 1 = at least
# one family member aboard, i.e. the opposite of what the column name suggests.
# The encoding is intentionally left unchanged so the train and test splits
# keep the same meaning.
train['LoneTravel'] = train['FamMembers'].ne(0).astype('int64')

In [64]:
# Binary flag derived from FamMembers, computed with a vectorised comparison
# rather than a per-row lambda.
# NOTE: the encoding is 0 = travelling alone (FamMembers == 0) and 1 = at least
# one family member aboard, i.e. the opposite of what the column name suggests.
# The encoding is intentionally left unchanged so the train and test splits
# keep the same meaning.
test['LoneTravel'] = test['FamMembers'].ne(0).astype('int64')

Filling NaN values in the Cabin column with 'Unknown' (the filled copy is merged back as Cabin_y; the original column is kept as Cabin_x)

In [65]:
# Pull the Cabin column out of the training split as a one-column DataFrame.
cabin = train.loc[:, ['Cabin']]

In [66]:
# Missing cabins become the placeholder label 'Unknown'.
cabin = cabin.fillna('Unknown')

In [67]:
# Keep the old row labels as an 'index' column; it is the join key for the merge back into train.
cabin = cabin.reset_index(drop=False)

In [68]:
# Preview the filled Cabin frame (columns: index, Cabin).
cabin.head()

Unnamed: 0,index,Cabin
0,0,Unknown
1,1,C85
2,2,Unknown
3,3,C123
4,4,Unknown


In [69]:
# Expose the row labels as an 'index' column so it lines up with the 'index'
# column of the Cabin frame and can be used as the join key.
# Guarded so that re-running this cell does not add a second, 'level_0'
# column on top of the one already created.
if 'index' not in train.columns:
    train = train.reset_index()

In [74]:
# train.head()

In [72]:
# Left-join the filled Cabin frame back onto train via the shared 'index' key.
# Both sides carry a 'Cabin' column, so the (default) suffixes are spelled out:
# Cabin_x is the original column, Cabin_y the filled one.
train = train.merge(
    cabin,
    on='index',
    how='left',
    suffixes=('_x', '_y'),
)

In [76]:
# Sanity check after the merge: the row count should be unchanged, and the
# frame now also carries the index, Cabin_x and Cabin_y columns.
train.shape

(891, 16)

In [77]:
# Same Cabin treatment for the test split: isolate the column and fill the gaps
# with 'Unknown'. This rebinds `cabin`, replacing the training version.
cabin = test[['Cabin']].fillna(value='Unknown')

In [78]:
# Keep the old row labels as an 'index' column; it is the join key for the merge back into test.
cabin = cabin.reset_index(drop=False)

In [79]:
# Preview the filled Cabin frame built from the test split (columns: index, Cabin).
cabin.head()

Unnamed: 0,index,Cabin
0,0,Unknown
1,1,Unknown
2,2,Unknown
3,3,Unknown
4,4,Unknown


In [81]:
# Expose the row labels as an 'index' column so it can be used as the join key
# against the Cabin frame.
# Guarded so that re-running this cell does not add a second, 'level_0'
# column on top of the one already created.
if 'index' not in test.columns:
    test = test.reset_index()

In [83]:
# test.head()

In [84]:
# Left-join the filled Cabin frame back onto test via the shared 'index' key.
# Both sides carry a 'Cabin' column, so the (default) suffixes are spelled out:
# Cabin_x is the original column, Cabin_y the filled one.
test = test.merge(
    cabin,
    on='index',
    how='left',
    suffixes=('_x', '_y'),
)

In [86]:
# Shape of the test split after the merge (index, Cabin_x and Cabin_y are now present).
test.shape

(418, 15)

Label encoding categorical features - Name, Sex, Ticket, Embarked and **Cabin** (the filled `Cabin_y` column)

In [87]:
# Take copies of both splits so the encoding steps below leave train and test untouched.
train_lab, test_lab = train.copy(), test.copy()

In [88]:
# Stack train on top of test under a fresh 0..n-1 row index, so the encoders
# see every category that occurs in either split.
all_data = pd.concat([train_lab, test_lab], ignore_index=True)
all_data.shape

(1309, 16)

In [89]:
# Columns to label-encode.
cols = (
    'Name',
    'Sex',
    'Ticket',
    'Embarked',
    'Cabin_y',  # Cabin with missing values filled as 'Unknown'
)

In [90]:
from sklearn.preprocessing import LabelEncoder

# Encode on a copy so all_data keeps its original string columns for now.
label_df = all_data.copy()

# Replace every object-dtype column named in `cols` with integer codes, using
# a fresh LabelEncoder per column fitted on the combined train + test values.
for c in cols:
    if label_df[c].dtype != 'object':
        continue  # only object-dtype columns are encoded
    le = LabelEncoder()
    # astype(str) turns missing values into the literal string 'nan', which
    # then receives its own integer code like any other category.
    as_text = label_df[c].astype(str)
    label_df[c] = le.fit_transform(as_text)

In [91]:
# Preview the encoded frame; the columns listed in `cols` now hold integer codes.
label_df.head()

Unnamed: 0,Age,Cabin_x,Cabin_y,Embarked,FamMembers,Fare,LoneTravel,Name,Parch,PassengerId,Pclass,Sex,SibSp,Survived,Ticket,index
0,22.0,,186,2,1,7.25,1,155,0,1,3,1,1,0.0,720,0
1,38.0,C85,106,0,1,71.2833,1,286,0,2,1,0,1,1.0,816,1
2,26.0,,186,2,0,7.925,0,523,0,3,3,0,0,1.0,914,2
3,35.0,C123,70,2,1,53.1,1,422,0,4,1,0,1,1.0,65,3
4,35.0,,186,2,0,8.05,0,22,0,5,3,1,0,0.0,649,4


In [92]:
# Inspect the column dtypes after label encoding.
label_df.dtypes

Age            float64
Cabin_x         object
Cabin_y          int64
Embarked         int64
FamMembers       int64
Fare           float64
LoneTravel       int64
Name             int64
Parch            int64
PassengerId      int64
Pclass           int64
Sex              int64
SibSp            int64
Survived       float64
Ticket           int64
index            int64
dtype: object

In [93]:
# Rebind all_data to the encoded frame (no copy is made: both names now refer to the same object).
all_data = label_df

Imputing Age with MICE