## Feature Extraction
Extract features from raw data.

We start by deriving new features from the variables that already exist in the dataset.


In [38]:
import numpy as np
import pandas as pd

## Display settings: no cap on printed columns or rows, floats shown with
## three decimals, and a 500-character wide console representation.
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)
pd.set_option("display.float_format", "{:.3f}".format)
pd.set_option("display.width", 500)


def load(path: str = "datasets/titanic.csv") -> pd.DataFrame:
    """Read a CSV file into a DataFrame.

    Parameters
    ----------
    path : str, optional
        Location of the CSV file. Defaults to "datasets/titanic.csv", so
        calling ``load()`` with no arguments behaves as before.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the file, using ``pd.read_csv`` defaults.
    """
    return pd.read_csv(path)


## Working DataFrame for the cells below, read from the CSV via load()
df = load()

In [39]:
## Binary features
## NEW_CABIN_FLAG is 1 when Cabin is NaN and 0 when a cabin value is present
df["NEW_CABIN_FLAG"] = df["Cabin"].isna().astype(int)

## mean of Survived for each flag value
df.groupby("NEW_CABIN_FLAG")[["Survived"]].mean()
## interestingly, rows with missing cabin info show a lower survival rate

Unnamed: 0_level_0,Survived
NEW_CABIN_FLAG,Unnamed: 1_level_1
0,0.667
1,0.3


In [40]:
## IS_ALONE: "NO" when SibSp + Parch is positive, "YES" when it is exactly zero.
## Rows matching neither mask (e.g. a NaN sum) are left unset.
sibsp_plus_parch = df["SibSp"] + df["Parch"]
df.loc[sibsp_plus_parch.gt(0), "IS_ALONE"] = "NO"
df.loc[sibsp_plus_parch.eq(0), "IS_ALONE"] = "YES"

## mean of Survived for each IS_ALONE value
df.groupby("IS_ALONE")[["Survived"]].mean()

Unnamed: 0_level_0,Survived
IS_ALONE,Unnamed: 1_level_1
NO,0.506
YES,0.304


In [41]:
## Text based features

## character count (missing names stay NaN)
df["NEW_NAME_COUNT"] = df["Name"].str.len()

## word count
## split() without an argument splits on any run of whitespace, so repeated
## spaces do not produce empty tokens that would inflate the count
df["NEW_NAME_WORD_COUNT"] = df["Name"].apply(lambda name: len(str(name).split()))

## number of whitespace-separated tokens in the name that start with "Dr"
## str(name) mirrors the word count above, so a missing name gives 0 instead
## of raising AttributeError on a float NaN
## NOTE(review): this is a prefix match, so any token beginning with "Dr" is
## counted, not only the title "Dr." -- confirm that this is intended
df["NEW_NAME_DR"] = df["Name"].apply(
    lambda name: sum(token.startswith("Dr") for token in str(name).split())
)

## survival rate and group size for each NEW_NAME_DR value
df.groupby("NEW_NAME_DR").agg({"Survived": ["mean", "count"]})

Unnamed: 0_level_0,Survived,Survived
Unnamed: 0_level_1,mean,count
NEW_NAME_DR,Unnamed: 1_level_2,Unnamed: 2_level_2
0,0.383,881
1,0.5,10


In [42]:
##Regex

##extract title from name: the run of ASCII letters that follows a space and
##is immediately followed by a literal period
##raw string: `\.` is not a valid Python string escape, so a plain literal
##triggers an invalid-escape warning; the compiled pattern is unchanged
df["NEW_NAME_TITLE"] = df["Name"].str.extract(r' ([A-Za-z]+)\.', expand=False)

##frequency of each extracted title
df["NEW_NAME_TITLE"].value_counts()


NEW_NAME_TITLE
Mr          517
Miss        182
Mrs         125
Master       40
Dr            7
Rev           6
Mlle          2
Major         2
Col           2
Countess      1
Capt          1
Ms            1
Sir           1
Lady          1
Mme           1
Don           1
Jonkheer      1
Name: count, dtype: int64

In [43]:
## Per extracted title: mean and non-null count of Survived and of Age
(
    df.groupby("NEW_NAME_TITLE")
    .agg({"Survived": ["mean", "count"], "Age": ["mean", "count"]})
)

Unnamed: 0_level_0,Survived,Survived,Age,Age
Unnamed: 0_level_1,mean,count,mean,count
NEW_NAME_TITLE,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Capt,0.0,1,70.0,1
Col,0.5,2,58.0,2
Countess,1.0,1,33.0,1
Don,0.0,1,40.0,1
Dr,0.429,7,42.0,6
Jonkheer,0.0,1,38.0,1
Lady,1.0,1,48.0,1
Major,0.5,2,48.5,2
Master,0.575,40,4.574,36
Miss,0.698,182,21.774,146
