# Titanic FeatureTools version II (1)

Same as Titanic I: https://github.com/dongzhang84/Featuretools/blob/main/Titanic_Featuretools.ipynb

But featuretools only for the training dataset, for automation consideration

Two follow-up questions:

- How to save features for future feature engineering
- What if new coming testing categorical data not recognized

In [105]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
%matplotlib inline

import featuretools as ft
from featuretools.primitives import *
from featuretools.variable_types import Numeric

from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.linear_model import LinearRegression, LogisticRegression, LassoCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import explained_variance_score
from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectFromModel, SelectKBest, RFE, chi2

# model tuning
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe, space_eval

import warnings
warnings.filterwarnings("ignore")

In [148]:
traindf = pd.read_csv('titanic/train.csv').set_index('PassengerId')
testdf = pd.read_csv('titanic/test.csv').set_index('PassengerId')

In [149]:
traindf

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Thayer)",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...
887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [150]:
#Thanks to:
# https://www.kaggle.com/mauricef/titanic
# https://www.kaggle.com/vbmokin/titanic-top-3-one-line-of-the-prediction-code
#
df = pd.concat([traindf, testdf], axis=0, sort=False)
df['Title'] = df.Name.str.split(',').str[1].str.split('.').str[0].str.strip()
df['Title'] = df.Name.str.split(',').str[1].str.split('.').str[0].str.strip()
df['IsWomanOrBoy'] = ((df.Title == 'Master') | (df.Sex == 'female'))
df['LastName'] = df.Name.str.split(',').str[0]
family = df.groupby(df.LastName).Survived
df['WomanOrBoyCount'] = family.transform(lambda s: s[df.IsWomanOrBoy].fillna(0).count())
#df['WomanOrBoyCount'] = df.mask(df.IsWomanOrBoy, df.WomanOrBoyCount - 1, axis=0)
df['FamilySurvivedCount'] = family.transform(lambda s: s[df.IsWomanOrBoy].fillna(0).sum())
#df['FamilySurvivedCount'] = df.mask(df.IsWomanOrBoy, df.FamilySurvivedCount - \
                                    #df.Survived.fillna(0), axis=0)
df['WomanOrBoySurvived'] = df.FamilySurvivedCount / df.WomanOrBoyCount.replace(0, np.nan)
df.WomanOrBoyCount = df.WomanOrBoyCount.replace(np.nan, 0)
df['Alone'] = (df.WomanOrBoyCount == 0)

#Thanks to https://www.kaggle.com/kpacocha/top-6-titanic-machine-learning-from-disaster
#"Title" improvement
df['Title'] = df['Title'].replace('Ms','Miss')
df['Title'] = df['Title'].replace('Mlle','Miss')
df['Title'] = df['Title'].replace('Mme','Mrs')
# Embarked
df['Embarked'] = df['Embarked'].fillna('S')
# Cabin, Deck
df['Deck'] = df['Cabin'].apply(lambda s: s[0] if pd.notnull(s) else 'M')
df.loc[(df['Deck'] == 'T'), 'Deck'] = 'A'

# Thanks to https://www.kaggle.com/erinsweet/simpledetect
# Fare
med_fare = df.groupby(['Pclass', 'Parch', 'SibSp']).Fare.median()[3][0][0]
df['Fare'] = df['Fare'].fillna(med_fare)
#Age
df['Age'] = df.groupby(['Sex', 'Pclass', 'Title'])['Age'].apply(lambda x: x.fillna(x.median()))
# Family_Size
df['Family_Size'] = df['SibSp'] + df['Parch'] + 1

# Thanks to https://www.kaggle.com/vbmokin/titanic-top-3-cluster-analysis
cols_to_drop = ['Name','Ticket','Cabin']
df = df.drop(cols_to_drop, axis=1)

df.WomanOrBoySurvived = df.WomanOrBoySurvived.fillna(0)
df.WomanOrBoyCount = df.WomanOrBoyCount.fillna(0)
df.FamilySurvivedCount = df.FamilySurvivedCount.fillna(0)
df.Alone = df.Alone.fillna(0)
df.Alone = df.Alone*1

In [151]:
target = df.Survived.loc[traindf.index]
df = df.drop(['SibSp','Parch','IsWomanOrBoy','WomanOrBoyCount','FamilySurvivedCount','WomanOrBoySurvived','Alone'], axis=1)
df['PassengerId'] = df.index
df.head()

Unnamed: 0_level_0,Survived,Pclass,Sex,Age,Fare,Embarked,Title,LastName,Deck,Family_Size,PassengerId
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0.0,3,male,22.0,7.25,S,Mr,Braund,M,2,1
2,1.0,1,female,38.0,71.2833,C,Mrs,Cumings,C,2,2
3,1.0,3,female,26.0,7.925,S,Miss,Heikkinen,M,1,3
4,1.0,1,female,35.0,53.1,S,Mrs,Futrelle,C,2,4
5,0.0,3,male,35.0,8.05,S,Mr,Allen,M,1,5


In [152]:
traindf = df.loc[traindf.index].drop(['Survived'],axis=1)
traindf

Unnamed: 0_level_0,Pclass,Sex,Age,Fare,Embarked,Title,LastName,Deck,Family_Size,PassengerId
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,3,male,22.0,7.2500,S,Mr,Braund,M,2,1
2,1,female,38.0,71.2833,C,Mrs,Cumings,C,2,2
3,3,female,26.0,7.9250,S,Miss,Heikkinen,M,1,3
4,1,female,35.0,53.1000,S,Mrs,Futrelle,C,2,4
5,3,male,35.0,8.0500,S,Mr,Allen,M,1,5
...,...,...,...,...,...,...,...,...,...,...
887,2,male,27.0,13.0000,S,Rev,Montvila,M,1,887
888,1,female,19.0,30.0000,S,Miss,Graham,B,1,888
889,3,female,18.0,23.4500,S,Miss,Johnston,M,4,889
890,1,male,26.0,30.0000,C,Mr,Behr,C,1,890


In [153]:
testdf = df.loc[testdf.index].drop(['Survived'],axis=1)
testdf

Unnamed: 0_level_0,Pclass,Sex,Age,Fare,Embarked,Title,LastName,Deck,Family_Size,PassengerId
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
892,3,male,34.5,7.8292,Q,Mr,Kelly,M,1,892
893,3,female,47.0,7.0000,S,Mrs,Wilkes,M,2,893
894,2,male,62.0,9.6875,Q,Mr,Myles,M,1,894
895,3,male,27.0,8.6625,S,Mr,Wirz,M,1,895
896,3,female,22.0,12.2875,S,Mrs,Hirvonen,M,3,896
...,...,...,...,...,...,...,...,...,...,...
1305,3,male,26.0,8.0500,S,Mr,Spector,M,1,1305
1306,1,female,39.0,108.9000,C,Dona,Oliva y Ocana,C,1,1306
1307,3,male,38.5,7.2500,S,Mr,Saether,M,1,1307
1308,3,male,26.0,8.0500,S,Mr,Ware,M,1,1308


In [154]:
es = ft.EntitySet(id = 'titanic_data')
es = es.entity_from_dataframe(entity_id = 'df', dataframe = traindf, 
                              variable_types = 
                              {
                                  'Embarked': ft.variable_types.Categorical,
                                  'Sex': ft.variable_types.Boolean,
                                  'Title': ft.variable_types.Categorical,
                                  'Family_Size': ft.variable_types.Numeric,
                                  'LastName': ft.variable_types.Categorical
                              },
                              index = 'PassengerId')

In [155]:
es['df']

Entity: df
  Variables:
    PassengerId (dtype: index)
    Pclass (dtype: numeric)
    Age (dtype: numeric)
    Fare (dtype: numeric)
    Deck (dtype: categorical)
    Embarked (dtype: categorical)
    Sex (dtype: boolean)
    Title (dtype: categorical)
    Family_Size (dtype: numeric)
    LastName (dtype: categorical)
  Shape:
    (Rows: 891, Columns: 10)

In [156]:
primitives = ft.list_primitives()
pd.options.display.max_colwidth = 500
primitives[primitives['type'] == 'aggregation'].head(primitives[primitives['type'] == 'aggregation'].shape[0])

Unnamed: 0,name,type,dask_compatible,koalas_compatible,description,valid_inputs,return_type
0,all,aggregation,True,False,Calculates if all values are 'True' in a list.,Boolean,Boolean
1,min,aggregation,True,True,"Calculates the smallest value, ignoring `NaN` values.",Numeric,Numeric
2,last,aggregation,False,False,Determines the last value in a list.,Variable,
3,mean,aggregation,True,True,Computes the average for a list of values.,Numeric,Numeric
4,percent_true,aggregation,True,False,Determines the percent of `True` values.,Boolean,Numeric
5,entropy,aggregation,False,False,Calculates the entropy for a categorical variable,Categorical,Numeric
6,std,aggregation,True,True,"Computes the dispersion relative to the mean value, ignoring `NaN`.",Numeric,Numeric
7,median,aggregation,False,False,Determines the middlemost number in a list of values.,Numeric,Numeric
8,avg_time_between,aggregation,False,False,Computes the average number of seconds between consecutive events.,DatetimeTimeIndex,Numeric
9,sum,aggregation,True,True,"Calculates the total addition, ignoring `NaN`.",Numeric,Numeric


In [157]:
es = es.normalize_entity(base_entity_id='df', new_entity_id='Pclass', index='Pclass')
es = es.normalize_entity(base_entity_id='df', new_entity_id='Sex', index='Sex')
es = es.normalize_entity(base_entity_id='df', new_entity_id='Age', index='Age')
es = es.normalize_entity(base_entity_id='df', new_entity_id='Fare', index='Fare')
es = es.normalize_entity(base_entity_id='df', new_entity_id='Embarked', index='Embarked')
es = es.normalize_entity(base_entity_id='df', new_entity_id='Title', index='Title')
es = es.normalize_entity(base_entity_id='df', new_entity_id='LastName', index='LastName')
es = es.normalize_entity(base_entity_id='df', new_entity_id='Deck', index='Deck')
es = es.normalize_entity(base_entity_id='df', new_entity_id='Family_Size', index='Family_Size')
es = es.normalize_entity(base_entity_id='df', new_entity_id='Title_Sex', index='Sex')
es = es.normalize_entity(base_entity_id='df', new_entity_id='Sex_LastName', index='LastName')
es = es.normalize_entity(base_entity_id='df', new_entity_id='Title_LastName', index='LastName')
es

es

Entityset: titanic_data
  Entities:
    df [Rows: 891, Columns: 10]
    Pclass [Rows: 3, Columns: 1]
    Sex [Rows: 2, Columns: 1]
    Age [Rows: 89, Columns: 1]
    Fare [Rows: 248, Columns: 1]
    Embarked [Rows: 3, Columns: 1]
    Title [Rows: 14, Columns: 1]
    LastName [Rows: 667, Columns: 1]
    Deck [Rows: 8, Columns: 1]
    Family_Size [Rows: 9, Columns: 1]
    Title_Sex [Rows: 2, Columns: 1]
    Sex_LastName [Rows: 667, Columns: 1]
    Title_LastName [Rows: 667, Columns: 1]
  Relationships:
    df.Pclass -> Pclass.Pclass
    df.Sex -> Sex.Sex
    df.Age -> Age.Age
    df.Fare -> Fare.Fare
    df.Embarked -> Embarked.Embarked
    df.Title -> Title.Title
    df.LastName -> LastName.LastName
    df.Deck -> Deck.Deck
    df.Family_Size -> Family_Size.Family_Size
    df.Sex -> Title_Sex.Sex
    df.LastName -> Sex_LastName.LastName
    df.LastName -> Title_LastName.LastName

In [158]:
features, feature_names = ft.dfs(entityset = es, 
                                 target_entity = 'df', 
                                 max_depth = 2)
len(feature_names)

213

In [159]:
features

Unnamed: 0_level_0,Pclass,Age,Fare,Deck,Embarked,Sex,Title,Family_Size,LastName,Pclass.COUNT(df),...,Title_LastName.MODE(df.Sex),Title_LastName.MODE(df.Title),Title_LastName.NUM_UNIQUE(df.Age),Title_LastName.NUM_UNIQUE(df.Deck),Title_LastName.NUM_UNIQUE(df.Embarked),Title_LastName.NUM_UNIQUE(df.Family_Size),Title_LastName.NUM_UNIQUE(df.Fare),Title_LastName.NUM_UNIQUE(df.Pclass),Title_LastName.NUM_UNIQUE(df.Sex),Title_LastName.NUM_UNIQUE(df.Title)
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,3,22.0,7.2500,M,S,male,Mr,2,Braund,491,...,male,Mr,2,1,1,1,2,1,1,1
2,1,38.0,71.2833,C,C,female,Mrs,2,Cumings,216,...,female,Mrs,1,1,1,1,1,1,1,1
3,3,26.0,7.9250,M,S,female,Miss,1,Heikkinen,491,...,female,Miss,1,1,1,1,1,1,1,1
4,1,35.0,53.1000,C,S,female,Mrs,2,Futrelle,216,...,female,Mr,2,1,1,1,1,1,2,2
5,3,35.0,8.0500,M,S,male,Mr,1,Allen,491,...,female,Miss,2,2,1,1,2,2,2,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
887,2,27.0,13.0000,M,S,male,Rev,1,Montvila,184,...,male,Rev,1,1,1,1,1,1,1,1
888,1,19.0,30.0000,B,S,female,Miss,1,Graham,216,...,female,Miss,3,2,1,2,2,1,2,3
889,3,18.0,23.4500,M,S,female,Miss,4,Johnston,491,...,female,Miss,2,1,1,1,1,1,2,2
890,1,26.0,30.0000,C,C,male,Mr,1,Behr,216,...,male,Mr,1,1,1,1,1,1,1,1


In [160]:
feature_names

[<Feature: Pclass>,
 <Feature: Age>,
 <Feature: Fare>,
 <Feature: Deck>,
 <Feature: Embarked>,
 <Feature: Sex>,
 <Feature: Title>,
 <Feature: Family_Size>,
 <Feature: LastName>,
 <Feature: Pclass.COUNT(df)>,
 <Feature: Pclass.MODE(df.Age)>,
 <Feature: Pclass.MODE(df.Deck)>,
 <Feature: Pclass.MODE(df.Embarked)>,
 <Feature: Pclass.MODE(df.Family_Size)>,
 <Feature: Pclass.MODE(df.Fare)>,
 <Feature: Pclass.MODE(df.LastName)>,
 <Feature: Pclass.MODE(df.Sex)>,
 <Feature: Pclass.MODE(df.Title)>,
 <Feature: Pclass.NUM_UNIQUE(df.Age)>,
 <Feature: Pclass.NUM_UNIQUE(df.Deck)>,
 <Feature: Pclass.NUM_UNIQUE(df.Embarked)>,
 <Feature: Pclass.NUM_UNIQUE(df.Family_Size)>,
 <Feature: Pclass.NUM_UNIQUE(df.Fare)>,
 <Feature: Pclass.NUM_UNIQUE(df.LastName)>,
 <Feature: Pclass.NUM_UNIQUE(df.Sex)>,
 <Feature: Pclass.NUM_UNIQUE(df.Title)>,
 <Feature: Sex.COUNT(df)>,
 <Feature: Sex.MODE(df.Age)>,
 <Feature: Sex.MODE(df.Deck)>,
 <Feature: Sex.MODE(df.Embarked)>,
 <Feature: Sex.MODE(df.Family_Size)>,
 <Feature: 

## Encoding categorical features

In [161]:
# Determination categorical features
numerics = ['int8', 'int16', 'int32', 'int64', 'float16', 'float32', 'float64']
categorical_columns = []
cols = features.columns.values.tolist()
for col in cols:
    if features[col].dtype in numerics: continue
    categorical_columns.append(col)
categorical_columns

['Deck',
 'Embarked',
 'Sex',
 'Title',
 'LastName',
 'Pclass.MODE(df.Deck)',
 'Pclass.MODE(df.Embarked)',
 'Pclass.MODE(df.LastName)',
 'Pclass.MODE(df.Sex)',
 'Pclass.MODE(df.Title)',
 'Sex.MODE(df.Deck)',
 'Sex.MODE(df.Embarked)',
 'Sex.MODE(df.LastName)',
 'Sex.MODE(df.Title)',
 'Age.MODE(df.Deck)',
 'Age.MODE(df.Embarked)',
 'Age.MODE(df.LastName)',
 'Age.MODE(df.Sex)',
 'Age.MODE(df.Title)',
 'Fare.MODE(df.Deck)',
 'Fare.MODE(df.Embarked)',
 'Fare.MODE(df.LastName)',
 'Fare.MODE(df.Sex)',
 'Fare.MODE(df.Title)',
 'Embarked.MODE(df.Deck)',
 'Embarked.MODE(df.LastName)',
 'Embarked.MODE(df.Sex)',
 'Embarked.MODE(df.Title)',
 'Title.MODE(df.Deck)',
 'Title.MODE(df.Embarked)',
 'Title.MODE(df.LastName)',
 'Title.MODE(df.Sex)',
 'LastName.MODE(df.Deck)',
 'LastName.MODE(df.Embarked)',
 'LastName.MODE(df.Sex)',
 'LastName.MODE(df.Title)',
 'Deck.MODE(df.Embarked)',
 'Deck.MODE(df.LastName)',
 'Deck.MODE(df.Sex)',
 'Deck.MODE(df.Title)',
 'Family_Size.MODE(df.Deck)',
 'Family_Size.MODE(

In [163]:
# Encoding categorical features

features_cate = features.copy()

encode_list = []

for col in categorical_columns:
    if col in features.columns:
        le = LabelEncoder()
        le.fit(list(features[col].astype(str).values))
        encode_list.append(le)
        features_cate[col] = le.transform(list(features[col].astype(str).values))

In [164]:
len(encode_list)

57

In [165]:
features.head()

Unnamed: 0_level_0,Pclass,Age,Fare,Deck,Embarked,Sex,Title,Family_Size,LastName,Pclass.COUNT(df),...,Title_LastName.MODE(df.Sex),Title_LastName.MODE(df.Title),Title_LastName.NUM_UNIQUE(df.Age),Title_LastName.NUM_UNIQUE(df.Deck),Title_LastName.NUM_UNIQUE(df.Embarked),Title_LastName.NUM_UNIQUE(df.Family_Size),Title_LastName.NUM_UNIQUE(df.Fare),Title_LastName.NUM_UNIQUE(df.Pclass),Title_LastName.NUM_UNIQUE(df.Sex),Title_LastName.NUM_UNIQUE(df.Title)
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,3,22.0,7.25,M,S,male,Mr,2,Braund,491,...,male,Mr,2,1,1,1,2,1,1,1
2,1,38.0,71.2833,C,C,female,Mrs,2,Cumings,216,...,female,Mrs,1,1,1,1,1,1,1,1
3,3,26.0,7.925,M,S,female,Miss,1,Heikkinen,491,...,female,Miss,1,1,1,1,1,1,1,1
4,1,35.0,53.1,C,S,female,Mrs,2,Futrelle,216,...,female,Mr,2,1,1,1,1,1,2,2
5,3,35.0,8.05,M,S,male,Mr,1,Allen,491,...,female,Miss,2,2,1,1,2,2,2,2


In [167]:
features_cate

Unnamed: 0_level_0,Pclass,Age,Fare,Deck,Embarked,Sex,Title,Family_Size,LastName,Pclass.COUNT(df),...,Title_LastName.MODE(df.Sex),Title_LastName.MODE(df.Title),Title_LastName.NUM_UNIQUE(df.Age),Title_LastName.NUM_UNIQUE(df.Deck),Title_LastName.NUM_UNIQUE(df.Embarked),Title_LastName.NUM_UNIQUE(df.Family_Size),Title_LastName.NUM_UNIQUE(df.Fare),Title_LastName.NUM_UNIQUE(df.Pclass),Title_LastName.NUM_UNIQUE(df.Sex),Title_LastName.NUM_UNIQUE(df.Title)
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,3,22.0,7.2500,7,2,1,9,2,73,491,...,1,9,2,1,1,1,2,1,1,1
2,1,38.0,71.2833,2,0,0,10,2,136,216,...,0,10,1,1,1,1,1,1,1,1
3,3,26.0,7.9250,7,2,0,8,1,251,491,...,0,8,1,1,1,1,1,1,1,1
4,1,35.0,53.1000,2,2,0,10,2,198,216,...,0,9,2,1,1,1,1,1,2,2
5,3,35.0,8.0500,7,2,1,9,1,11,491,...,0,8,2,2,1,1,2,2,2,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
887,2,27.0,13.0000,7,2,1,11,1,406,184,...,1,11,1,1,1,1,1,1,1,1
888,1,19.0,30.0000,1,2,0,8,1,221,216,...,0,8,3,2,1,2,2,1,2,3
889,3,18.0,23.4500,7,2,0,8,4,293,491,...,0,8,2,1,1,1,1,1,2,2
890,1,26.0,30.0000,2,0,1,9,1,52,216,...,1,9,1,1,1,1,1,1,1,1


In [168]:
target

PassengerId
1      0.0
2      1.0
3      1.0
4      1.0
5      0.0
      ... 
887    0.0
888    1.0
889    0.0
890    1.0
891    0.0
Name: Survived, Length: 891, dtype: float64

In [169]:
train = features_cate

lasso = LassoCV(cv=5).fit(train, target)
model = SelectFromModel(lasso, prefit=True)
X_new = model.transform(train)
X_selected_df = pd.DataFrame(X_new, columns=[train.columns[i] for i in range(len(train.columns)) 
                                             if model.get_support()[i]])

In [170]:
X_selected_df

Unnamed: 0,Age,Fare,Pclass.COUNT(df),Pclass.NUM_UNIQUE(df.Fare),Sex.COUNT(df),Age.COUNT(df),Age.MODE(df.LastName),Fare.COUNT(df),Fare.MODE(df.Age),Fare.MODE(df.LastName),Embarked.COUNT(df),Title.COUNT(df),Title.MODE(df.Age),Title.MODE(df.Fare),Title.NUM_UNIQUE(df.Fare),Deck.COUNT(df),Family_Size.MODE(df.Age),Family_Size.NUM_UNIQUE(df.Age),Family_Size.NUM_UNIQUE(df.LastName),Sex_LastName.MODE(df.Fare)
0,22.0,7.2500,491.0,119.0,577.0,27.0,19.0,13.0,22.0,3.0,646.0,517.0,26.0,8.05,171.0,687.0,18.0,53.0,116.0,7.0458
1,38.0,71.2833,216.0,94.0,314.0,11.0,12.0,1.0,38.0,69.0,168.0,126.0,31.0,26.00,99.0,59.0,18.0,53.0,116.0,71.2833
2,26.0,7.9250,491.0,119.0,314.0,108.0,66.0,18.0,32.0,103.0,646.0,185.0,18.0,7.75,108.0,687.0,26.0,71.0,499.0,7.9250
3,35.0,53.1000,216.0,94.0,314.0,18.0,1.0,5.0,19.0,58.0,646.0,126.0,31.0,26.00,99.0,59.0,18.0,53.0,116.0,53.1000
4,35.0,8.0500,491.0,119.0,577.0,18.0,1.0,43.0,26.0,4.0,646.0,517.0,26.0,8.05,171.0,687.0,26.0,71.0,499.0,8.0500
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,27.0,13.0000,184.0,42.0,577.0,18.0,8.0,42.0,30.0,26.0,646.0,6.0,27.0,13.00,5.0,687.0,26.0,71.0,499.0,13.0000
887,19.0,30.0000,216.0,94.0,314.0,25.0,6.0,6.0,19.0,29.0,646.0,185.0,18.0,7.75,108.0,47.0,26.0,71.0,499.0,153.4625
888,18.0,23.4500,491.0,119.0,314.0,59.0,52.0,2.0,18.0,135.0,646.0,185.0,18.0,7.75,108.0,687.0,24.0,19.0,13.0,23.4500
889,26.0,30.0000,216.0,94.0,577.0,108.0,66.0,6.0,19.0,29.0,168.0,517.0,26.0,8.05,171.0,59.0,26.0,71.0,499.0,30.0000


In [171]:
FE_option3 = X_selected_df.columns
FE_option3

Index(['Age', 'Fare', 'Pclass.COUNT(df)', 'Pclass.NUM_UNIQUE(df.Fare)',
       'Sex.COUNT(df)', 'Age.COUNT(df)', 'Age.MODE(df.LastName)',
       'Fare.COUNT(df)', 'Fare.MODE(df.Age)', 'Fare.MODE(df.LastName)',
       'Embarked.COUNT(df)', 'Title.COUNT(df)', 'Title.MODE(df.Age)',
       'Title.MODE(df.Fare)', 'Title.NUM_UNIQUE(df.Fare)', 'Deck.COUNT(df)',
       'Family_Size.MODE(df.Age)', 'Family_Size.NUM_UNIQUE(df.Age)',
       'Family_Size.NUM_UNIQUE(df.LastName)', 'Sex_LastName.MODE(df.Fare)'],
      dtype='object')

# Load test data

In [172]:
es_tst = ft.EntitySet(id = 'titanic_test')
es_tst = es_tst.entity_from_dataframe(entity_id = 'df', dataframe = testdf, 
                              variable_types = 
                              {
                                  'Embarked': ft.variable_types.Categorical,
                                  'Sex': ft.variable_types.Boolean,
                                  'Title': ft.variable_types.Categorical,
                                  'Family_Size': ft.variable_types.Numeric,
                                  'LastName': ft.variable_types.Categorical
                              },
                              index = 'PassengerId')

In [173]:
es_tst = es_tst.normalize_entity(base_entity_id='df', new_entity_id='Pclass', index='Pclass')
es_tst = es_tst.normalize_entity(base_entity_id='df', new_entity_id='Sex', index='Sex')
es_tst = es_tst.normalize_entity(base_entity_id='df', new_entity_id='Age', index='Age')
es_tst = es_tst.normalize_entity(base_entity_id='df', new_entity_id='Fare', index='Fare')
es_tst = es_tst.normalize_entity(base_entity_id='df', new_entity_id='Embarked', index='Embarked')
es_tst = es_tst.normalize_entity(base_entity_id='df', new_entity_id='Title', index='Title')
es_tst = es_tst.normalize_entity(base_entity_id='df', new_entity_id='LastName', index='LastName')
es_tst = es_tst.normalize_entity(base_entity_id='df', new_entity_id='Deck', index='Deck')
es_tst = es_tst.normalize_entity(base_entity_id='df', new_entity_id='Family_Size', index='Family_Size')
es_tst = es_tst.normalize_entity(base_entity_id='df', new_entity_id='Title_Sex', index='Sex')
es_tst = es_tst.normalize_entity(base_entity_id='df', new_entity_id='Sex_LastName', index='LastName')
es_tst = es_tst.normalize_entity(base_entity_id='df', new_entity_id='Title_LastName', index='LastName')
es_tst

es_tst

Entityset: titanic_test
  Entities:
    df [Rows: 418, Columns: 10]
    Pclass [Rows: 3, Columns: 1]
    Sex [Rows: 2, Columns: 1]
    Age [Rows: 81, Columns: 1]
    Fare [Rows: 169, Columns: 1]
    Embarked [Rows: 3, Columns: 1]
    Title [Rows: 8, Columns: 1]
    LastName [Rows: 352, Columns: 1]
    Deck [Rows: 8, Columns: 1]
    Family_Size [Rows: 9, Columns: 1]
    Title_Sex [Rows: 2, Columns: 1]
    Sex_LastName [Rows: 352, Columns: 1]
    Title_LastName [Rows: 352, Columns: 1]
  Relationships:
    df.Pclass -> Pclass.Pclass
    df.Sex -> Sex.Sex
    df.Age -> Age.Age
    df.Fare -> Fare.Fare
    df.Embarked -> Embarked.Embarked
    df.Title -> Title.Title
    df.LastName -> LastName.LastName
    df.Deck -> Deck.Deck
    df.Family_Size -> Family_Size.Family_Size
    df.Sex -> Title_Sex.Sex
    df.LastName -> Sex_LastName.LastName
    df.LastName -> Title_LastName.LastName

In [174]:
es_tst

Entityset: titanic_test
  Entities:
    df [Rows: 418, Columns: 10]
    Pclass [Rows: 3, Columns: 1]
    Sex [Rows: 2, Columns: 1]
    Age [Rows: 81, Columns: 1]
    Fare [Rows: 169, Columns: 1]
    Embarked [Rows: 3, Columns: 1]
    Title [Rows: 8, Columns: 1]
    LastName [Rows: 352, Columns: 1]
    Deck [Rows: 8, Columns: 1]
    Family_Size [Rows: 9, Columns: 1]
    Title_Sex [Rows: 2, Columns: 1]
    Sex_LastName [Rows: 352, Columns: 1]
    Title_LastName [Rows: 352, Columns: 1]
  Relationships:
    df.Pclass -> Pclass.Pclass
    df.Sex -> Sex.Sex
    df.Age -> Age.Age
    df.Fare -> Fare.Fare
    df.Embarked -> Embarked.Embarked
    df.Title -> Title.Title
    df.LastName -> LastName.LastName
    df.Deck -> Deck.Deck
    df.Family_Size -> Family_Size.Family_Size
    df.Sex -> Title_Sex.Sex
    df.LastName -> Sex_LastName.LastName
    df.LastName -> Title_LastName.LastName

In [187]:
test_fe = ft.calculate_feature_matrix(features=feature_names, entityset=es_tst)
test_fe

Unnamed: 0_level_0,Pclass,Age,Fare,Deck,Embarked,Sex,Title,Family_Size,LastName,Pclass.COUNT(df),...,Title_LastName.MODE(df.Sex),Title_LastName.MODE(df.Title),Title_LastName.NUM_UNIQUE(df.Age),Title_LastName.NUM_UNIQUE(df.Deck),Title_LastName.NUM_UNIQUE(df.Embarked),Title_LastName.NUM_UNIQUE(df.Family_Size),Title_LastName.NUM_UNIQUE(df.Fare),Title_LastName.NUM_UNIQUE(df.Pclass),Title_LastName.NUM_UNIQUE(df.Sex),Title_LastName.NUM_UNIQUE(df.Title)
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
892,3,34.5,7.8292,M,Q,male,Mr,1,Kelly,218,...,male,Mr,1,1,1,1,1,1,1,1
893,3,47.0,7.0000,M,S,female,Mrs,2,Wilkes,218,...,female,Mrs,1,1,1,1,1,1,1,1
894,2,62.0,9.6875,M,Q,male,Mr,1,Myles,93,...,male,Mr,1,1,1,1,1,1,1,1
895,3,27.0,8.6625,M,S,male,Mr,1,Wirz,218,...,male,Mr,1,1,1,1,1,1,1,1
896,3,22.0,12.2875,M,S,female,Mrs,3,Hirvonen,218,...,female,Mrs,1,1,1,1,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1305,3,26.0,8.0500,M,S,male,Mr,1,Spector,218,...,male,Mr,1,1,1,1,1,1,1,1
1306,1,39.0,108.9000,C,C,female,Dona,1,Oliva y Ocana,107,...,female,Dona,1,1,1,1,1,1,1,1
1307,3,38.5,7.2500,M,S,male,Mr,1,Saether,218,...,male,Mr,1,1,1,1,1,1,1,1
1308,3,26.0,8.0500,M,S,male,Mr,1,Ware,218,...,male,Mr,4,1,1,2,3,2,2,2


In [188]:
features[categorical_columns]

Unnamed: 0_level_0,Deck,Embarked,Sex,Title,LastName,Pclass.MODE(df.Deck),Pclass.MODE(df.Embarked),Pclass.MODE(df.LastName),Pclass.MODE(df.Sex),Pclass.MODE(df.Title),...,Title_Sex.MODE(df.LastName),Title_Sex.MODE(df.Title),Sex_LastName.MODE(df.Deck),Sex_LastName.MODE(df.Embarked),Sex_LastName.MODE(df.Sex),Sex_LastName.MODE(df.Title),Title_LastName.MODE(df.Deck),Title_LastName.MODE(df.Embarked),Title_LastName.MODE(df.Sex),Title_LastName.MODE(df.Title)
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,M,S,male,Mr,Braund,M,S,Andersson,male,Mr,...,Panula,Mr,M,S,male,Mr,M,S,male,Mr
2,C,C,female,Mrs,Cumings,C,S,Carter,male,Mr,...,Andersson,Miss,C,C,female,Mrs,C,C,female,Mrs
3,M,S,female,Miss,Heikkinen,M,S,Andersson,male,Mr,...,Andersson,Miss,M,S,female,Miss,M,S,female,Miss
4,C,S,female,Mrs,Futrelle,C,S,Carter,male,Mr,...,Andersson,Miss,C,S,female,Mr,C,S,female,Mr
5,M,S,male,Mr,Allen,M,S,Andersson,male,Mr,...,Panula,Mr,B,S,female,Miss,B,S,female,Miss
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
887,M,S,male,Rev,Montvila,M,S,Brown,male,Mr,...,Panula,Mr,M,S,male,Rev,M,S,male,Rev
888,B,S,female,Miss,Graham,C,S,Carter,male,Mr,...,Andersson,Miss,C,S,female,Miss,C,S,female,Miss
889,M,S,female,Miss,Johnston,M,S,Andersson,male,Mr,...,Andersson,Miss,M,S,female,Miss,M,S,female,Miss
890,C,C,male,Mr,Behr,C,S,Carter,male,Mr,...,Panula,Mr,C,C,male,Mr,C,C,male,Mr


In [189]:
features[categorical_columns]

Unnamed: 0_level_0,Deck,Embarked,Sex,Title,LastName,Pclass.MODE(df.Deck),Pclass.MODE(df.Embarked),Pclass.MODE(df.LastName),Pclass.MODE(df.Sex),Pclass.MODE(df.Title),...,Title_Sex.MODE(df.LastName),Title_Sex.MODE(df.Title),Sex_LastName.MODE(df.Deck),Sex_LastName.MODE(df.Embarked),Sex_LastName.MODE(df.Sex),Sex_LastName.MODE(df.Title),Title_LastName.MODE(df.Deck),Title_LastName.MODE(df.Embarked),Title_LastName.MODE(df.Sex),Title_LastName.MODE(df.Title)
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,M,S,male,Mr,Braund,M,S,Andersson,male,Mr,...,Panula,Mr,M,S,male,Mr,M,S,male,Mr
2,C,C,female,Mrs,Cumings,C,S,Carter,male,Mr,...,Andersson,Miss,C,C,female,Mrs,C,C,female,Mrs
3,M,S,female,Miss,Heikkinen,M,S,Andersson,male,Mr,...,Andersson,Miss,M,S,female,Miss,M,S,female,Miss
4,C,S,female,Mrs,Futrelle,C,S,Carter,male,Mr,...,Andersson,Miss,C,S,female,Mr,C,S,female,Mr
5,M,S,male,Mr,Allen,M,S,Andersson,male,Mr,...,Panula,Mr,B,S,female,Miss,B,S,female,Miss
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
887,M,S,male,Rev,Montvila,M,S,Brown,male,Mr,...,Panula,Mr,M,S,male,Rev,M,S,male,Rev
888,B,S,female,Miss,Graham,C,S,Carter,male,Mr,...,Andersson,Miss,C,S,female,Miss,C,S,female,Miss
889,M,S,female,Miss,Johnston,M,S,Andersson,male,Mr,...,Andersson,Miss,M,S,female,Miss,M,S,female,Miss
890,C,C,male,Mr,Behr,C,S,Carter,male,Mr,...,Panula,Mr,C,C,male,Mr,C,C,male,Mr


In [200]:
test_fe = test_fe[(test_fe.Title != 'Dona')]

In [206]:
i = 0

for col in categorical_columns:
    
    try: 
        test_fe[col] = encode_list[i].transform(test_fe[col])
    except:
        print(col, "An exception occurred")
    i += 1

LastName An exception occurred
Pclass.MODE(df.Embarked) An exception occurred
Pclass.MODE(df.LastName) An exception occurred
Sex.MODE(df.LastName) An exception occurred
Age.MODE(df.LastName) An exception occurred
Age.MODE(df.Title) An exception occurred
Fare.MODE(df.LastName) An exception occurred
Fare.MODE(df.Title) An exception occurred
Embarked.MODE(df.LastName) An exception occurred
Embarked.MODE(df.Sex) An exception occurred
Embarked.MODE(df.Title) An exception occurred
Title.MODE(df.Deck) An exception occurred
Title.MODE(df.LastName) An exception occurred
Deck.MODE(df.Embarked) An exception occurred
Deck.MODE(df.LastName) An exception occurred
Family_Size.MODE(df.Deck) An exception occurred
Family_Size.MODE(df.LastName) An exception occurred
Family_Size.MODE(df.Title) An exception occurred
Title_Sex.MODE(df.LastName) An exception occurred


In [208]:
test_fe[FE_option3]

Unnamed: 0_level_0,Age,Fare,Pclass.COUNT(df),Pclass.NUM_UNIQUE(df.Fare),Sex.COUNT(df),Age.COUNT(df),Age.MODE(df.LastName),Fare.COUNT(df),Fare.MODE(df.Age),Fare.MODE(df.LastName),Embarked.COUNT(df),Title.COUNT(df),Title.MODE(df.Age),Title.MODE(df.Fare),Title.NUM_UNIQUE(df.Fare),Deck.COUNT(df),Family_Size.MODE(df.Age),Family_Size.NUM_UNIQUE(df.Age),Family_Size.NUM_UNIQUE(df.LastName),Sex_LastName.MODE(df.Fare)
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
892,34.5,7.8292,218,79,266,1,Kelly,1,34.5,Kelly,46,240,26.0,8.05,104,327,26.0,54,240,7.8292
893,47.0,7.0000,218,79,152,5,Astor,2,26.0,Pearce,270,72,31.0,21.00,63,327,26.0,39,63,7.0000
894,62.0,9.6875,93,31,266,1,Myles,1,62.0,Myles,46,240,26.0,8.05,104,327,26.0,54,240,9.6875
895,27.0,8.6625,218,79,266,12,Barry,8,27.0,Cacic,270,240,26.0,8.05,104,327,26.0,54,240,8.6625
896,22.0,12.2875,218,79,152,16,Bradley,1,22.0,Hirvonen,270,72,31.0,21.00,63,327,26.0,37,41,12.2875
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1304,28.0,7.7750,218,79,152,7,Beauchamp,10,26.0,Andersson,270,79,18.0,7.75,53,327,26.0,54,240,7.7750
1305,26.0,8.0500,218,79,266,58,Thomas,17,26.0,Davies,270,240,26.0,8.05,104,327,26.0,54,240,8.0500
1307,38.5,7.2500,218,79,266,1,Saether,5,24.0,Carver,270,240,26.0,8.05,104,327,26.0,54,240,7.2500
1308,26.0,8.0500,218,79,266,58,Thomas,17,26.0,Davies,270,240,26.0,8.05,104,327,26.0,54,240,21.0000
