# Titanic FeatureTools version -- only for training data

Follow up: https://github.com/dongzhang84/Featuretools/blob/main/Titanic_Featuretools_automation_1.ipynb

Save features and load features

In [1]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import pickle
%matplotlib inline

import featuretools as ft
from featuretools.primitives import *
from featuretools.variable_types import Numeric

from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.linear_model import LinearRegression, LogisticRegression, LassoCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV

from sklearn.metrics import(
    classification_report, confusion_matrix, accuracy_score, mean_squared_error, 
    mean_absolute_error, explained_variance_score
)



from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectFromModel, SelectKBest, RFE, chi2

from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression


import warnings
warnings.filterwarnings("ignore")

In [2]:
# Read the training split keyed by PassengerId. The frame read from disk is kept
# untouched as X_train_orig; X_train is the working copy modified further down.
X_train_orig = pd.read_csv('titanic/traindf.csv').set_index('PassengerId')
X_train = X_train_orig.copy()

In [3]:
# Display the untouched copy of the training data (it still contains the Survived label).
X_train_orig

Unnamed: 0_level_0,Survived,Pclass,Sex,Age,Fare,Embarked,Title,Deck,Family_Size
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
299,1,1,male,40.00,30.5000,S,Mr,C,1
885,0,3,male,25.00,7.0500,S,Mr,M,1
248,1,2,female,24.00,14.5000,S,Mrs,M,3
479,0,3,male,22.00,7.5208,S,Mr,M,1
306,1,1,male,0.92,151.5500,S,Master,C,4
...,...,...,...,...,...,...,...,...,...
107,1,3,female,21.00,7.6500,S,Miss,M,1
271,0,1,male,40.00,31.0000,S,Mr,M,1
861,0,3,male,41.00,14.1083,S,Mr,M,3
436,1,1,female,14.00,120.0000,S,Miss,B,4


In [4]:
# Separate the label from the predictors: y_train is the Survived column,
# X_train keeps every other column.
y_train = X_train['Survived']
X_train = X_train.drop(columns=['Survived'])

# Automatic FE with Featuretools

In [5]:
# Build an entity set with a single entity 'df' that holds the training features.
#
# X_train has no 'Id' column (PassengerId is the DataFrame index, not a column),
# so make_index=True asks featuretools to create a fresh integer index column
# explicitly instead of relying on its warn-and-create fallback for a missing
# index column (that warning is hidden by the global warnings filter).
#
# NOTE(review): Sex holds the strings 'male'/'female' in the data preview but is
# declared Boolean here -- confirm whether Categorical is the intended type.
es = ft.EntitySet(id='titanic_data')
es = es.entity_from_dataframe(
    entity_id='df',
    dataframe=X_train,
    index='Id',
    make_index=True,
    variable_types={
        'Embarked': ft.variable_types.Categorical,
        'Sex': ft.variable_types.Boolean,
        'Title': ft.variable_types.Categorical,
        'Family_Size': ft.variable_types.Numeric,
    },
)

In [6]:
# Inspect the 'df' entity: its variables with their types, and its shape.
es['df']

Entity: df
  Variables:
    Id (dtype: index)
    Pclass (dtype: numeric)
    Age (dtype: numeric)
    Fare (dtype: numeric)
    Deck (dtype: categorical)
    Embarked (dtype: categorical)
    Sex (dtype: boolean)
    Title (dtype: categorical)
    Family_Size (dtype: numeric)
  Shape:
    (Rows: 668, Columns: 9)

In [7]:
# Split each of these columns of 'df' out into its own entity (named after the
# column and indexed by it). The order matches the entity/relationship listing
# shown when the entity set is displayed.
for key_column in ('Pclass', 'Sex', 'Embarked', 'Title', 'Deck'):
    es = es.normalize_entity(base_entity_id='df',
                             new_entity_id=key_column,
                             index=key_column)
es

Entityset: titanic_data
  Entities:
    df [Rows: 668, Columns: 9]
    Pclass [Rows: 3, Columns: 1]
    Sex [Rows: 2, Columns: 1]
    Embarked [Rows: 3, Columns: 1]
    Title [Rows: 11, Columns: 1]
    Deck [Rows: 8, Columns: 1]
  Relationships:
    df.Pclass -> Pclass.Pclass
    df.Sex -> Sex.Sex
    df.Embarked -> Embarked.Embarked
    df.Title -> Title.Title
    df.Deck -> Deck.Deck

In [8]:
# Run deep feature synthesis on the 'df' entity, stacking primitives up to depth 2.
# NOTE(review): 'Survived' was dropped from X_train earlier and 'PassengerId' is the
# DataFrame index, so neither appears among the entity's variables -- confirm the
# ignore list is still needed.
feature_matrix, feature_names = ft.dfs(
    entityset=es,
    target_entity='df',
    max_depth=2,
    ignore_variables={'df': ['Survived', 'PassengerId']},
)

# Number of generated feature definitions.
len(feature_names)

143

In [9]:
# List the feature definitions returned by ft.dfs.
feature_names

[<Feature: Pclass>,
 <Feature: Age>,
 <Feature: Fare>,
 <Feature: Deck>,
 <Feature: Embarked>,
 <Feature: Sex>,
 <Feature: Title>,
 <Feature: Family_Size>,
 <Feature: Pclass.COUNT(df)>,
 <Feature: Pclass.MAX(df.Age)>,
 <Feature: Pclass.MAX(df.Family_Size)>,
 <Feature: Pclass.MAX(df.Fare)>,
 <Feature: Pclass.MEAN(df.Age)>,
 <Feature: Pclass.MEAN(df.Family_Size)>,
 <Feature: Pclass.MEAN(df.Fare)>,
 <Feature: Pclass.MIN(df.Age)>,
 <Feature: Pclass.MIN(df.Family_Size)>,
 <Feature: Pclass.MIN(df.Fare)>,
 <Feature: Pclass.MODE(df.Deck)>,
 <Feature: Pclass.MODE(df.Embarked)>,
 <Feature: Pclass.MODE(df.Sex)>,
 <Feature: Pclass.MODE(df.Title)>,
 <Feature: Pclass.NUM_UNIQUE(df.Deck)>,
 <Feature: Pclass.NUM_UNIQUE(df.Embarked)>,
 <Feature: Pclass.NUM_UNIQUE(df.Sex)>,
 <Feature: Pclass.NUM_UNIQUE(df.Title)>,
 <Feature: Pclass.SKEW(df.Age)>,
 <Feature: Pclass.SKEW(df.Family_Size)>,
 <Feature: Pclass.SKEW(df.Fare)>,
 <Feature: Pclass.STD(df.Age)>,
 <Feature: Pclass.STD(df.Family_Size)>,
 <Feature: P

In [10]:
# Encode the categorical features of the matrix into indicator columns
# (e.g. "Deck = M"); include_unknown=False is passed through unchanged.
feature_matrix_enc, features_enc = ft.encode_features(
    feature_matrix,
    feature_names,
    include_unknown=False,
)

In [11]:
# Number of feature definitions after categorical encoding.
len(features_enc)

179

In [12]:
# From here on X_train is a copy of the encoded feature matrix (indexed by Id),
# so later modifications do not touch feature_matrix_enc.
X_train = feature_matrix_enc.copy()
X_train.head()

Unnamed: 0_level_0,Pclass = 3,Pclass = 1,Pclass = 2,Age,Fare,Deck = M,Deck = C,Deck = B,Deck = E,Deck = D,...,Deck.NUM_UNIQUE(df.Title),Deck.SKEW(df.Age),Deck.SKEW(df.Family_Size),Deck.SKEW(df.Fare),Deck.STD(df.Age),Deck.STD(df.Family_Size),Deck.STD(df.Fare),Deck.SUM(df.Age),Deck.SUM(df.Family_Size),Deck.SUM(df.Fare)
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,False,True,False,40.0,30.5,False,True,False,False,False,...,6,-0.246234,1.479506,1.012559,14.795076,1.451969,69.726912,1663.92,104,4675.9207
1,True,False,False,25.0,7.05,True,False,False,False,False,...,7,0.457219,2.757381,9.112063,12.214704,1.822822,31.726899,14027.92,1007,10247.2661
2,False,False,True,24.0,14.5,True,False,False,False,False,...,7,0.457219,2.757381,9.112063,12.214704,1.822822,31.726899,14027.92,1007,10247.2661
3,True,False,False,22.0,7.5208,True,False,False,False,False,...,7,0.457219,2.757381,9.112063,12.214704,1.822822,31.726899,14027.92,1007,10247.2661
4,False,True,False,0.92,151.55,False,True,False,False,False,...,6,-0.246234,1.479506,1.012559,14.795076,1.451969,69.726912,1663.92,104,4675.9207


In [13]:
# Preview the first rows of the original (pre-featuretools) training data.
X_train_orig.head()

Unnamed: 0_level_0,Survived,Pclass,Sex,Age,Fare,Embarked,Title,Deck,Family_Size
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
299,1,1,male,40.0,30.5,S,Mr,C,1
885,0,3,male,25.0,7.05,S,Mr,M,1
248,1,2,female,24.0,14.5,S,Mrs,M,3
479,0,3,male,22.0,7.5208,S,Mr,M,1
306,1,1,male,0.92,151.55,S,Master,C,4


In [14]:
# Persist the encoded feature definitions (not the computed values) as JSON.
# NOTE(review): presumably these are reloaded elsewhere to recompute the same
# features on other data -- confirm against the consumer of this file.

ft.save_features(features_enc, "titanic/feature_definitions.json")

# Encoding categorical features

In [15]:
# Determine the categorical features: every column of X_train whose dtype is
# not one of the numeric dtypes listed below.
numerics = ['int8', 'int16', 'int32', 'int64', 'float16', 'float32', 'float64']
cols = X_train.columns.values.tolist()
categorical_columns = [col for col in cols if X_train[col].dtype not in numerics]
categorical_columns

['Pclass = 3',
 'Pclass = 1',
 'Pclass = 2',
 'Deck = M',
 'Deck = C',
 'Deck = B',
 'Deck = E',
 'Deck = D',
 'Deck = F',
 'Deck = A',
 'Deck = G',
 'Embarked = S',
 'Embarked = C',
 'Embarked = Q',
 'Sex = male',
 'Sex = female',
 'Title = Mr',
 'Title = Miss',
 'Title = Mrs',
 'Title = Master',
 'Title = Rev',
 'Title = Dr',
 'Title = Col',
 'Title = the Countess',
 'Title = Major',
 'Title = Lady',
 'Pclass.MODE(df.Deck) = M',
 'Pclass.MODE(df.Deck) = C',
 'Pclass.MODE(df.Embarked) = S',
 'Pclass.MODE(df.Sex) = male',
 'Pclass.MODE(df.Title) = Mr',
 'Sex.MODE(df.Deck) = M',
 'Sex.MODE(df.Embarked) = S',
 'Sex.MODE(df.Pclass) = 3',
 'Sex.MODE(df.Title) = Mr',
 'Sex.MODE(df.Title) = Miss',
 'Embarked.MODE(df.Deck) = M',
 'Embarked.MODE(df.Pclass) = 3',
 'Embarked.MODE(df.Pclass) = 1',
 'Embarked.MODE(df.Sex) = male',
 'Embarked.MODE(df.Title) = Mr',
 'Title.MODE(df.Deck) = M',
 'Title.MODE(df.Deck) = A',
 'Title.MODE(df.Deck) = B',
 'Title.MODE(df.Deck) = C',
 'Title.MODE(df.Embarked

In [16]:
from sklearn.preprocessing import OrdinalEncoder

# Ordinal-encode every non-numeric column of X_train in place.
# The fitted encoders are collected in encode_list in the same order as
# categorical_columns, so encode_list[i] belongs to categorical_columns[i].
encode_list = []

for col in categorical_columns:
    # Encode the string form of the values; OrdinalEncoder needs a 2-D
    # (n_samples, 1) array, hence the reshape.
    col_values = X_train[col].astype(str).values.reshape(-1, 1)

    # Categories not seen during fit are mapped to -1 at transform time
    # instead of raising.
    le = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)

    # fit_transform returns shape (n_samples, 1); flatten it before assigning
    # it back as a single column.
    X_train[col] = le.fit_transform(col_values).ravel()
    encode_list.append(le)

In [17]:
# Display the feature matrix after ordinal encoding of the non-numeric columns.
X_train

Unnamed: 0_level_0,Pclass = 3,Pclass = 1,Pclass = 2,Age,Fare,Deck = M,Deck = C,Deck = B,Deck = E,Deck = D,...,Deck.NUM_UNIQUE(df.Title),Deck.SKEW(df.Age),Deck.SKEW(df.Family_Size),Deck.SKEW(df.Fare),Deck.STD(df.Age),Deck.STD(df.Family_Size),Deck.STD(df.Fare),Deck.SUM(df.Age),Deck.SUM(df.Family_Size),Deck.SUM(df.Fare)
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.0,1.0,0.0,40.00,30.5000,0.0,1.0,0.0,0.0,0.0,...,6,-0.246234,1.479506,1.012559,14.795076,1.451969,69.726912,1663.92,104,4675.9207
1,1.0,0.0,0.0,25.00,7.0500,1.0,0.0,0.0,0.0,0.0,...,7,0.457219,2.757381,9.112063,12.214704,1.822822,31.726899,14027.92,1007,10247.2661
2,0.0,0.0,1.0,24.00,14.5000,1.0,0.0,0.0,0.0,0.0,...,7,0.457219,2.757381,9.112063,12.214704,1.822822,31.726899,14027.92,1007,10247.2661
3,1.0,0.0,0.0,22.00,7.5208,1.0,0.0,0.0,0.0,0.0,...,7,0.457219,2.757381,9.112063,12.214704,1.822822,31.726899,14027.92,1007,10247.2661
4,0.0,1.0,0.0,0.92,151.5500,0.0,1.0,0.0,0.0,0.0,...,6,-0.246234,1.479506,1.012559,14.795076,1.451969,69.726912,1663.92,104,4675.9207
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
663,1.0,0.0,0.0,21.00,7.6500,1.0,0.0,0.0,0.0,0.0,...,7,0.457219,2.757381,9.112063,12.214704,1.822822,31.726899,14027.92,1007,10247.2661
664,0.0,1.0,0.0,40.00,31.0000,1.0,0.0,0.0,0.0,0.0,...,7,0.457219,2.757381,9.112063,12.214704,1.822822,31.726899,14027.92,1007,10247.2661
665,1.0,0.0,0.0,41.00,14.1083,1.0,0.0,0.0,0.0,0.0,...,7,0.457219,2.757381,9.112063,12.214704,1.822822,31.726899,14027.92,1007,10247.2661
666,0.0,1.0,0.0,14.00,120.0000,0.0,0.0,1.0,0.0,0.0,...,6,0.565411,1.611508,2.458647,14.737029,1.059714,119.371526,1230.00,58,3732.0251


In [18]:
# Write the list of fitted encoders as the first pickled object in models.pkl.
# Mode "wb" truncates any existing file; later cells append further objects to
# the same file with "ab", so it is read back with successive pickle.load calls
# in the order the objects were written.

with open("titanic/models.pkl", "wb") as f:
    pickle.dump(encode_list, f)

# Feature Selection

In [19]:
# Count missing values per column, sorted ascending (columns with the most NaNs come last).
X_train.isnull().sum().sort_values()

Pclass = 3                      0
Embarked.SUM(df.Age)            0
Embarked.SUM(df.Family_Size)    0
Embarked.SUM(df.Fare)           0
Title.COUNT(df)                 0
                               ..
Title.STD(df.Fare)              4
Title.STD(df.Age)               4
Title.SKEW(df.Fare)             6
Title.SKEW(df.Family_Size)      6
Title.SKEW(df.Age)              6
Length: 179, dtype: int64

In [20]:
# Replace the remaining NaNs (reported by the null count above) with 0.
X_train = X_train.fillna(0)

In [21]:
## Threshold for removing correlated variables

#threshold = 0.8

## Absolute value correlation matrix
#corr_matrix = X_train.corr().abs()
#upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))

## Select columns with correlations above threshold
#collinear_features = [column for column in upper.columns if any(upper[column] > threshold)]

#print('There are %d features to remove.' % (len(collinear_features)))

#train_fe = X_train.drop(columns = collinear_features)
#train_fe

In [22]:
#lasso = LassoCV(cv=5).fit(X_train, y_train)
#model = SelectFromModel(lasso, prefit=True)
#X_new = model.transform(X_train)
#train_fe = pd.DataFrame(X_new, columns=[X_train.columns[i] for i in range(len(X_train.columns)) 
                                             #if model.get_support()[i]])
    
#model.get_support()

In [23]:
#lsvc = LinearSVC(C=0.01, penalty="l1", dual=False).fit(X_train, y_train)
#model = SelectFromModel(lsvc, prefit=True)
#X_new = model.transform(X_train)
#train_fe = pd.DataFrame(X_new, columns=[X_train.columns[i] 
                                        #for i in range(len(X_train.columns)) if model.get_support()[i]])
#train_fe.shape

#model.get_support()

## FS by embedded selection (SelectFromModel) with Random Forest

In [24]:
# Embedded feature selection: fit a random forest and keep the features whose
# importance reaches 1.25x the median importance.
# random_state is pinned so the same columns are selected on every run; the
# selected column names are pickled further down, so they must be reproducible.
embeded_rf_selector = SelectFromModel(
    RandomForestClassifier(n_estimators=200, random_state=42),
    threshold='1.25*median',
)
embeded_rf_selector.fit(X_train, y_train)

# Boolean mask over X_train's columns -> names of the selected features.
embeded_rf_support = embeded_rf_selector.get_support()
embeded_rf_feature = X_train.columns[embeded_rf_support].tolist()
print(len(embeded_rf_feature), 'selected features')

FE_option = embeded_rf_feature

# Training matrix restricted to the selected features.
train_fe = X_train[FE_option]
train_fe

75 selected features


Unnamed: 0_level_0,Pclass = 3,Age,Fare,Sex = male,Sex = female,Title = Mr,Family_Size,Pclass.COUNT(df),Pclass.MAX(df.Age),Pclass.MAX(df.Family_Size),...,Deck.MEAN(df.Family_Size),Deck.MEAN(df.Fare),Deck.MIN(df.Age),Deck.MIN(df.Fare),Deck.SKEW(df.Family_Size),Deck.STD(df.Age),Deck.STD(df.Family_Size),Deck.SUM(df.Age),Deck.SUM(df.Family_Size),Deck.SUM(df.Fare)
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.0,40.00,30.5000,1.0,0.0,1.0,1,152,80.0,6,...,2.260870,101.650450,0.92,26.55,1.479506,14.795076,1.451969,1663.92,104,4675.9207
1,1.0,25.00,7.0500,1.0,0.0,1.0,1,375,74.0,11,...,1.940270,19.744251,0.42,0.00,2.757381,12.214704,1.822822,14027.92,1007,10247.2661
2,0.0,24.00,14.5000,0.0,1.0,0.0,3,141,70.0,6,...,1.940270,19.744251,0.42,0.00,2.757381,12.214704,1.822822,14027.92,1007,10247.2661
3,1.0,22.00,7.5208,1.0,0.0,1.0,1,375,74.0,11,...,1.940270,19.744251,0.42,0.00,2.757381,12.214704,1.822822,14027.92,1007,10247.2661
4,0.0,0.92,151.5500,1.0,0.0,0.0,4,152,80.0,6,...,2.260870,101.650450,0.92,26.55,1.479506,14.795076,1.451969,1663.92,104,4675.9207
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
663,1.0,21.00,7.6500,0.0,1.0,0.0,1,375,74.0,11,...,1.940270,19.744251,0.42,0.00,2.757381,12.214704,1.822822,14027.92,1007,10247.2661
664,0.0,40.00,31.0000,1.0,0.0,1.0,1,152,80.0,6,...,1.940270,19.744251,0.42,0.00,2.757381,12.214704,1.822822,14027.92,1007,10247.2661
665,1.0,41.00,14.1083,1.0,0.0,1.0,3,375,74.0,11,...,1.940270,19.744251,0.42,0.00,2.757381,12.214704,1.822822,14027.92,1007,10247.2661
666,0.0,14.00,120.0000,0.0,1.0,0.0,4,152,80.0,6,...,1.705882,109.765444,14.00,0.00,1.611508,14.737029,1.059714,1230.00,58,3732.0251


In [25]:
# Rebind FE_option to the column labels of train_fe (a pandas Index rather than
# the plain list assigned earlier); this is the object pickled in the next cell.
FE_option = train_fe.columns
FE_option

Index(['Pclass = 3', 'Age', 'Fare', 'Sex = male', 'Sex = female', 'Title = Mr',
       'Family_Size', 'Pclass.COUNT(df)', 'Pclass.MAX(df.Age)',
       'Pclass.MAX(df.Family_Size)', 'Pclass.MAX(df.Fare)',
       'Pclass.MEAN(df.Age)', 'Pclass.MEAN(df.Family_Size)',
       'Pclass.MEAN(df.Fare)', 'Pclass.MIN(df.Age)',
       'Pclass.NUM_UNIQUE(df.Title)', 'Pclass.SKEW(df.Age)',
       'Pclass.SKEW(df.Family_Size)', 'Pclass.SKEW(df.Fare)',
       'Pclass.STD(df.Age)', 'Pclass.STD(df.Family_Size)',
       'Pclass.STD(df.Fare)', 'Pclass.SUM(df.Age)',
       'Pclass.SUM(df.Family_Size)', 'Sex.COUNT(df)', 'Sex.MAX(df.Age)',
       'Sex.MEAN(df.Age)', 'Sex.MEAN(df.Family_Size)', 'Sex.MEAN(df.Fare)',
       'Sex.MIN(df.Age)', 'Sex.MIN(df.Fare)', 'Sex.MODE(df.Title) = Miss',
       'Sex.NUM_UNIQUE(df.Title)', 'Sex.SKEW(df.Age)',
       'Sex.SKEW(df.Family_Size)', 'Sex.SKEW(df.Fare)', 'Sex.STD(df.Age)',
       'Sex.STD(df.Family_Size)', 'Sex.STD(df.Fare)', 'Sex.SUM(df.Age)',
       'Sex.SUM(df.Fa

In [26]:
# Append the selected feature names to models.pkl, after the object(s) already
# stored there. Mode "ab" appends, so re-running this cell adds another copy
# to the file.

with open("titanic/models.pkl", "ab") as f:
    pickle.dump(FE_option, f)

# Modeling

In [27]:
# Tune a random forest on the selected features (train_fe) with a 5-fold
# cross-validated grid search over n_estimators.
# The search is fitted exactly once: GridSearchCV refits the best configuration
# on the full training data by default, so a second .fit() call would only
# repeat the whole search. random_state is pinned so the tuned model is
# reproducible.
random_forest = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid={'n_estimators': [100, 500]},
    cv=5,
).fit(train_fe, y_train)

# NOTE: this is accuracy on the data the model was trained on, so it is an
# optimistic figure; random_forest.best_score_ holds the mean cross-validated
# accuracy of the selected configuration.
acc_random_forest = round(random_forest.score(train_fe, y_train) * 100, 2)
print(acc_random_forest)

98.5


In [28]:
# Append the fitted grid-search object to models.pkl, after the objects already
# stored there. Mode "ab" appends, so re-running this cell adds another copy
# to the file.

with open("titanic/models.pkl", "ab") as f:
    pickle.dump(random_forest, f)