# Titanic FeatureTools version -- only for testing data

Follow-up to: https://github.com/dongzhang84/Featuretools/blob/main/Titanic_Featuretools_automation_1.ipynb

Load the saved feature definitions and the saved model, then evaluate them on the test data.

In [1]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import pickle
%matplotlib inline

import featuretools as ft
from featuretools.primitives import *
from featuretools.variable_types import Numeric

from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.linear_model import LinearRegression, LogisticRegression, LassoCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV

from sklearn.metrics import(
    classification_report, confusion_matrix, accuracy_score, mean_squared_error, 
    mean_absolute_error, explained_variance_score
)



from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectFromModel, SelectKBest, RFE, chi2

from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression


import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this also hides any warnings raised by the libraries used in later
# cells; consider narrowing the filter to specific categories.
warnings.filterwarnings("ignore")

In [2]:
# Read the test split from disk and key the rows by PassengerId.
X_test = (
    pd.read_csv('titanic/testdf.csv')
    .set_index('PassengerId')
)
# Keep an independent copy so the raw split stays available after X_test is modified.
X_test_orig = X_test.copy()

In [3]:
# Preview the untouched copy of the test split (it still contains the Survived label).
X_test_orig 

Unnamed: 0_level_0,Survived,Pclass,Sex,Age,Fare,Embarked,Title,Deck,Family_Size
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
710,1,3,male,4.0,15.2458,C,Master,M,3
440,0,2,male,31.0,10.5000,S,Mr,M,1
841,0,3,male,20.0,7.9250,S,Mr,M,1
721,1,2,female,6.0,33.0000,S,Miss,M,2
40,1,3,female,14.0,11.2417,C,Miss,M,2
...,...,...,...,...,...,...,...,...,...
881,1,2,female,25.0,26.0000,S,Mrs,M,2
426,0,3,male,26.0,7.2500,S,Mr,M,1
102,0,3,male,26.0,7.8958,S,Mr,M,1
200,0,2,female,24.0,13.0000,S,Miss,M,1


In [4]:
# Separate the label from the predictors.
# Both are derived from the pristine X_test_orig copy rather than from X_test itself,
# so re-running this cell yields the same result instead of failing on the
# already-dropped Survived column.
y_test = X_test_orig['Survived']
X_test = X_test_orig.drop(columns=['Survived'])

In [5]:
# Preview the predictor columns that remain after removing Survived.
X_test

Unnamed: 0_level_0,Pclass,Sex,Age,Fare,Embarked,Title,Deck,Family_Size
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
710,3,male,4.0,15.2458,C,Master,M,3
440,2,male,31.0,10.5000,S,Mr,M,1
841,3,male,20.0,7.9250,S,Mr,M,1
721,2,female,6.0,33.0000,S,Miss,M,2
40,3,female,14.0,11.2417,C,Miss,M,2
...,...,...,...,...,...,...,...,...
881,2,female,25.0,26.0000,S,Mrs,M,2
426,3,male,26.0,7.2500,S,Mr,M,1
102,3,male,26.0,7.8958,S,Mr,M,1
200,2,female,24.0,13.0000,S,Miss,M,1


In [6]:
# Load the serialized Featuretools feature definitions from disk.
# Presumably these were written by the training notebook linked at the top — confirm.
saved_features = ft.load_features('titanic/feature_definitions.json')

# Build the Featuretools EntitySet

In [7]:
# Wrap the test predictors in an EntitySet containing a single entity named 'df'.
es_tst = ft.EntitySet(id = 'titanic_data')
es_tst = es_tst.entity_from_dataframe(
    entity_id = 'df',
    dataframe = X_test,
    variable_types = {
        'Embarked': ft.variable_types.Categorical,
        # NOTE(review): Sex holds 'male'/'female' strings but is typed as Boolean;
        # presumably this mirrors the training notebook so the saved feature
        # definitions still match — confirm before changing.
        'Sex': ft.variable_types.Boolean,
        'Title': ft.variable_types.Categorical,
        'Family_Size': ft.variable_types.Numeric,
    },
    # X_test has no 'Id' column (PassengerId is the DataFrame index), so ask
    # Featuretools explicitly to create an integer 'Id' index column instead of
    # relying on its implicit fallback, whose warning is silenced in this notebook.
    index = 'Id',
    make_index = True,
)

In [8]:
# Show the EntitySet summary (entities and relationships).
es_tst

Entityset: titanic_data
  Entities:
    df [Rows: 223, Columns: 9]
  Relationships:
    No relationships

In [9]:
# Split each of these columns of 'df' out into its own entity. The new entity is
# named after the column and indexed by it; the order below is kept as-is.
for key_column in ('Pclass', 'Sex', 'Embarked', 'Title', 'Deck'):
    es_tst = es_tst.normalize_entity(
        base_entity_id='df',
        new_entity_id=key_column,
        index=key_column,
    )
es_tst

Entityset: titanic_data
  Entities:
    df [Rows: 223, Columns: 9]
    Pclass [Rows: 3, Columns: 1]
    Sex [Rows: 2, Columns: 1]
    Embarked [Rows: 3, Columns: 1]
    Title [Rows: 10, Columns: 1]
    Deck [Rows: 8, Columns: 1]
  Relationships:
    df.Pclass -> Pclass.Pclass
    df.Sex -> Sex.Sex
    df.Embarked -> Embarked.Embarked
    df.Title -> Title.Title
    df.Deck -> Deck.Deck

# Compute the saved features on the test data

In [10]:
# Evaluate the loaded feature definitions against the test EntitySet.
feature_matrix_tst = ft.calculate_feature_matrix(features=saved_features, entityset=es_tst)

In [11]:
# Rebind X_test to a copy of the computed feature matrix: from here on X_test
# holds the engineered features rather than the raw predictor columns.
X_test = feature_matrix_tst.copy()
X_test.head()

Unnamed: 0_level_0,Pclass = 3,Pclass = 1,Pclass = 2,Age,Fare,Deck = M,Deck = C,Deck = B,Deck = E,Deck = D,...,Deck.NUM_UNIQUE(df.Title),Deck.SKEW(df.Age),Deck.SKEW(df.Family_Size),Deck.SKEW(df.Fare),Deck.STD(df.Age),Deck.STD(df.Family_Size),Deck.STD(df.Fare),Deck.SUM(df.Age),Deck.SUM(df.Family_Size),Deck.SUM(df.Fare)
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,True,False,False,4.0,15.2458,True,False,False,False,False,...,8,0.771881,2.079961,2.87027,12.505801,1.460035,15.756563,4590.83,307,2913.8164
1,False,False,True,31.0,10.5,True,False,False,False,False,...,8,0.771881,2.079961,2.87027,12.505801,1.460035,15.756563,4590.83,307,2913.8164
2,True,False,False,20.0,7.925,True,False,False,False,False,...,8,0.771881,2.079961,2.87027,12.505801,1.460035,15.756563,4590.83,307,2913.8164
3,False,False,True,6.0,33.0,True,False,False,False,False,...,8,0.771881,2.079961,2.87027,12.505801,1.460035,15.756563,4590.83,307,2913.8164
4,True,False,False,14.0,11.2417,True,False,False,False,False,...,8,0.771881,2.079961,2.87027,12.505801,1.460035,15.756563,4590.83,307,2913.8164


In [12]:
# Re-display the original test split for reference.
X_test_orig

Unnamed: 0_level_0,Survived,Pclass,Sex,Age,Fare,Embarked,Title,Deck,Family_Size
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
710,1,3,male,4.0,15.2458,C,Master,M,3
440,0,2,male,31.0,10.5000,S,Mr,M,1
841,0,3,male,20.0,7.9250,S,Mr,M,1
721,1,2,female,6.0,33.0000,S,Miss,M,2
40,1,3,female,14.0,11.2417,C,Miss,M,2
...,...,...,...,...,...,...,...,...,...
881,1,2,female,25.0,26.0000,S,Mrs,M,2
426,0,3,male,26.0,7.2500,S,Mr,M,1
102,0,3,male,26.0,7.8958,S,Mr,M,1
200,0,2,female,24.0,13.0000,S,Miss,M,1


# Load the saved encoders, selected features, and model

In [13]:
# Read every object stored back-to-back in the models pickle file.
# NOTE(review): unpickling can execute arbitrary code — only load files from a trusted source.
pickle_list = []

with open("titanic/models.pkl", "rb") as models_file:
    try:
        # Keep unpickling until the stream is exhausted.
        while True:
            pickle_list.append(pickle.load(models_file))
    except EOFError:
        # End of file reached: every stored object has been collected.
        pass

In [14]:
# Number of objects read from the pickle file.
len(pickle_list)

3

In [15]:
# The pickle file is read positionally: first the encoders, then the selected
# feature names, then the model.
encode_list, selected_features, model = pickle_list[0], pickle_list[1], pickle_list[2]

In [16]:
# Collect the columns whose dtype is not one of the numeric dtypes listed below.
numerics = ['int8', 'int16', 'int32', 'int64', 'float16', 'float32', 'float64']
cols = X_test.columns.values.tolist()
categorical_columns = [col for col in cols if X_test[col].dtype not in numerics]
categorical_columns

['Pclass = 3',
 'Pclass = 1',
 'Pclass = 2',
 'Deck = M',
 'Deck = C',
 'Deck = B',
 'Deck = E',
 'Deck = D',
 'Deck = F',
 'Deck = A',
 'Deck = G',
 'Embarked = S',
 'Embarked = C',
 'Embarked = Q',
 'Sex = male',
 'Sex = female',
 'Title = Mr',
 'Title = Miss',
 'Title = Mrs',
 'Title = Master',
 'Title = Rev',
 'Title = Dr',
 'Title = Col',
 'Title = the Countess',
 'Title = Major',
 'Title = Lady',
 'Pclass.MODE(df.Deck) = M',
 'Pclass.MODE(df.Deck) = C',
 'Pclass.MODE(df.Embarked) = S',
 'Pclass.MODE(df.Sex) = male',
 'Pclass.MODE(df.Title) = Mr',
 'Sex.MODE(df.Deck) = M',
 'Sex.MODE(df.Embarked) = S',
 'Sex.MODE(df.Pclass) = 3',
 'Sex.MODE(df.Title) = Mr',
 'Sex.MODE(df.Title) = Miss',
 'Embarked.MODE(df.Deck) = M',
 'Embarked.MODE(df.Pclass) = 3',
 'Embarked.MODE(df.Pclass) = 1',
 'Embarked.MODE(df.Sex) = male',
 'Embarked.MODE(df.Title) = Mr',
 'Title.MODE(df.Deck) = M',
 'Title.MODE(df.Deck) = A',
 'Title.MODE(df.Deck) = B',
 'Title.MODE(df.Deck) = C',
 'Title.MODE(df.Embarked

In [17]:
# Apply the loaded encoders to the non-numeric columns.
# encode_list is consumed positionally: entry k is used for categorical_columns[k].
for position, col in enumerate(categorical_columns):
    try:
        X_test[col] = encode_list[position].transform(
            list(X_test[col].astype(str).values.reshape(-1, 1))
        )
    except Exception as exc:
        # Best effort: a column that cannot be encoded is left unchanged, but the
        # cause is reported instead of being hidden. Catching Exception (rather
        # than a bare except) lets KeyboardInterrupt/SystemExit propagate.
        print(col, "An exception occurred:", repr(exc))

In [18]:
# Inspect the feature matrix after the encoding step.
X_test

Unnamed: 0_level_0,Pclass = 3,Pclass = 1,Pclass = 2,Age,Fare,Deck = M,Deck = C,Deck = B,Deck = E,Deck = D,...,Deck.NUM_UNIQUE(df.Title),Deck.SKEW(df.Age),Deck.SKEW(df.Family_Size),Deck.SKEW(df.Fare),Deck.STD(df.Age),Deck.STD(df.Family_Size),Deck.STD(df.Fare),Deck.SUM(df.Age),Deck.SUM(df.Family_Size),Deck.SUM(df.Fare)
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1.0,0.0,0.0,4.0,15.2458,1.0,0.0,0.0,0.0,0.0,...,8,0.771881,2.079961,2.87027,12.505801,1.460035,15.756563,4590.83,307,2913.8164
1,0.0,0.0,1.0,31.0,10.5000,1.0,0.0,0.0,0.0,0.0,...,8,0.771881,2.079961,2.87027,12.505801,1.460035,15.756563,4590.83,307,2913.8164
2,1.0,0.0,0.0,20.0,7.9250,1.0,0.0,0.0,0.0,0.0,...,8,0.771881,2.079961,2.87027,12.505801,1.460035,15.756563,4590.83,307,2913.8164
3,0.0,0.0,1.0,6.0,33.0000,1.0,0.0,0.0,0.0,0.0,...,8,0.771881,2.079961,2.87027,12.505801,1.460035,15.756563,4590.83,307,2913.8164
4,1.0,0.0,0.0,14.0,11.2417,1.0,0.0,0.0,0.0,0.0,...,8,0.771881,2.079961,2.87027,12.505801,1.460035,15.756563,4590.83,307,2913.8164
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
218,0.0,0.0,1.0,25.0,26.0000,1.0,0.0,0.0,0.0,0.0,...,8,0.771881,2.079961,2.87027,12.505801,1.460035,15.756563,4590.83,307,2913.8164
219,1.0,0.0,0.0,26.0,7.2500,1.0,0.0,0.0,0.0,0.0,...,8,0.771881,2.079961,2.87027,12.505801,1.460035,15.756563,4590.83,307,2913.8164
220,1.0,0.0,0.0,26.0,7.8958,1.0,0.0,0.0,0.0,0.0,...,8,0.771881,2.079961,2.87027,12.505801,1.460035,15.756563,4590.83,307,2913.8164
221,0.0,0.0,1.0,24.0,13.0000,1.0,0.0,0.0,0.0,0.0,...,8,0.771881,2.079961,2.87027,12.505801,1.460035,15.756563,4590.83,307,2913.8164


In [19]:
# Keep only the selected features and replace missing values with 0.
# fillna is chained rather than applied in place, so the result is a fresh frame
# and no in-place write happens on an object derived from X_test.
test_fe = X_test[selected_features].fillna(0)

In [20]:
# Predict with the loaded model on the selected features and cast the result to int.
Y_pred = model.predict(test_fe).astype(int)
Y_pred

array([1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1,
       1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0,
       1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1,
       0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1,
       0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1,
       1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1,
       0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1,
       0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0,
       0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0,
       1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0,
       0, 1, 0])

In [21]:
# Confusion matrix of true labels (y_test) against predictions (Y_pred);
# in scikit-learn's layout rows are true classes and columns are predicted classes.
print(f"Confusion Matrix: \n {confusion_matrix(y_test, Y_pred)}")

Confusion Matrix: 
 [[99 35]
 [ 7 82]]


In [22]:
# Per-class precision, recall, F1 and support for the test predictions.
print(classification_report(y_test, Y_pred))

              precision    recall  f1-score   support

           0       0.93      0.74      0.83       134
           1       0.70      0.92      0.80        89

    accuracy                           0.81       223
   macro avg       0.82      0.83      0.81       223
weighted avg       0.84      0.81      0.81       223

