# Titanic FeatureTools automation

Same as Titanic I: https://github.com/dongzhang84/Featuretools/blob/main/Titanic_Featuretools.ipynb

This notebook is for automation consideration. 

- Feature engineering is only based on training dataset. 
- Categorical encoder is only based on training dataset. 
- Testing data loaded to do feature engineering
- Feature selection
- Modeling

In [1]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import pickle
%matplotlib inline

import featuretools as ft
from featuretools.primitives import *
from featuretools.variable_types import Numeric

from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.linear_model import LinearRegression, LogisticRegression, LassoCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV

from sklearn.metrics import(
    classification_report, confusion_matrix, accuracy_score, mean_squared_error, 
    mean_absolute_error, explained_variance_score
)



from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectFromModel, SelectKBest, RFE, chi2

from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression

# model tuning
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe, space_eval

import warnings
warnings.filterwarnings("ignore")

In [2]:
traindf = pd.read_csv('titanic/train.csv').set_index('PassengerId')

In [3]:
traindf

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...
887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


## Preparing to modeling with manual FE

In [4]:
#Thanks to:
# https://www.kaggle.com/mauricef/titanic
# https://www.kaggle.com/vbmokin/titanic-top-3-one-line-of-the-prediction-code
#
df = traindf
df['Title'] = df.Name.str.split(',').str[1].str.split('.').str[0].str.strip()
df['Title'] = df.Name.str.split(',').str[1].str.split('.').str[0].str.strip()
df['IsWomanOrBoy'] = ((df.Title == 'Master') | (df.Sex == 'female'))
df['LastName'] = df.Name.str.split(',').str[0]
family = df.groupby(df.LastName).Survived
df['WomanOrBoyCount'] = family.transform(lambda s: s[df.IsWomanOrBoy].fillna(0).count())
#df['WomanOrBoyCount'] = df.mask(df.IsWomanOrBoy, df.WomanOrBoyCount - 1, axis=0)
df['FamilySurvivedCount'] = family.transform(lambda s: s[df.IsWomanOrBoy].fillna(0).sum())
#df['FamilySurvivedCount'] = df.mask(df.IsWomanOrBoy, df.FamilySurvivedCount - \
                                    #df.Survived.fillna(0), axis=0)
df['WomanOrBoySurvived'] = df.FamilySurvivedCount / df.WomanOrBoyCount.replace(0, np.nan)
df.WomanOrBoyCount = df.WomanOrBoyCount.replace(np.nan, 0)
df['Alone'] = (df.WomanOrBoyCount == 0)

#Thanks to https://www.kaggle.com/kpacocha/top-6-titanic-machine-learning-from-disaster
#"Title" improvement
df['Title'] = df['Title'].replace('Ms','Miss')
df['Title'] = df['Title'].replace('Mlle','Miss')
df['Title'] = df['Title'].replace('Mme','Mrs')
# Embarked
df['Embarked'] = df['Embarked'].fillna('S')
# Cabin, Deck
df['Deck'] = df['Cabin'].apply(lambda s: s[0] if pd.notnull(s) else 'M')
df.loc[(df['Deck'] == 'T'), 'Deck'] = 'A'

# Thanks to https://www.kaggle.com/erinsweet/simpledetect
# Fare
med_fare = df.groupby(['Pclass', 'Parch', 'SibSp']).Fare.median()[3][0][0]
df['Fare'] = df['Fare'].fillna(med_fare)
#Age
df['Age'] = df.groupby(['Sex', 'Pclass', 'Title'])['Age'].apply(lambda x: x.fillna(x.median()))
# Family_Size
df['Family_Size'] = df['SibSp'] + df['Parch'] + 1

# Thanks to https://www.kaggle.com/vbmokin/titanic-top-3-cluster-analysis
cols_to_drop = ['Name','Ticket','Cabin']
df = df.drop(cols_to_drop, axis=1)

df.WomanOrBoySurvived = df.WomanOrBoySurvived.fillna(0)
df.WomanOrBoyCount = df.WomanOrBoyCount.fillna(0)
df.FamilySurvivedCount = df.FamilySurvivedCount.fillna(0)
df.Alone = df.Alone.fillna(0)
df.Alone = df.Alone*1

df_optimum = pd.concat([df.WomanOrBoySurvived.fillna(0), df.Alone, df.Sex.replace({'male': 0, 'female': 1})], axis=1)

target = df.Survived.loc[traindf.index]
df = df.drop(['SibSp','Parch','IsWomanOrBoy','WomanOrBoyCount','FamilySurvivedCount','LastName','WomanOrBoySurvived','Alone'], axis=1)
df['PassengerId'] = df.index
df.head()

Unnamed: 0_level_0,Survived,Pclass,Sex,Age,Fare,Embarked,Title,Deck,Family_Size,PassengerId
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,0,3,male,22.0,7.25,S,Mr,M,2,1
2,1,1,female,38.0,71.2833,C,Mrs,C,2,2
3,1,3,female,26.0,7.925,S,Miss,M,1,3
4,1,1,female,35.0,53.1,S,Mrs,C,2,4
5,0,3,male,35.0,8.05,S,Mr,M,1,5


In [5]:
X_train, X_test, y_train, y_test = train_test_split(df, df['Survived'], random_state=42)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

# reset indices

X_train.reset_index(drop=True, inplace=True)
X_test.reset_index(drop=True, inplace=True)

X_train_orig =X_train.copy()
X_test_orig = X_test.copy()

X_train.to_csv('titanic/traindf.csv',index=False)
X_test.to_csv('titanic/testdf.csv', index=False)


X_train.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,Embarked,Title,Deck,Family_Size,PassengerId
0,1,1,male,40.0,30.5,S,Mr,C,1,299
1,0,3,male,25.0,7.05,S,Mr,M,1,885
2,1,2,female,24.0,14.5,S,Mrs,M,3,248
3,0,3,male,22.0,7.5208,S,Mr,M,1,479
4,1,1,male,0.92,151.55,S,Master,C,4,306


In [6]:
X_train.shape, X_test.shape

((668, 10), (223, 10))

In [7]:
X_train.isnull().sum()

Survived       0
Pclass         0
Sex            0
Age            0
Fare           0
Embarked       0
Title          0
Deck           0
Family_Size    0
PassengerId    0
dtype: int64

In [8]:
X_test.isnull().sum()

Survived       0
Pclass         0
Sex            0
Age            0
Fare           0
Embarked       0
Title          0
Deck           0
Family_Size    0
PassengerId    0
dtype: int64

# Automatic FE with Featuretools

In [9]:
es = ft.EntitySet(id = 'titanic_data')
es = es.entity_from_dataframe(entity_id = 'df', dataframe = X_train, 
                              variable_types = 
                              {
                                  'Embarked': ft.variable_types.Categorical,
                                  'Sex': ft.variable_types.Boolean,
                                  'Title': ft.variable_types.Categorical,
                                  'Family_Size': ft.variable_types.Numeric,
                              },
                              index = 'Id')

In [10]:
es['df']

Entity: df
  Variables:
    Id (dtype: index)
    Survived (dtype: numeric)
    Pclass (dtype: numeric)
    Age (dtype: numeric)
    Fare (dtype: numeric)
    Deck (dtype: categorical)
    PassengerId (dtype: numeric)
    Embarked (dtype: categorical)
    Sex (dtype: boolean)
    Title (dtype: categorical)
    Family_Size (dtype: numeric)
  Shape:
    (Rows: 668, Columns: 11)

In [11]:
es = es.normalize_entity(base_entity_id='df', new_entity_id='Pclass', index='Pclass')
es = es.normalize_entity(base_entity_id='df', new_entity_id='Sex', index='Sex')
es = es.normalize_entity(base_entity_id='df', new_entity_id='Embarked', index='Embarked')
es = es.normalize_entity(base_entity_id='df', new_entity_id='Title', index='Title')
es = es.normalize_entity(base_entity_id='df', new_entity_id='Deck', index='Deck')
es

Entityset: titanic_data
  Entities:
    df [Rows: 668, Columns: 11]
    Pclass [Rows: 3, Columns: 1]
    Sex [Rows: 2, Columns: 1]
    Embarked [Rows: 3, Columns: 1]
    Title [Rows: 11, Columns: 1]
    Deck [Rows: 8, Columns: 1]
  Relationships:
    df.Pclass -> Pclass.Pclass
    df.Sex -> Sex.Sex
    df.Embarked -> Embarked.Embarked
    df.Title -> Title.Title
    df.Deck -> Deck.Deck

In [12]:
es['Pclass'].df

Unnamed: 0,Pclass
1,1
3,3
2,2


In [13]:
es['Sex'].df

Unnamed: 0,Sex
male,male
female,female


In [14]:
primitives = ft.list_primitives()
pd.options.display.max_colwidth = 500
primitives[primitives['type'] == 'aggregation'].head(primitives[primitives['type'] == 'aggregation'].shape[0])

Unnamed: 0,name,type,dask_compatible,koalas_compatible,description,valid_inputs,return_type
0,skew,aggregation,False,False,Computes the extent to which a distribution differs from a normal distribution.,Numeric,Numeric
1,n_most_common,aggregation,False,False,Determines the `n` most common elements.,Discrete,Discrete
2,count,aggregation,True,True,"Determines the total number of values, excluding `NaN`.",Index,Numeric
3,num_unique,aggregation,True,True,"Determines the number of distinct values, ignoring `NaN` values.",Discrete,Numeric
4,time_since_first,aggregation,False,False,Calculates the time elapsed since the first datetime (in seconds).,DatetimeTimeIndex,Numeric
5,all,aggregation,True,False,Calculates if all values are 'True' in a list.,Boolean,Boolean
6,min,aggregation,True,True,"Calculates the smallest value, ignoring `NaN` values.",Numeric,Numeric
7,last,aggregation,False,False,Determines the last value in a list.,Variable,
8,mean,aggregation,True,True,Computes the average for a list of values.,Numeric,Numeric
9,percent_true,aggregation,True,False,Determines the percent of `True` values.,Boolean,Numeric


In [15]:
#features, feature_names = ft.dfs(entityset = es, target_entity = 'df', max_depth = 2)

feature_matrix, feature_names = ft.dfs(entityset=es, 
                                       target_entity = 'df',
                                       max_depth = 2, 
                                       ignore_variables={'df':['Survived','PassengerId']})

len(feature_names)

143

In [16]:
feature_names

[<Feature: Pclass>,
 <Feature: Age>,
 <Feature: Fare>,
 <Feature: Deck>,
 <Feature: Embarked>,
 <Feature: Sex>,
 <Feature: Title>,
 <Feature: Family_Size>,
 <Feature: Pclass.COUNT(df)>,
 <Feature: Pclass.MAX(df.Age)>,
 <Feature: Pclass.MAX(df.Family_Size)>,
 <Feature: Pclass.MAX(df.Fare)>,
 <Feature: Pclass.MEAN(df.Age)>,
 <Feature: Pclass.MEAN(df.Family_Size)>,
 <Feature: Pclass.MEAN(df.Fare)>,
 <Feature: Pclass.MIN(df.Age)>,
 <Feature: Pclass.MIN(df.Family_Size)>,
 <Feature: Pclass.MIN(df.Fare)>,
 <Feature: Pclass.MODE(df.Deck)>,
 <Feature: Pclass.MODE(df.Embarked)>,
 <Feature: Pclass.MODE(df.Sex)>,
 <Feature: Pclass.MODE(df.Title)>,
 <Feature: Pclass.NUM_UNIQUE(df.Deck)>,
 <Feature: Pclass.NUM_UNIQUE(df.Embarked)>,
 <Feature: Pclass.NUM_UNIQUE(df.Sex)>,
 <Feature: Pclass.NUM_UNIQUE(df.Title)>,
 <Feature: Pclass.SKEW(df.Age)>,
 <Feature: Pclass.SKEW(df.Family_Size)>,
 <Feature: Pclass.SKEW(df.Fare)>,
 <Feature: Pclass.STD(df.Age)>,
 <Feature: Pclass.STD(df.Family_Size)>,
 <Feature: P

In [17]:
feature_matrix

Unnamed: 0_level_0,Pclass,Age,Fare,Deck,Embarked,Sex,Title,Family_Size,Pclass.COUNT(df),Pclass.MAX(df.Age),...,Deck.NUM_UNIQUE(df.Title),Deck.SKEW(df.Age),Deck.SKEW(df.Family_Size),Deck.SKEW(df.Fare),Deck.STD(df.Age),Deck.STD(df.Family_Size),Deck.STD(df.Fare),Deck.SUM(df.Age),Deck.SUM(df.Family_Size),Deck.SUM(df.Fare)
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1,40.00,30.5000,C,S,male,Mr,1,152,80.0,...,6,-0.246234,1.479506,1.012559,14.795076,1.451969,69.726912,1663.92,104,4675.9207
1,3,25.00,7.0500,M,S,male,Mr,1,375,74.0,...,7,0.457219,2.757381,9.112063,12.214704,1.822822,31.726899,14027.92,1007,10247.2661
2,2,24.00,14.5000,M,S,female,Mrs,3,141,70.0,...,7,0.457219,2.757381,9.112063,12.214704,1.822822,31.726899,14027.92,1007,10247.2661
3,3,22.00,7.5208,M,S,male,Mr,1,375,74.0,...,7,0.457219,2.757381,9.112063,12.214704,1.822822,31.726899,14027.92,1007,10247.2661
4,1,0.92,151.5500,C,S,male,Master,4,152,80.0,...,6,-0.246234,1.479506,1.012559,14.795076,1.451969,69.726912,1663.92,104,4675.9207
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
663,3,21.00,7.6500,M,S,female,Miss,1,375,74.0,...,7,0.457219,2.757381,9.112063,12.214704,1.822822,31.726899,14027.92,1007,10247.2661
664,1,40.00,31.0000,M,S,male,Mr,1,152,80.0,...,7,0.457219,2.757381,9.112063,12.214704,1.822822,31.726899,14027.92,1007,10247.2661
665,3,41.00,14.1083,M,S,male,Mr,3,375,74.0,...,7,0.457219,2.757381,9.112063,12.214704,1.822822,31.726899,14027.92,1007,10247.2661
666,1,14.00,120.0000,B,S,female,Miss,4,152,80.0,...,6,0.565411,1.611508,2.458647,14.737029,1.059714,119.371526,1230.00,58,3732.0251


In [18]:
X_train=feature_matrix.copy() # save the generated features back into our X_Train
feature_matrix.head().T

Id,0,1,2,3,4
Pclass,1,3,2,3,1
Age,40.0,25.0,24.0,22.0,0.92
Fare,30.5,7.05,14.5,7.5208,151.55
Deck,C,M,M,M,C
Embarked,S,S,S,S,S
...,...,...,...,...,...
Deck.STD(df.Family_Size),1.451969,1.822822,1.822822,1.822822,1.451969
Deck.STD(df.Fare),69.726912,31.726899,31.726899,31.726899,69.726912
Deck.SUM(df.Age),1663.92,14027.92,14027.92,14027.92,1663.92
Deck.SUM(df.Family_Size),104,1007,1007,1007,104


In [19]:
feature_matrix_enc, features_enc = ft.encode_features(feature_matrix, feature_names, include_unknown=False)

In [20]:
len(features_enc)

179

In [21]:
X_train = feature_matrix_enc.copy()
X_train.head()

Unnamed: 0_level_0,Pclass = 3,Pclass = 1,Pclass = 2,Age,Fare,Deck = M,Deck = C,Deck = B,Deck = E,Deck = D,...,Deck.NUM_UNIQUE(df.Title),Deck.SKEW(df.Age),Deck.SKEW(df.Family_Size),Deck.SKEW(df.Fare),Deck.STD(df.Age),Deck.STD(df.Family_Size),Deck.STD(df.Fare),Deck.SUM(df.Age),Deck.SUM(df.Family_Size),Deck.SUM(df.Fare)
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,False,True,False,40.0,30.5,False,True,False,False,False,...,6,-0.246234,1.479506,1.012559,14.795076,1.451969,69.726912,1663.92,104,4675.9207
1,True,False,False,25.0,7.05,True,False,False,False,False,...,7,0.457219,2.757381,9.112063,12.214704,1.822822,31.726899,14027.92,1007,10247.2661
2,False,False,True,24.0,14.5,True,False,False,False,False,...,7,0.457219,2.757381,9.112063,12.214704,1.822822,31.726899,14027.92,1007,10247.2661
3,True,False,False,22.0,7.5208,True,False,False,False,False,...,7,0.457219,2.757381,9.112063,12.214704,1.822822,31.726899,14027.92,1007,10247.2661
4,False,True,False,0.92,151.55,False,True,False,False,False,...,6,-0.246234,1.479506,1.012559,14.795076,1.451969,69.726912,1663.92,104,4675.9207


In [22]:
X_train_orig.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,Embarked,Title,Deck,Family_Size,PassengerId
0,1,1,male,40.0,30.5,S,Mr,C,1,299
1,0,3,male,25.0,7.05,S,Mr,M,1,885
2,1,2,female,24.0,14.5,S,Mrs,M,3,248
3,0,3,male,22.0,7.5208,S,Mr,M,1,479
4,1,1,male,0.92,151.55,S,Master,C,4,306


In [23]:
# save feature engineering

ft.save_features(features_enc, "titanic/feature_definitions.json")

In [24]:
saved_features = ft.load_features('titanic/feature_definitions.json')

In [25]:
es_tst = ft.EntitySet(id = 'titanic_data')
es_tst = es_tst.entity_from_dataframe(entity_id = 'df', dataframe = X_test_orig, 
                              variable_types = 
                              {
                                  'Embarked': ft.variable_types.Categorical,
                                  'Sex': ft.variable_types.Boolean,
                                  'Title': ft.variable_types.Categorical,
                                  'Family_Size': ft.variable_types.Numeric,
                              },
                              index = 'Id')

In [26]:
es_tst

Entityset: titanic_data
  Entities:
    df [Rows: 223, Columns: 11]
  Relationships:
    No relationships

In [27]:
es_tst = es_tst.normalize_entity(base_entity_id='df', new_entity_id='Pclass', index='Pclass')
es_tst = es_tst.normalize_entity(base_entity_id='df', new_entity_id='Sex', index='Sex')
es_tst = es_tst.normalize_entity(base_entity_id='df', new_entity_id='Embarked', index='Embarked')
es_tst = es_tst.normalize_entity(base_entity_id='df', new_entity_id='Title', index='Title')
es_tst = es_tst.normalize_entity(base_entity_id='df', new_entity_id='Deck', index='Deck')
es_tst

Entityset: titanic_data
  Entities:
    df [Rows: 223, Columns: 11]
    Pclass [Rows: 3, Columns: 1]
    Sex [Rows: 2, Columns: 1]
    Embarked [Rows: 3, Columns: 1]
    Title [Rows: 10, Columns: 1]
    Deck [Rows: 8, Columns: 1]
  Relationships:
    df.Pclass -> Pclass.Pclass
    df.Sex -> Sex.Sex
    df.Embarked -> Embarked.Embarked
    df.Title -> Title.Title
    df.Deck -> Deck.Deck

In [28]:
feature_matrix_tst = ft.calculate_feature_matrix(features=saved_features, entityset=es_tst)

In [29]:
X_test = feature_matrix_tst.copy()
X_test.head()

Unnamed: 0_level_0,Pclass = 3,Pclass = 1,Pclass = 2,Age,Fare,Deck = M,Deck = C,Deck = B,Deck = E,Deck = D,...,Deck.NUM_UNIQUE(df.Title),Deck.SKEW(df.Age),Deck.SKEW(df.Family_Size),Deck.SKEW(df.Fare),Deck.STD(df.Age),Deck.STD(df.Family_Size),Deck.STD(df.Fare),Deck.SUM(df.Age),Deck.SUM(df.Family_Size),Deck.SUM(df.Fare)
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,True,False,False,4.0,15.2458,True,False,False,False,False,...,8,0.771881,2.079961,2.87027,12.505801,1.460035,15.756563,4590.83,307,2913.8164
1,False,False,True,31.0,10.5,True,False,False,False,False,...,8,0.771881,2.079961,2.87027,12.505801,1.460035,15.756563,4590.83,307,2913.8164
2,True,False,False,20.0,7.925,True,False,False,False,False,...,8,0.771881,2.079961,2.87027,12.505801,1.460035,15.756563,4590.83,307,2913.8164
3,False,False,True,6.0,33.0,True,False,False,False,False,...,8,0.771881,2.079961,2.87027,12.505801,1.460035,15.756563,4590.83,307,2913.8164
4,True,False,False,14.0,11.2417,True,False,False,False,False,...,8,0.771881,2.079961,2.87027,12.505801,1.460035,15.756563,4590.83,307,2913.8164


In [30]:
X_test_orig

Unnamed: 0,Id,Survived,Pclass,Sex,Age,Fare,Embarked,Title,Deck,Family_Size,PassengerId
0,0,1,3,male,4.0,15.2458,C,Master,M,3,710
1,1,0,2,male,31.0,10.5000,S,Mr,M,1,440
2,2,0,3,male,20.0,7.9250,S,Mr,M,1,841
3,3,1,2,female,6.0,33.0000,S,Miss,M,2,721
4,4,1,3,female,14.0,11.2417,C,Miss,M,2,40
...,...,...,...,...,...,...,...,...,...,...,...
218,218,1,2,female,25.0,26.0000,S,Mrs,M,2,881
219,219,0,3,male,26.0,7.2500,S,Mr,M,1,426
220,220,0,3,male,26.0,7.8958,S,Mr,M,1,102
221,221,0,2,female,24.0,13.0000,S,Miss,M,1,200


## Encoding categorical features

In [31]:
# Determination categorical features
numerics = ['int8', 'int16', 'int32', 'int64', 'float16', 'float32', 'float64']
categorical_columns = []
cols = X_train.columns.values.tolist()
for col in cols:
    if X_train[col].dtype in numerics: continue
    categorical_columns.append(col)
categorical_columns

['Pclass = 3',
 'Pclass = 1',
 'Pclass = 2',
 'Deck = M',
 'Deck = C',
 'Deck = B',
 'Deck = E',
 'Deck = D',
 'Deck = F',
 'Deck = A',
 'Deck = G',
 'Embarked = S',
 'Embarked = C',
 'Embarked = Q',
 'Sex = male',
 'Sex = female',
 'Title = Mr',
 'Title = Miss',
 'Title = Mrs',
 'Title = Master',
 'Title = Rev',
 'Title = Dr',
 'Title = Col',
 'Title = the Countess',
 'Title = Major',
 'Title = Lady',
 'Pclass.MODE(df.Deck) = M',
 'Pclass.MODE(df.Deck) = C',
 'Pclass.MODE(df.Embarked) = S',
 'Pclass.MODE(df.Sex) = male',
 'Pclass.MODE(df.Title) = Mr',
 'Sex.MODE(df.Deck) = M',
 'Sex.MODE(df.Embarked) = S',
 'Sex.MODE(df.Pclass) = 3',
 'Sex.MODE(df.Title) = Mr',
 'Sex.MODE(df.Title) = Miss',
 'Embarked.MODE(df.Deck) = M',
 'Embarked.MODE(df.Pclass) = 3',
 'Embarked.MODE(df.Pclass) = 1',
 'Embarked.MODE(df.Sex) = male',
 'Embarked.MODE(df.Title) = Mr',
 'Title.MODE(df.Deck) = M',
 'Title.MODE(df.Deck) = A',
 'Title.MODE(df.Deck) = B',
 'Title.MODE(df.Deck) = C',
 'Title.MODE(df.Embarked

In [32]:
X_train

Unnamed: 0_level_0,Pclass = 3,Pclass = 1,Pclass = 2,Age,Fare,Deck = M,Deck = C,Deck = B,Deck = E,Deck = D,...,Deck.NUM_UNIQUE(df.Title),Deck.SKEW(df.Age),Deck.SKEW(df.Family_Size),Deck.SKEW(df.Fare),Deck.STD(df.Age),Deck.STD(df.Family_Size),Deck.STD(df.Fare),Deck.SUM(df.Age),Deck.SUM(df.Family_Size),Deck.SUM(df.Fare)
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,False,True,False,40.00,30.5000,False,True,False,False,False,...,6,-0.246234,1.479506,1.012559,14.795076,1.451969,69.726912,1663.92,104,4675.9207
1,True,False,False,25.00,7.0500,True,False,False,False,False,...,7,0.457219,2.757381,9.112063,12.214704,1.822822,31.726899,14027.92,1007,10247.2661
2,False,False,True,24.00,14.5000,True,False,False,False,False,...,7,0.457219,2.757381,9.112063,12.214704,1.822822,31.726899,14027.92,1007,10247.2661
3,True,False,False,22.00,7.5208,True,False,False,False,False,...,7,0.457219,2.757381,9.112063,12.214704,1.822822,31.726899,14027.92,1007,10247.2661
4,False,True,False,0.92,151.5500,False,True,False,False,False,...,6,-0.246234,1.479506,1.012559,14.795076,1.451969,69.726912,1663.92,104,4675.9207
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
663,True,False,False,21.00,7.6500,True,False,False,False,False,...,7,0.457219,2.757381,9.112063,12.214704,1.822822,31.726899,14027.92,1007,10247.2661
664,False,True,False,40.00,31.0000,True,False,False,False,False,...,7,0.457219,2.757381,9.112063,12.214704,1.822822,31.726899,14027.92,1007,10247.2661
665,True,False,False,41.00,14.1083,True,False,False,False,False,...,7,0.457219,2.757381,9.112063,12.214704,1.822822,31.726899,14027.92,1007,10247.2661
666,False,True,False,14.00,120.0000,False,False,True,False,False,...,6,0.565411,1.611508,2.458647,14.737029,1.059714,119.371526,1230.00,58,3732.0251


In [33]:
from sklearn.preprocessing import OrdinalEncoder

encode_list = []

for col in categorical_columns:
    if col in X_train.columns:
        le = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
        le.fit(list(X_train[col].astype(str).values.reshape(-1, 1)))
        encode_list.append(le)
        X_train[col] = le.transform(list(X_train[col].astype(str).values.reshape(-1, 1)))
        #X_test[col] = le.transform(list(X_test[col].astype(str).values.reshape(-1, 1)))
        
# save it

with open("titanic/models.pkl", "ab") as f:
    pickle.dump(encode_list, f)

In [34]:
len(encode_list)

61

In [35]:
X_train

Unnamed: 0_level_0,Pclass = 3,Pclass = 1,Pclass = 2,Age,Fare,Deck = M,Deck = C,Deck = B,Deck = E,Deck = D,...,Deck.NUM_UNIQUE(df.Title),Deck.SKEW(df.Age),Deck.SKEW(df.Family_Size),Deck.SKEW(df.Fare),Deck.STD(df.Age),Deck.STD(df.Family_Size),Deck.STD(df.Fare),Deck.SUM(df.Age),Deck.SUM(df.Family_Size),Deck.SUM(df.Fare)
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.0,1.0,0.0,40.00,30.5000,0.0,1.0,0.0,0.0,0.0,...,6,-0.246234,1.479506,1.012559,14.795076,1.451969,69.726912,1663.92,104,4675.9207
1,1.0,0.0,0.0,25.00,7.0500,1.0,0.0,0.0,0.0,0.0,...,7,0.457219,2.757381,9.112063,12.214704,1.822822,31.726899,14027.92,1007,10247.2661
2,0.0,0.0,1.0,24.00,14.5000,1.0,0.0,0.0,0.0,0.0,...,7,0.457219,2.757381,9.112063,12.214704,1.822822,31.726899,14027.92,1007,10247.2661
3,1.0,0.0,0.0,22.00,7.5208,1.0,0.0,0.0,0.0,0.0,...,7,0.457219,2.757381,9.112063,12.214704,1.822822,31.726899,14027.92,1007,10247.2661
4,0.0,1.0,0.0,0.92,151.5500,0.0,1.0,0.0,0.0,0.0,...,6,-0.246234,1.479506,1.012559,14.795076,1.451969,69.726912,1663.92,104,4675.9207
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
663,1.0,0.0,0.0,21.00,7.6500,1.0,0.0,0.0,0.0,0.0,...,7,0.457219,2.757381,9.112063,12.214704,1.822822,31.726899,14027.92,1007,10247.2661
664,0.0,1.0,0.0,40.00,31.0000,1.0,0.0,0.0,0.0,0.0,...,7,0.457219,2.757381,9.112063,12.214704,1.822822,31.726899,14027.92,1007,10247.2661
665,1.0,0.0,0.0,41.00,14.1083,1.0,0.0,0.0,0.0,0.0,...,7,0.457219,2.757381,9.112063,12.214704,1.822822,31.726899,14027.92,1007,10247.2661
666,0.0,1.0,0.0,14.00,120.0000,0.0,0.0,1.0,0.0,0.0,...,6,0.565411,1.611508,2.458647,14.737029,1.059714,119.371526,1230.00,58,3732.0251


In [36]:
pickle_list = []

with open("titanic/models.pkl", "rb") as f:
    while True:
        try:
            pickle_list.append(pickle.load(f))
        except EOFError:
            break
            
encode_list = pickle_list[0]

In [37]:
len(encode_list)

61

In [38]:
i = 0

for col in categorical_columns:
    
    try: 
        X_test[col] = encode_list[i].transform(list(X_test[col].astype(str).values.reshape(-1, 1)))
    except:
        print(i, col, "An exception occurred")
    i += 1

In [39]:
X_train['Deck.NUM_UNIQUE(df.Title)'].value_counts()

7    519
6     80
4     47
3     19
2      3
Name: Deck.NUM_UNIQUE(df.Title), dtype: int64

In [40]:
#X_test_orig

In [41]:
#X_train.isnull().sum()

In [42]:
#collinear_features

In [43]:
#FE_option1

In [44]:
# Threshold for removing correlated variables
threshold = 0.8

# Absolute value correlation matrix
corr_matrix = X_train.corr().abs()
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))

# Select columns with correlations above threshold
collinear_features = [column for column in upper.columns if any(upper[column] > threshold)]

print('There are %d features to remove.' % (len(collinear_features)))

There are 118 features to remove.


In [45]:
X_train_flt = X_train.drop(columns = collinear_features)
X_test_flt = X_test.drop(columns = collinear_features)
X_train_flt.shape, X_test_flt.shape

((668, 61), (223, 61))

# Modeling 1

In [46]:
def run_classification(trainx, testx):
    classifier =DecisionTreeClassifier(criterion='gini',max_depth=2, max_leaf_nodes=20)
    classifier.fit(trainx, y_train)
    Y_pred = classifier.predict(testx)
    print(f"Decision Tree Test score {classifier.score(testx, y_test)}")
    print(f"Decision Tree Confusion Matrix: \n {confusion_matrix(y_test, Y_pred)}")
    print(classification_report(y_test, Y_pred))
#     feature_importances = classifier.feature_importances_
#     feature_importances     

    
    lr=LogisticRegression(solver='lbfgs',max_iter=500)
    lr = lr.fit(trainx, y_train)
    print(f"LogisticRegression Test socre {lr.score(testx, y_test)}")
    Y_pred2 = lr.predict(testx)
    print(f"Logistic Regression Confusion Matix:\n {confusion_matrix(y_test, Y_pred2)}")
    print(classification_report(y_test, Y_pred))
    
    
    lr_weights = pd.DataFrame({
        'features': testx.columns,
        'importance': lr.coef_[0]
    }) 
    lr_weights.sort_values('importance', ascending=False, inplace=True)
    print(f"LogisticRegression Top Feature Weights\n {lr_weights.head()}")

In [47]:
run_classification(X_train_flt, X_test_flt)

Decision Tree Test score 0.7757847533632287
Decision Tree Confusion Matrix: 
 [[125   9]
 [ 41  48]]
              precision    recall  f1-score   support

           0       0.75      0.93      0.83       134
           1       0.84      0.54      0.66        89

    accuracy                           0.78       223
   macro avg       0.80      0.74      0.75       223
weighted avg       0.79      0.78      0.76       223

LogisticRegression Test socre 0.7623318385650224
Logistic Regression Confusion Matix:
 [[117  17]
 [ 36  53]]
              precision    recall  f1-score   support

           0       0.75      0.93      0.83       134
           1       0.84      0.54      0.66        89

    accuracy                           0.78       223
   macro avg       0.80      0.74      0.75       223
weighted avg       0.79      0.78      0.76       223

LogisticRegression Top Feature Weights
                      features  importance
55  Title.NUM_UNIQUE(df.Deck)    1.002200
8          

# Modeling 2

In [48]:
# Tuning Random Forest model for features "features_set", makes prediction and save it into file  
train_fe = X_train.drop(columns = collinear_features)
test_fe = X_test.drop(columns = collinear_features)

random_forest = GridSearchCV(estimator=RandomForestClassifier(), param_grid={'n_estimators': [100, 500]}, cv=5).fit(train_fe, y_train)
random_forest.fit(train_fe, y_train)
Y_pred = random_forest.predict(test_fe).astype(int)
random_forest.score(train_fe, y_train)
acc_random_forest = round(random_forest.score(train_fe, y_train) * 100, 2)
#pd.DataFrame({'Survived': Y_pred}, index=testdf.index).reset_index().to_csv(file, index=False)
print(acc_random_forest)
Y_pred

98.5


array([1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1,
       1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1,
       0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1,
       1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1,
       0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1,
       0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0,
       0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0,
       1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0,
       0, 1, 0])

In [49]:
print(f"Confusion Matrix: \n {confusion_matrix(y_test, Y_pred)}")

Confusion Matrix: 
 [[99 35]
 [ 9 80]]


In [50]:
print(classification_report(y_test, Y_pred))

              precision    recall  f1-score   support

           0       0.92      0.74      0.82       134
           1       0.70      0.90      0.78        89

    accuracy                           0.80       223
   macro avg       0.81      0.82      0.80       223
weighted avg       0.83      0.80      0.80       223

