# Titanic FeatureTools version 2

Follow this blog: 

https://medium.com/dataexplorations/tool-review-can-featuretools-simplify-the-process-of-feature-engineering-5d165100b0c3

Or this github:

https://github.com/ag2816/TitanicKaggle/blob/master/Titanic_FeatureTools.ipynb

In [1]:
import featuretools as ft
import featuretools.variable_types as vtypes

import pandas as pd
import numpy as np

#visualization
import seaborn as sns
import matplotlib.pyplot as plt
get_ipython().magic(u'matplotlib inline')

#import sklearn
from sklearn.model_selection import train_test_split, cross_val_score

from sklearn.metrics import(
    classification_report, confusion_matrix, accuracy_score, mean_squared_error, mean_absolute_error
)


from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.preprocessing import (
    StandardScaler, LabelBinarizer, FunctionTransformer,PolynomialFeatures, OrdinalEncoder
)

from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')

from sklearn.feature_selection import (
    VarianceThreshold, SelectKBest, SelectPercentile, 
    SelectFromModel, f_regression, RFECV
)

from sklearn.impute import SimpleImputer
from sklearn_pandas import DataFrameMapper
from sklearn.model_selection import GridSearchCV

from sklearn.base import TransformerMixin, BaseEstimator

from sklearn.pipeline import Pipeline, FeatureUnion

# Load Data

In [2]:
titanic_df = pd.read_csv('titanic/train.csv')
test_df= pd.read_csv('titanic/test.csv')

In [3]:
# split training to train and test

X_train, X_test, y_train, y_test = train_test_split(titanic_df.iloc[:,:], titanic_df['Survived'], random_state=42)
X_train.shape, X_test.shape, y_train.shape, y_test.shape
X_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
298,299,1,1,"Saalfeld, Mr. Adolphe",male,,0,0,19988,30.5,C106,S
884,885,0,3,"Sutehall, Mr. Henry Jr",male,25.0,0,0,SOTON/OQ 392076,7.05,,S
247,248,1,2,"Hamalainen, Mrs. William (Anna)",female,24.0,0,2,250649,14.5,,S
478,479,0,3,"Karlsson, Mr. Nils August",male,22.0,0,0,350060,7.5208,,S
305,306,1,1,"Allison, Master. Hudson Trevor",male,0.92,1,2,113781,151.55,C22 C26,S


In [4]:
X_train.shape, X_test.shape

((668, 12), (223, 12))

In [5]:
X_train.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            132
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          519
Embarked         2
dtype: int64

# FE for original fields

In [6]:
X_train['Age'].fillna(X_train['Age'].median(), inplace = True)
X_train['Embarked'].fillna(X_train['Embarked'].mode()[0], inplace = True)
X_train['Fare'].fillna(X_train['Fare'].median(), inplace = True)

In [7]:
X_train.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          519
Embarked         0
dtype: int64

In [8]:
X_test['Age'].fillna(X_train['Age'].median(), inplace = True)
X_test['Embarked'].fillna(X_train['Embarked'].mode()[0], inplace = True)
X_test['Fare'].fillna(X_train['Fare'].median(), inplace = True)

In [9]:
X_test.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          168
Embarked         0
dtype: int64

In [10]:
X_train['family_count'] = X_train['Parch'] + X_train['SibSp']
X_test['family_count'] = X_test['Parch'] + X_test['SibSp']

In [11]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

le.fit(X_train['Sex'])
X_train['Sex'] = le.transform(X_train['Sex'])
X_test['Sex'] = le.transform(X_test['Sex'])

le = LabelEncoder()

le.fit(X_train['Embarked'])
X_train['Embarked'] = le.transform(X_train['Embarked'])
X_test['Embarked'] = le.transform(X_test['Embarked'])

In [12]:
X_train = X_train.drop(columns=['Survived','Parch','SibSp', 'Name','Ticket','Cabin'], axis=1)
X_test = X_test.drop(columns=['Survived', 'Parch','SibSp', 'Name', 'Ticket','Cabin'], axis=1)

In [13]:
X_train_orig =X_train.copy()
X_train

Unnamed: 0,PassengerId,Pclass,Sex,Age,Fare,Embarked,family_count
298,299,1,1,28.00,30.5000,2,0
884,885,3,1,25.00,7.0500,2,0
247,248,2,0,24.00,14.5000,2,2
478,479,3,1,22.00,7.5208,2,0
305,306,1,1,0.92,151.5500,2,3
...,...,...,...,...,...,...,...
106,107,3,0,21.00,7.6500,2,0
270,271,1,1,28.00,31.0000,2,0
860,861,3,1,41.00,14.1083,2,2
435,436,1,0,14.00,120.0000,2,3


In [14]:
# save a copy of this dataframe for later reference
X_test_orig = X_test.copy()
X_test

Unnamed: 0,PassengerId,Pclass,Sex,Age,Fare,Embarked,family_count
709,710,3,1,28.0,15.2458,0,2
439,440,2,1,31.0,10.5000,2,0
840,841,3,1,20.0,7.9250,2,0
720,721,2,0,6.0,33.0000,2,1
39,40,3,0,14.0,11.2417,0,1
...,...,...,...,...,...,...,...
880,881,2,0,25.0,26.0000,2,1
425,426,3,1,28.0,7.2500,2,0
101,102,3,1,28.0,7.8958,2,0
199,200,2,0,24.0,13.0000,2,0


# Load FeatureTools

In [16]:
primitives = ft.list_primitives()
pd.options.display.max_colwidth = 100
primitives[primitives['type'] == 'aggregation'].head(primitives[primitives['type'] == 'aggregation'].shape[0])

Unnamed: 0,name,type,dask_compatible,koalas_compatible,description,valid_inputs,return_type
0,all,aggregation,True,False,Calculates if all values are 'True' in a list.,Boolean,Boolean
1,min,aggregation,True,True,"Calculates the smallest value, ignoring `NaN` values.",Numeric,Numeric
2,last,aggregation,False,False,Determines the last value in a list.,Variable,
3,mean,aggregation,True,True,Computes the average for a list of values.,Numeric,Numeric
4,percent_true,aggregation,True,False,Determines the percent of `True` values.,Boolean,Numeric
5,entropy,aggregation,False,False,Calculates the entropy for a categorical variable,Categorical,Numeric
6,std,aggregation,True,True,"Computes the dispersion relative to the mean value, ignoring `NaN`.",Numeric,Numeric
7,median,aggregation,False,False,Determines the middlemost number in a list of values.,Numeric,Numeric
8,avg_time_between,aggregation,False,False,Computes the average number of seconds between consecutive events.,DatetimeTimeIndex,Numeric
9,sum,aggregation,True,True,"Calculates the total addition, ignoring `NaN`.",Numeric,Numeric


In [17]:
primitives[primitives['type'] == 'transform'].head(primitives[primitives['type'] == 'transform'].shape[0])

Unnamed: 0,name,type,dask_compatible,koalas_compatible,description,valid_inputs,return_type
22,minute,transform,True,True,Determines the minutes value of a datetime.,Datetime,Numeric
23,scalar_subtract_numeric_feature,transform,True,True,Subtract each value in the list from a given scalar.,Numeric,Numeric
24,greater_than_equal_to_scalar,transform,True,True,Determines if values are greater than or equal to a given scalar.,"Datetime, Ordinal, Numeric",Boolean
25,add_numeric_scalar,transform,True,True,Add a scalar to each value in the list.,Numeric,Numeric
26,hour,transform,True,True,Determines the hour value of a datetime.,Datetime,Ordinal
...,...,...,...,...,...,...,...
79,diff,transform,False,False,Compute the difference between the value in a list and the,Numeric,Numeric
80,divide_by_feature,transform,True,True,Divide a scalar by each value in the list.,Numeric,Numeric
81,month,transform,True,True,Determines the month value of a datetime.,Datetime,Ordinal
82,less_than_scalar,transform,True,True,Determines if values are less than a given scalar.,"Datetime, Ordinal, Numeric",Boolean


In [18]:
# reset indices

X_train_orig.reset_index(drop=True, inplace=True)
X_test_orig.reset_index(drop=True, inplace=True)
X_train_orig.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,Fare,Embarked,family_count
0,299,1,1,28.0,30.5,2,0
1,885,3,1,25.0,7.05,2,0
2,248,2,0,24.0,14.5,2,2
3,479,3,1,22.0,7.5208,2,0
4,306,1,1,0.92,151.55,2,3


## FE by FeatureTools

In [19]:
# creating and entity set 'es'
es = ft.EntitySet(id = 'Survivors')
variable_types = { 
      'Sex': vtypes.Categorical,
      'Pclass': vtypes.Categorical,
      'Embarked': vtypes.Categorical}

es.entity_from_dataframe(entity_id = 'Passengers', dataframe = X_train_orig, index = 'Id', variable_types=variable_types)



Entityset: Survivors
  Entities:
    Passengers [Rows: 668, Columns: 8]
  Relationships:
    No relationships

In [20]:
es["Passengers"].variables

[<Variable: Id (dtype = index)>,
 <Variable: PassengerId (dtype = numeric)>,
 <Variable: Age (dtype = numeric)>,
 <Variable: Fare (dtype = numeric)>,
 <Variable: family_count (dtype = numeric)>,
 <Variable: Sex (dtype = categorical)>,
 <Variable: Pclass (dtype = categorical)>,
 <Variable: Embarked (dtype = categorical)>]

In [21]:
es = es.normalize_entity(base_entity_id='Passengers', new_entity_id='Pclass', index='Pclass')
es

Entityset: Survivors
  Entities:
    Passengers [Rows: 668, Columns: 8]
    Pclass [Rows: 3, Columns: 1]
  Relationships:
    Passengers.Pclass -> Pclass.Pclass

In [22]:
feature_matrix, feature_names = ft.dfs(entityset=es, 
    target_entity = 'Passengers', 
    max_depth = 2, 
    verbose = 3, 
    n_jobs = 1,
    ignore_variables={'Passengers':['PassengerId']})

Built 29 features
Elapsed: 00:00 | Progress: 100%|████████████████████████████████████████████████


In [23]:
feature_matrix

Unnamed: 0_level_0,Age,Fare,family_count,Sex,Pclass,Embarked,Pclass.COUNT(Passengers),Pclass.MAX(Passengers.Age),Pclass.MAX(Passengers.Fare),Pclass.MAX(Passengers.family_count),...,Pclass.NUM_UNIQUE(Passengers.Sex),Pclass.SKEW(Passengers.Age),Pclass.SKEW(Passengers.Fare),Pclass.SKEW(Passengers.family_count),Pclass.STD(Passengers.Age),Pclass.STD(Passengers.Fare),Pclass.STD(Passengers.family_count),Pclass.SUM(Passengers.Age),Pclass.SUM(Passengers.Fare),Pclass.SUM(Passengers.family_count)
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,28.00,30.5000,0,1,1,2,152,80.0,512.3292,5,...,2,0.259387,2.900362,1.960429,14.234234,84.524889,1.091313,5598.92,13352.9125,119
1,25.00,7.0500,0,1,3,2,375,74.0,69.5500,10,...,2,0.211083,2.703980,2.564656,10.574068,12.555034,2.037534,9670.92,5236.2036,386
2,24.00,14.5000,2,0,2,2,141,70.0,73.5000,5,...,2,0.133481,1.957192,1.234036,13.996094,13.119260,1.101395,4196.00,2906.7209,114
3,22.00,7.5208,0,1,3,2,375,74.0,69.5500,10,...,2,0.211083,2.703980,2.564656,10.574068,12.555034,2.037534,9670.92,5236.2036,386
4,0.92,151.5500,3,1,1,2,152,80.0,512.3292,5,...,2,0.259387,2.900362,1.960429,14.234234,84.524889,1.091313,5598.92,13352.9125,119
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
663,21.00,7.6500,0,0,3,2,375,74.0,69.5500,10,...,2,0.211083,2.703980,2.564656,10.574068,12.555034,2.037534,9670.92,5236.2036,386
664,28.00,31.0000,0,1,1,2,152,80.0,512.3292,5,...,2,0.259387,2.900362,1.960429,14.234234,84.524889,1.091313,5598.92,13352.9125,119
665,41.00,14.1083,2,1,3,2,375,74.0,69.5500,10,...,2,0.211083,2.703980,2.564656,10.574068,12.555034,2.037534,9670.92,5236.2036,386
666,14.00,120.0000,3,0,1,2,152,80.0,512.3292,5,...,2,0.259387,2.900362,1.960429,14.234234,84.524889,1.091313,5598.92,13352.9125,119


In [25]:
feature_names

[<Feature: Age>,
 <Feature: Fare>,
 <Feature: family_count>,
 <Feature: Sex>,
 <Feature: Pclass>,
 <Feature: Embarked>,
 <Feature: Pclass.COUNT(Passengers)>,
 <Feature: Pclass.MAX(Passengers.Age)>,
 <Feature: Pclass.MAX(Passengers.Fare)>,
 <Feature: Pclass.MAX(Passengers.family_count)>,
 <Feature: Pclass.MEAN(Passengers.Age)>,
 <Feature: Pclass.MEAN(Passengers.Fare)>,
 <Feature: Pclass.MEAN(Passengers.family_count)>,
 <Feature: Pclass.MIN(Passengers.Age)>,
 <Feature: Pclass.MIN(Passengers.Fare)>,
 <Feature: Pclass.MIN(Passengers.family_count)>,
 <Feature: Pclass.MODE(Passengers.Embarked)>,
 <Feature: Pclass.MODE(Passengers.Sex)>,
 <Feature: Pclass.NUM_UNIQUE(Passengers.Embarked)>,
 <Feature: Pclass.NUM_UNIQUE(Passengers.Sex)>,
 <Feature: Pclass.SKEW(Passengers.Age)>,
 <Feature: Pclass.SKEW(Passengers.Fare)>,
 <Feature: Pclass.SKEW(Passengers.family_count)>,
 <Feature: Pclass.STD(Passengers.Age)>,
 <Feature: Pclass.STD(Passengers.Fare)>,
 <Feature: Pclass.STD(Passengers.family_count)>,


In [26]:
X_train=feature_matrix.copy() # save the generated features back into our X_Train
feature_matrix.head().T

Id,0,1,2,3,4
Age,28.0,25.0,24.0,22.0,0.92
Fare,30.5,7.05,14.5,7.5208,151.55
family_count,0.0,0.0,2.0,0.0,3.0
Sex,1.0,1.0,0.0,1.0,1.0
Pclass,1.0,3.0,2.0,3.0,1.0
Embarked,2.0,2.0,2.0,2.0,2.0
Pclass.COUNT(Passengers),152.0,375.0,141.0,375.0,152.0
Pclass.MAX(Passengers.Age),80.0,74.0,70.0,74.0,80.0
Pclass.MAX(Passengers.Fare),512.3292,69.55,73.5,69.55,512.3292
Pclass.MAX(Passengers.family_count),5.0,10.0,5.0,10.0,5.0


In [27]:
feature_matrix_enc, features_enc = ft.encode_features(feature_matrix, feature_names, include_unknown=False)

In [28]:
features_enc

[<Feature: Age>,
 <Feature: Fare>,
 <Feature: family_count>,
 <Feature: Sex = 1>,
 <Feature: Sex = 0>,
 <Feature: Pclass = 3>,
 <Feature: Pclass = 1>,
 <Feature: Pclass = 2>,
 <Feature: Embarked = 2>,
 <Feature: Embarked = 0>,
 <Feature: Embarked = 1>,
 <Feature: Pclass.COUNT(Passengers)>,
 <Feature: Pclass.MAX(Passengers.Age)>,
 <Feature: Pclass.MAX(Passengers.Fare)>,
 <Feature: Pclass.MAX(Passengers.family_count)>,
 <Feature: Pclass.MEAN(Passengers.Age)>,
 <Feature: Pclass.MEAN(Passengers.Fare)>,
 <Feature: Pclass.MEAN(Passengers.family_count)>,
 <Feature: Pclass.MIN(Passengers.Age)>,
 <Feature: Pclass.MIN(Passengers.Fare)>,
 <Feature: Pclass.MIN(Passengers.family_count)>,
 <Feature: Pclass.MODE(Passengers.Embarked) = 2>,
 <Feature: Pclass.MODE(Passengers.Sex) = 1>,
 <Feature: Pclass.NUM_UNIQUE(Passengers.Embarked)>,
 <Feature: Pclass.NUM_UNIQUE(Passengers.Sex)>,
 <Feature: Pclass.SKEW(Passengers.Age)>,
 <Feature: Pclass.SKEW(Passengers.Fare)>,
 <Feature: Pclass.SKEW(Passengers.famil

In [29]:
feature_matrix_enc

Unnamed: 0_level_0,Age,Fare,family_count,Sex = 1,Sex = 0,Pclass = 3,Pclass = 1,Pclass = 2,Embarked = 2,Embarked = 0,...,Pclass.NUM_UNIQUE(Passengers.Sex),Pclass.SKEW(Passengers.Age),Pclass.SKEW(Passengers.Fare),Pclass.SKEW(Passengers.family_count),Pclass.STD(Passengers.Age),Pclass.STD(Passengers.Fare),Pclass.STD(Passengers.family_count),Pclass.SUM(Passengers.Age),Pclass.SUM(Passengers.Fare),Pclass.SUM(Passengers.family_count)
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,28.00,30.5000,0,True,False,False,True,False,True,False,...,2,0.259387,2.900362,1.960429,14.234234,84.524889,1.091313,5598.92,13352.9125,119
1,25.00,7.0500,0,True,False,True,False,False,True,False,...,2,0.211083,2.703980,2.564656,10.574068,12.555034,2.037534,9670.92,5236.2036,386
2,24.00,14.5000,2,False,True,False,False,True,True,False,...,2,0.133481,1.957192,1.234036,13.996094,13.119260,1.101395,4196.00,2906.7209,114
3,22.00,7.5208,0,True,False,True,False,False,True,False,...,2,0.211083,2.703980,2.564656,10.574068,12.555034,2.037534,9670.92,5236.2036,386
4,0.92,151.5500,3,True,False,False,True,False,True,False,...,2,0.259387,2.900362,1.960429,14.234234,84.524889,1.091313,5598.92,13352.9125,119
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
663,21.00,7.6500,0,False,True,True,False,False,True,False,...,2,0.211083,2.703980,2.564656,10.574068,12.555034,2.037534,9670.92,5236.2036,386
664,28.00,31.0000,0,True,False,False,True,False,True,False,...,2,0.259387,2.900362,1.960429,14.234234,84.524889,1.091313,5598.92,13352.9125,119
665,41.00,14.1083,2,True,False,True,False,False,True,False,...,2,0.211083,2.703980,2.564656,10.574068,12.555034,2.037534,9670.92,5236.2036,386
666,14.00,120.0000,3,False,True,False,True,False,True,False,...,2,0.259387,2.900362,1.960429,14.234234,84.524889,1.091313,5598.92,13352.9125,119


In [30]:
X_train = feature_matrix_enc.copy()
X_train.head()

Unnamed: 0_level_0,Age,Fare,family_count,Sex = 1,Sex = 0,Pclass = 3,Pclass = 1,Pclass = 2,Embarked = 2,Embarked = 0,...,Pclass.NUM_UNIQUE(Passengers.Sex),Pclass.SKEW(Passengers.Age),Pclass.SKEW(Passengers.Fare),Pclass.SKEW(Passengers.family_count),Pclass.STD(Passengers.Age),Pclass.STD(Passengers.Fare),Pclass.STD(Passengers.family_count),Pclass.SUM(Passengers.Age),Pclass.SUM(Passengers.Fare),Pclass.SUM(Passengers.family_count)
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,28.0,30.5,0,True,False,False,True,False,True,False,...,2,0.259387,2.900362,1.960429,14.234234,84.524889,1.091313,5598.92,13352.9125,119
1,25.0,7.05,0,True,False,True,False,False,True,False,...,2,0.211083,2.70398,2.564656,10.574068,12.555034,2.037534,9670.92,5236.2036,386
2,24.0,14.5,2,False,True,False,False,True,True,False,...,2,0.133481,1.957192,1.234036,13.996094,13.11926,1.101395,4196.0,2906.7209,114
3,22.0,7.5208,0,True,False,True,False,False,True,False,...,2,0.211083,2.70398,2.564656,10.574068,12.555034,2.037534,9670.92,5236.2036,386
4,0.92,151.55,3,True,False,False,True,False,True,False,...,2,0.259387,2.900362,1.960429,14.234234,84.524889,1.091313,5598.92,13352.9125,119


In [31]:
X_train_orig.head()

Unnamed: 0,Id,PassengerId,Pclass,Sex,Age,Fare,Embarked,family_count
0,0,299,1,1,28.0,30.5,2,0
1,1,885,3,1,25.0,7.05,2,0
2,2,248,2,0,24.0,14.5,2,2
3,3,479,3,1,22.0,7.5208,2,0
4,4,306,1,1,0.92,151.55,2,3


## Save FE and load Test data

In [33]:
#creating and entity set 'es'
es_tst = ft.EntitySet(id = 'Survivors')
# adding a dataframe 
es_tst.entity_from_dataframe(entity_id = 'Passengers', dataframe = X_test_orig, index = 'Id')
# add PCLass entity
es_tst = es_tst.normalize_entity(base_entity_id='Passengers', new_entity_id='Pclass', index='Pclass')
es_tst



Entityset: Survivors
  Entities:
    Passengers [Rows: 223, Columns: 8]
    Pclass [Rows: 3, Columns: 1]
  Relationships:
    Passengers.Pclass -> Pclass.Pclass

In [34]:
feature_matrix_tst = ft.calculate_feature_matrix(features=features_enc, entityset=es_tst)

In [35]:
feature_matrix_tst

Unnamed: 0_level_0,Age,Fare,family_count,Sex = 1,Sex = 0,Pclass = 3,Pclass = 1,Pclass = 2,Embarked = 2,Embarked = 0,...,Pclass.NUM_UNIQUE(Passengers.Sex),Pclass.SKEW(Passengers.Age),Pclass.SKEW(Passengers.Fare),Pclass.SKEW(Passengers.family_count),Pclass.STD(Passengers.Age),Pclass.STD(Passengers.Fare),Pclass.STD(Passengers.family_count),Pclass.SUM(Passengers.Age),Pclass.SUM(Passengers.Fare),Pclass.SUM(Passengers.family_count)
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,28.0,15.2458,2,True,False,True,False,False,False,True,...,2,0.690926,2.147037,1.852647,11.122151,8.792546,1.659171,3062.00,1478.4915,109
1,31.0,10.5000,0,True,False,False,False,True,True,False,...,2,0.315440,2.289693,0.864271,12.275540,14.513892,0.887342,1280.83,895.1208,30
2,20.0,7.9250,0,True,False,True,False,False,True,False,...,2,0.690926,2.147037,1.852647,11.122151,8.792546,1.659171,3062.00,1478.4915,109
3,6.0,33.0000,1,False,True,False,False,True,True,False,...,2,0.315440,2.289693,0.864271,12.275540,14.513892,0.887342,1280.83,895.1208,30
4,14.0,11.2417,1,False,True,True,False,False,False,True,...,2,0.690926,2.147037,1.852647,11.122151,8.792546,1.659171,3062.00,1478.4915,109
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
218,25.0,26.0000,1,False,True,False,False,True,True,False,...,2,0.315440,2.289693,0.864271,12.275540,14.513892,0.887342,1280.83,895.1208,30
219,28.0,7.2500,0,True,False,True,False,False,True,False,...,2,0.690926,2.147037,1.852647,11.122151,8.792546,1.659171,3062.00,1478.4915,109
220,28.0,7.8958,0,True,False,True,False,False,True,False,...,2,0.690926,2.147037,1.852647,11.122151,8.792546,1.659171,3062.00,1478.4915,109
221,24.0,13.0000,0,False,True,False,False,True,True,False,...,2,0.315440,2.289693,0.864271,12.275540,14.513892,0.887342,1280.83,895.1208,30


In [36]:
X_test = feature_matrix_tst.copy()
X_test.head()

Unnamed: 0_level_0,Age,Fare,family_count,Sex = 1,Sex = 0,Pclass = 3,Pclass = 1,Pclass = 2,Embarked = 2,Embarked = 0,...,Pclass.NUM_UNIQUE(Passengers.Sex),Pclass.SKEW(Passengers.Age),Pclass.SKEW(Passengers.Fare),Pclass.SKEW(Passengers.family_count),Pclass.STD(Passengers.Age),Pclass.STD(Passengers.Fare),Pclass.STD(Passengers.family_count),Pclass.SUM(Passengers.Age),Pclass.SUM(Passengers.Fare),Pclass.SUM(Passengers.family_count)
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,28.0,15.2458,2,True,False,True,False,False,False,True,...,2,0.690926,2.147037,1.852647,11.122151,8.792546,1.659171,3062.0,1478.4915,109
1,31.0,10.5,0,True,False,False,False,True,True,False,...,2,0.31544,2.289693,0.864271,12.27554,14.513892,0.887342,1280.83,895.1208,30
2,20.0,7.925,0,True,False,True,False,False,True,False,...,2,0.690926,2.147037,1.852647,11.122151,8.792546,1.659171,3062.0,1478.4915,109
3,6.0,33.0,1,False,True,False,False,True,True,False,...,2,0.31544,2.289693,0.864271,12.27554,14.513892,0.887342,1280.83,895.1208,30
4,14.0,11.2417,1,False,True,True,False,False,False,True,...,2,0.690926,2.147037,1.852647,11.122151,8.792546,1.659171,3062.0,1478.4915,109


In [37]:
X_train.shape, X_test.shape

((668, 34), (223, 34))

In [39]:
# double check for nulls
for col in X_train.columns:
    if X_train[col].isnull().sum() >0:
        print(col)
        X_train.drop(col, axis=1, inplace=True)

In [40]:
# Threshold for removing correlated variables
threshold = 0.7

# Absolute value correlation matrix
corr_matrix = X_train.corr().abs()
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))


In [41]:
# Select columns with correlations above threshold
collinear_features = [column for column in upper.columns if any(upper[column] > threshold)]

print('There are %d features to remove.' % (len(collinear_features)))

There are 19 features to remove.


In [42]:
X_train_flt = X_train.drop(columns = collinear_features)
X_test_flt = X_test.drop(columns = collinear_features)
X_train_flt.shape, X_test_flt.shape

((668, 15), (223, 15))

# Modeling

In [43]:
def run_classification(trainx, testx):
    classifier =DecisionTreeClassifier(criterion='gini',max_depth=2, max_leaf_nodes=20)
    classifier.fit(trainx, y_train)
    Y_pred = classifier.predict(testx)
    print(f"Decision Tree Test score {classifier.score(testx, y_test)}")
    print(f"Decision Tree Confusion Matrix: \n {confusion_matrix(y_test, Y_pred)}")
#     feature_importances = classifier.feature_importances_
#     feature_importances     

    
    lr=LogisticRegression(solver='lbfgs',max_iter=500)
    lr = lr.fit(trainx, y_train)
    print(f"LogisticRegression Test socre {lr.score(testx, y_test)}")
    Y_pred2 = lr.predict(testx)
    print(f"Logistic Regression Confusion Matix:\n {confusion_matrix(y_test, Y_pred2)}")
    
    
    lr_weights = pd.DataFrame({
        'features': testx.columns,
        'importance': lr.coef_[0]
    }) 
    lr_weights.sort_values('importance', ascending=False, inplace=True)
    print(f"LogisticRegression Top Feature Weights\n {lr_weights.head()}")

In [44]:
run_classification(X_train_flt, X_test_flt)

Decision Tree Test score 0.7757847533632287
Decision Tree Confusion Matrix: 
 [[127   7]
 [ 43  46]]
LogisticRegression Test socre 0.7982062780269058
Logistic Regression Confusion Matix:
 [[115  19]
 [ 26  63]]
LogisticRegression Top Feature Weights
                                   features  importance
5                               Pclass = 1    0.688766
6                               Pclass = 2    0.281283
1                                     Fare    0.005010
13  Pclass.NUM_UNIQUE(Passengers.Embarked)    0.001167
14       Pclass.NUM_UNIQUE(Passengers.Sex)    0.000778
