# Titanic FeatureTools version

This notebook follows the approach from: https://www.kaggle.com/vbmokin/titanic-featuretools-automatic-fe-fs

## Summary

(1) The first step is to automatically create an entity set from the combined df (traindf + testdf):

```python
es = ft.EntitySet(id = 'titanic_data')

es = es.entity_from_dataframe(entity_id = 'df', dataframe = df.drop(['Survived'], axis=1), 
                              variable_types = 
                              {
                                  'Embarked': ft.variable_types.Categorical,
                                  'Sex': ft.variable_types.Boolean,
                                  'Title': ft.variable_types.Categorical,
                                  'Family_Size': ft.variable_types.Numeric,
                                  'LastName': ft.variable_types.Categorical
                              },
                              index = 'PassengerId')
```

(2) The second step is automatic feature engineering with Deep Feature Synthesis (`ft.dfs`):

```python
features, feature_names = ft.dfs(entityset = es, 
                                 target_entity = 'df', 
                                 max_depth = 2)
```


(3) Feature selection

- correlation threshold
- SelectFromModel with Linear SVC
- SelectFromModel with Lasso
- SelectKBest with Chi-2
- Recursive Feature Elimination (RFE) with Logistic Regression
- RFE with Random Forest

(4) Modeling (random forest)

In [2]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
%matplotlib inline

import featuretools as ft
from featuretools.primitives import *
from featuretools.variable_types import Numeric

from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.linear_model import LinearRegression, LogisticRegression, LassoCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import explained_variance_score
from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectFromModel, SelectKBest, RFE, chi2

# model tuning
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe, space_eval

import warnings
warnings.filterwarnings("ignore")

In [3]:
# Read both competition files with PassengerId as the index, then stack the
# training rows on top of the test rows into one combined frame.
traindf = pd.read_csv('titanic/train.csv', index_col='PassengerId')
testdf = pd.read_csv('titanic/test.csv', index_col='PassengerId')
df = pd.concat([traindf, testdf], sort=False)

In [4]:
# Display the raw training frame (indexed by PassengerId).
traindf

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...
887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


## Preparing for modeling with manual FE

In [10]:
# Manual feature engineering on the combined train + test frame.
# Thanks to:
# https://www.kaggle.com/mauricef/titanic
# https://www.kaggle.com/vbmokin/titanic-top-3-one-line-of-the-prediction-code
#
# Start again from the raw frames so re-running this cell gives the same result.
df = pd.concat([traindf, testdf], axis=0, sort=False)

# Name looks like "Braund, Mr. Owen Harris": the surname is the text before the
# comma, the title is the text between the comma and the next dot.
df['Title'] = df.Name.str.split(',').str[1].str.split('.').str[0].str.strip()
df['IsWomanOrBoy'] = ((df.Title == 'Master') | (df.Sex == 'female'))
df['LastName'] = df.Name.str.split(',').str[0]

# Per-family (same LastName) statistics restricted to women and boys.
# fillna(0) turns missing Survived values into 0, so those rows are still
# counted but contribute nothing to the survivor sum.
# NOTE(review): both statistics include the passenger's own Survived value;
# confirm that this is intended before using them as model inputs.
family = df.groupby(df.LastName).Survived
df['WomanOrBoyCount'] = family.transform(lambda s: s[df.IsWomanOrBoy].fillna(0).count())
df['FamilySurvivedCount'] = family.transform(lambda s: s[df.IsWomanOrBoy].fillna(0).sum())
# Share of surviving women/boys per family; a count of 0 becomes NaN first so
# the division yields NaN instead of dividing by zero.
df['WomanOrBoySurvived'] = df.FamilySurvivedCount / df.WomanOrBoyCount.replace(0, np.nan)
df['WomanOrBoyCount'] = df.WomanOrBoyCount.replace(np.nan, 0)
df['Alone'] = (df.WomanOrBoyCount == 0)

# Thanks to https://www.kaggle.com/kpacocha/top-6-titanic-machine-learning-from-disaster
# "Title" improvement: fold the rarer spellings into Miss / Mrs.
df['Title'] = df['Title'].replace({'Ms': 'Miss', 'Mlle': 'Miss', 'Mme': 'Mrs'})
# Embarked: missing values are filled with 'S'.
df['Embarked'] = df['Embarked'].fillna('S')
# Cabin, Deck: first character of the cabin string, 'M' when Cabin is missing.
df['Deck'] = df['Cabin'].str[0].fillna('M')
df.loc[(df['Deck'] == 'T'), 'Deck'] = 'A'

# Thanks to https://www.kaggle.com/erinsweet/simpledetect
# Fare: fill missing values with the median fare of the
# (Pclass=3, Parch=0, SibSp=0) group.
med_fare = df.groupby(['Pclass', 'Parch', 'SibSp']).Fare.median().loc[(3, 0, 0)]
df['Fare'] = df['Fare'].fillna(med_fare)
# Age: fill missing values with the median of the same (Sex, Pclass, Title) group.
# transform() always returns a result aligned with df's index, whereas
# groupby(...).apply(...) prepends the group keys to the index on pandas >= 2.0
# and can then no longer be assigned back to the column.
df['Age'] = df.groupby(['Sex', 'Pclass', 'Title'])['Age'].transform(lambda x: x.fillna(x.median()))
# Family_Size: siblings/spouses + parents/children + the passenger.
df['Family_Size'] = df['SibSp'] + df['Parch'] + 1

# Thanks to https://www.kaggle.com/vbmokin/titanic-top-3-cluster-analysis
cols_to_drop = ['Name', 'Ticket', 'Cabin']
df = df.drop(cols_to_drop, axis=1)

# Families without any woman/boy produced NaN in the ratio above; use 0 instead.
df['WomanOrBoySurvived'] = df.WomanOrBoySurvived.fillna(0)
df['FamilySurvivedCount'] = df.FamilySurvivedCount.fillna(0)
# Boolean flag -> 0/1 integers.
df['Alone'] = df.Alone * 1

In [11]:
# Display the combined frame after manual feature engineering.
df

Unnamed: 0_level_0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title,IsWomanOrBoy,LastName,WomanOrBoyCount,FamilySurvivedCount,WomanOrBoySurvived,Alone,Deck,Family_Size
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1,0.0,3,male,22.0,1,0,7.2500,S,Mr,False,Braund,0,0.0,0.000000,1,M,2
2,1.0,1,female,38.0,1,0,71.2833,C,Mrs,True,Cumings,1,1.0,1.000000,0,C,2
3,1.0,3,female,26.0,0,0,7.9250,S,Miss,True,Heikkinen,1,1.0,1.000000,0,M,1
4,1.0,1,female,35.0,1,0,53.1000,S,Mrs,True,Futrelle,1,1.0,1.000000,0,C,2
5,0.0,3,male,35.0,0,0,8.0500,S,Mr,False,Allen,1,1.0,1.000000,0,M,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1305,,3,male,26.0,0,0,8.0500,S,Mr,False,Spector,0,0.0,0.000000,1,M,1
1306,,1,female,39.0,0,0,108.9000,C,Dona,True,Oliva y Ocana,1,0.0,0.000000,0,C,1
1307,,3,male,38.5,0,0,7.2500,S,Mr,False,Saether,0,0.0,0.000000,1,M,1
1308,,3,male,26.0,0,0,8.0500,S,Mr,False,Ware,1,0.0,0.000000,0,M,1


In [12]:
# Three-column frame: WomanOrBoySurvived (NaN -> 0), Alone, and Sex encoded as
# male -> 0 / female -> 1.
df_optimum = df[['WomanOrBoySurvived', 'Alone', 'Sex']].assign(
    WomanOrBoySurvived=lambda frame: frame['WomanOrBoySurvived'].fillna(0),
    Sex=lambda frame: frame['Sex'].replace({'male': 0, 'female': 1}),
)

In [13]:
# Training labels: the Survived values of the rows that came from traindf.
target = df.Survived.loc[traindf.index]
# Remove the raw family counters and the manual helper features before the
# automatic feature engineering step.
# errors='ignore' keeps this cell re-runnable: on a second run the columns are
# already gone and a plain drop would raise KeyError.
df = df.drop(
    ['SibSp', 'Parch', 'IsWomanOrBoy', 'WomanOrBoyCount',
     'FamilySurvivedCount', 'WomanOrBoySurvived', 'Alone'],
    axis=1,
    errors='ignore',
)
# Expose the index as a regular column as well.
df['PassengerId'] = df.index
df.head()

Unnamed: 0_level_0,Survived,Pclass,Sex,Age,Fare,Embarked,Title,LastName,Deck,Family_Size,PassengerId
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0.0,3,male,22.0,7.25,S,Mr,Braund,M,2,1
2,1.0,1,female,38.0,71.2833,C,Mrs,Cumings,C,2,2
3,1.0,3,female,26.0,7.925,S,Miss,Heikkinen,M,1,3
4,1.0,1,female,35.0,53.1,S,Mrs,Futrelle,C,2,4
5,0.0,3,male,35.0,8.05,S,Mr,Allen,M,1,5


# Automatic FE with Featuretools

In [14]:
# Create the entity set and register the combined frame (without the Survived
# label) as its 'df' entity, using PassengerId as the entity index.
# NOTE(review): Sex holds the strings 'male' / 'female' but is declared as
# Boolean here; Categorical may be the more accurate type. Confirm.
es = ft.EntitySet(id='titanic_data').entity_from_dataframe(
    entity_id='df',
    dataframe=df.drop(columns=['Survived']),
    index='PassengerId',
    variable_types={
        'Embarked': ft.variable_types.Categorical,
        'Sex': ft.variable_types.Boolean,
        'Title': ft.variable_types.Categorical,
        'Family_Size': ft.variable_types.Numeric,
        'LastName': ft.variable_types.Categorical,
    },
)

In [18]:
# Inspect the 'df' entity registered in the entity set.
es['df']

Entity: df
  Variables:
    PassengerId (dtype: index)
    Pclass (dtype: numeric)
    Age (dtype: numeric)
    Fare (dtype: numeric)
    Deck (dtype: categorical)
    Embarked (dtype: categorical)
    Sex (dtype: boolean)
    Title (dtype: categorical)
    Family_Size (dtype: numeric)
    LastName (dtype: categorical)
  Shape:
    (Rows: 1309, Columns: 10)

In [19]:
# Inspect the dataframe stored for the 'df' entity.
es['df'].df

Unnamed: 0,PassengerId,Pclass,Age,Fare,Deck,Embarked,Sex,Title,Family_Size,LastName
1,1,3,22.0,7.2500,M,S,male,Mr,2,Braund
2,2,1,38.0,71.2833,C,C,female,Mrs,2,Cumings
3,3,3,26.0,7.9250,M,S,female,Miss,1,Heikkinen
4,4,1,35.0,53.1000,C,S,female,Mrs,2,Futrelle
5,5,3,35.0,8.0500,M,S,male,Mr,1,Allen
...,...,...,...,...,...,...,...,...,...,...
1305,1305,3,26.0,8.0500,M,S,male,Mr,1,Spector
1306,1306,1,39.0,108.9000,C,C,female,Dona,1,Oliva y Ocana
1307,1307,3,38.5,7.2500,M,S,male,Mr,1,Saether
1308,1308,3,26.0,8.0500,M,S,male,Mr,1,Ware


In [20]:
# (new_entity_id, index column) pairs, normalized out of 'df' in this order.
# NOTE(review): 'Title_Sex' is keyed on 'Sex', and 'Sex_LastName' /
# 'Title_LastName' are keyed on 'LastName', i.e. on the same columns as the
# 'Sex' and 'LastName' entities. Confirm whether combined keys were intended,
# since same-key entities presumably yield duplicate features.
normalized_entities = [
    ('Pclass', 'Pclass'),
    ('Sex', 'Sex'),
    ('Age', 'Age'),
    ('Fare', 'Fare'),
    ('Embarked', 'Embarked'),
    ('Title', 'Title'),
    ('LastName', 'LastName'),
    ('Deck', 'Deck'),
    ('Family_Size', 'Family_Size'),
    ('Title_Sex', 'Sex'),
    ('Sex_LastName', 'LastName'),
    ('Title_LastName', 'LastName'),
]
for new_entity_id, index_column in normalized_entities:
    es = es.normalize_entity(
        base_entity_id='df',
        new_entity_id=new_entity_id,
        index=index_column,
    )
es

Entityset: titanic_data
  Entities:
    df [Rows: 1309, Columns: 10]
    Pclass [Rows: 3, Columns: 1]
    Sex [Rows: 2, Columns: 1]
    Age [Rows: 99, Columns: 1]
    Fare [Rows: 281, Columns: 1]
    Embarked [Rows: 3, Columns: 1]
    Title [Rows: 15, Columns: 1]
    LastName [Rows: 875, Columns: 1]
    Deck [Rows: 8, Columns: 1]
    Family_Size [Rows: 9, Columns: 1]
    Title_Sex [Rows: 2, Columns: 1]
    Sex_LastName [Rows: 875, Columns: 1]
    Title_LastName [Rows: 875, Columns: 1]
  Relationships:
    df.Pclass -> Pclass.Pclass
    df.Sex -> Sex.Sex
    df.Age -> Age.Age
    df.Fare -> Fare.Fare
    df.Embarked -> Embarked.Embarked
    df.Title -> Title.Title
    df.LastName -> LastName.LastName
    df.Deck -> Deck.Deck
    df.Family_Size -> Family_Size.Family_Size
    df.Sex -> Title_Sex.Sex
    df.LastName -> Sex_LastName.LastName
    df.LastName -> Title_LastName.LastName

In [23]:
# Inspect the dataframe of the normalized 'Pclass' entity.
es['Pclass'].df

Unnamed: 0,Pclass
3,3
1,1
2,2


In [24]:
# Inspect the dataframe of the normalized 'Age' entity.
es['Age'].df

Unnamed: 0,Age
22.00,22.00
38.00,38.00
26.00,26.00
35.00,35.00
54.00,54.00
...,...
60.50,60.50
11.50,11.50
0.33,0.33
0.17,0.17


In [25]:
# Inspect the dataframe of the normalized 'Sex' entity.
es['Sex'].df

Unnamed: 0,Sex
male,male
female,female


In [26]:
# List every aggregation primitive featuretools offers; widen the column
# display so the descriptions are not truncated.
primitives = ft.list_primitives()
pd.set_option('display.max_colwidth', 500)
primitives.loc[primitives['type'].eq('aggregation')]

Unnamed: 0,name,type,dask_compatible,koalas_compatible,description,valid_inputs,return_type
0,first,aggregation,False,False,Determines the first value in a list.,Variable,
1,skew,aggregation,False,False,Computes the extent to which a distribution differs from a normal distribution.,Numeric,Numeric
2,n_most_common,aggregation,False,False,Determines the `n` most common elements.,Discrete,Discrete
3,num_unique,aggregation,True,True,"Determines the number of distinct values, ignoring `NaN` values.",Discrete,Numeric
4,time_since_first,aggregation,False,False,Calculates the time elapsed since the first datetime (in seconds).,DatetimeTimeIndex,Numeric
5,all,aggregation,True,False,Calculates if all values are 'True' in a list.,Boolean,Boolean
6,min,aggregation,True,True,"Calculates the smallest value, ignoring `NaN` values.",Numeric,Numeric
7,last,aggregation,False,False,Determines the last value in a list.,Variable,
8,mean,aggregation,True,True,Computes the average for a list of values.,Numeric,Numeric
9,percent_true,aggregation,True,False,Determines the percent of `True` values.,Boolean,Numeric


In [27]:
# Run Deep Feature Synthesis against the 'df' entity (max_depth=2) and report
# how many feature definitions were produced.
features, feature_names = ft.dfs(
    target_entity='df',
    entityset=es,
    max_depth=2,
)
len(feature_names)

213

In [28]:
# List the feature definitions returned by ft.dfs.
feature_names

[<Feature: Pclass>,
 <Feature: Age>,
 <Feature: Fare>,
 <Feature: Deck>,
 <Feature: Embarked>,
 <Feature: Sex>,
 <Feature: Title>,
 <Feature: Family_Size>,
 <Feature: LastName>,
 <Feature: Pclass.COUNT(df)>,
 <Feature: Pclass.MODE(df.Age)>,
 <Feature: Pclass.MODE(df.Deck)>,
 <Feature: Pclass.MODE(df.Embarked)>,
 <Feature: Pclass.MODE(df.Family_Size)>,
 <Feature: Pclass.MODE(df.Fare)>,
 <Feature: Pclass.MODE(df.LastName)>,
 <Feature: Pclass.MODE(df.Sex)>,
 <Feature: Pclass.MODE(df.Title)>,
 <Feature: Pclass.NUM_UNIQUE(df.Age)>,
 <Feature: Pclass.NUM_UNIQUE(df.Deck)>,
 <Feature: Pclass.NUM_UNIQUE(df.Embarked)>,
 <Feature: Pclass.NUM_UNIQUE(df.Family_Size)>,
 <Feature: Pclass.NUM_UNIQUE(df.Fare)>,
 <Feature: Pclass.NUM_UNIQUE(df.LastName)>,
 <Feature: Pclass.NUM_UNIQUE(df.Sex)>,
 <Feature: Pclass.NUM_UNIQUE(df.Title)>,
 <Feature: Sex.COUNT(df)>,
 <Feature: Sex.MODE(df.Age)>,
 <Feature: Sex.MODE(df.Deck)>,
 <Feature: Sex.MODE(df.Embarked)>,
 <Feature: Sex.MODE(df.Family_Size)>,
 <Feature: 

In [29]:
# Display the feature matrix returned by ft.dfs.
features

Unnamed: 0_level_0,Pclass,Age,Fare,Deck,Embarked,Sex,Title,Family_Size,LastName,Pclass.COUNT(df),...,Title_LastName.MODE(df.Sex),Title_LastName.MODE(df.Title),Title_LastName.NUM_UNIQUE(df.Age),Title_LastName.NUM_UNIQUE(df.Deck),Title_LastName.NUM_UNIQUE(df.Embarked),Title_LastName.NUM_UNIQUE(df.Family_Size),Title_LastName.NUM_UNIQUE(df.Fare),Title_LastName.NUM_UNIQUE(df.Pclass),Title_LastName.NUM_UNIQUE(df.Sex),Title_LastName.NUM_UNIQUE(df.Title)
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,3,22.0,7.2500,M,S,male,Mr,2,Braund,709,...,male,Mr,2,1,1,1,2,1,1,1
2,1,38.0,71.2833,C,C,female,Mrs,2,Cumings,323,...,female,Mr,2,1,1,1,1,1,2,2
3,3,26.0,7.9250,M,S,female,Miss,1,Heikkinen,709,...,female,Miss,1,1,1,1,1,1,1,1
4,1,35.0,53.1000,C,S,female,Mrs,2,Futrelle,323,...,female,Mr,2,1,1,1,1,1,2,2
5,3,35.0,8.0500,M,S,male,Mr,1,Allen,709,...,female,Miss,2,2,1,1,2,2,2,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1305,3,26.0,8.0500,M,S,male,Mr,1,Spector,709,...,male,Mr,1,1,1,1,1,1,1,1
1306,1,39.0,108.9000,C,C,female,Dona,1,Oliva y Ocana,323,...,female,Dona,1,1,1,1,1,1,1,1
1307,3,38.5,7.2500,M,S,male,Mr,1,Saether,709,...,male,Mr,1,1,1,1,1,1,1,1
1308,3,26.0,8.0500,M,S,male,Mr,1,Ware,709,...,male,Mr,4,1,1,2,3,2,2,2


## Encoding categorical features

In [30]:
# Determine the categorical features: every column whose dtype is not one of
# the numeric dtypes listed below.
numerics = ['int8', 'int16', 'int32', 'int64', 'float16', 'float32', 'float64']
cols = features.columns.values.tolist()
categorical_columns = [col for col in cols if features[col].dtype not in numerics]
categorical_columns

['Deck',
 'Embarked',
 'Sex',
 'Title',
 'LastName',
 'Pclass.MODE(df.Deck)',
 'Pclass.MODE(df.Embarked)',
 'Pclass.MODE(df.LastName)',
 'Pclass.MODE(df.Sex)',
 'Pclass.MODE(df.Title)',
 'Sex.MODE(df.Deck)',
 'Sex.MODE(df.Embarked)',
 'Sex.MODE(df.LastName)',
 'Sex.MODE(df.Title)',
 'Age.MODE(df.Deck)',
 'Age.MODE(df.Embarked)',
 'Age.MODE(df.LastName)',
 'Age.MODE(df.Sex)',
 'Age.MODE(df.Title)',
 'Fare.MODE(df.Deck)',
 'Fare.MODE(df.Embarked)',
 'Fare.MODE(df.LastName)',
 'Fare.MODE(df.Sex)',
 'Fare.MODE(df.Title)',
 'Embarked.MODE(df.Deck)',
 'Embarked.MODE(df.LastName)',
 'Embarked.MODE(df.Sex)',
 'Embarked.MODE(df.Title)',
 'Title.MODE(df.Deck)',
 'Title.MODE(df.Embarked)',
 'Title.MODE(df.LastName)',
 'Title.MODE(df.Sex)',
 'LastName.MODE(df.Deck)',
 'LastName.MODE(df.Embarked)',
 'LastName.MODE(df.Sex)',
 'LastName.MODE(df.Title)',
 'Deck.MODE(df.Embarked)',
 'Deck.MODE(df.LastName)',
 'Deck.MODE(df.Sex)',
 'Deck.MODE(df.Title)',
 'Family_Size.MODE(df.Deck)',
 'Family_Size.MODE(

In [31]:
# Encoding categorical features: each column is cast to str and replaced, in
# place, by the integer codes of a LabelEncoder fitted on that column.
for col in categorical_columns:
    if col not in features.columns:
        continue
    le = LabelEncoder()
    features[col] = le.fit_transform(list(features[col].astype(str).values))

In [32]:
# Preview the feature matrix after label encoding.
features.head(3)

Unnamed: 0_level_0,Pclass,Age,Fare,Deck,Embarked,Sex,Title,Family_Size,LastName,Pclass.COUNT(df),...,Title_LastName.MODE(df.Sex),Title_LastName.MODE(df.Title),Title_LastName.NUM_UNIQUE(df.Age),Title_LastName.NUM_UNIQUE(df.Deck),Title_LastName.NUM_UNIQUE(df.Embarked),Title_LastName.NUM_UNIQUE(df.Family_Size),Title_LastName.NUM_UNIQUE(df.Fare),Title_LastName.NUM_UNIQUE(df.Pclass),Title_LastName.NUM_UNIQUE(df.Sex),Title_LastName.NUM_UNIQUE(df.Title)
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,3,22.0,7.25,7,2,1,10,2,100,709,...,1,10,2,1,1,1,2,1,1,1
2,1,38.0,71.2833,2,0,0,11,2,182,323,...,0,10,2,1,1,1,1,1,2,2
3,3,26.0,7.925,7,2,0,9,1,329,709,...,0,9,1,1,1,1,1,1,1,1


## Automatic feature selection (FS)

In [33]:
# Split the feature matrix back into the original train / test rows, then
# min-max scale the training part.
train = features.loc[traindf.index]
test = features.loc[testdf.index]
X_norm = MinMaxScaler().fit(train).transform(train)

In [36]:
# Display the test-set rows of the feature matrix.
test

Unnamed: 0_level_0,Pclass,Age,Fare,Deck,Embarked,Sex,Title,Family_Size,LastName,Pclass.COUNT(df),...,Title_LastName.MODE(df.Sex),Title_LastName.MODE(df.Title),Title_LastName.NUM_UNIQUE(df.Age),Title_LastName.NUM_UNIQUE(df.Deck),Title_LastName.NUM_UNIQUE(df.Embarked),Title_LastName.NUM_UNIQUE(df.Family_Size),Title_LastName.NUM_UNIQUE(df.Fare),Title_LastName.NUM_UNIQUE(df.Pclass),Title_LastName.NUM_UNIQUE(df.Sex),Title_LastName.NUM_UNIQUE(df.Title)
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
892,3,34.5,7.8292,7,1,1,10,1,401,709,...,0,9,4,1,2,1,4,2,2,3
893,3,47.0,7.0000,7,2,0,11,2,846,709,...,0,11,1,1,1,1,1,1,1,1
894,2,62.0,9.6875,7,1,1,10,1,552,277,...,1,10,1,1,1,1,1,1,1,1
895,3,27.0,8.6625,7,2,1,10,1,854,709,...,1,10,1,1,1,1,1,1,1,1
896,3,22.0,12.2875,7,2,0,11,3,342,709,...,0,9,2,1,1,2,1,1,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1305,3,26.0,8.0500,7,2,1,10,1,753,709,...,1,10,1,1,1,1,1,1,1,1
1306,1,39.0,108.9000,2,0,0,3,1,593,323,...,0,3,1,1,1,1,1,1,1,1
1307,3,38.5,7.2500,7,2,1,10,1,699,709,...,1,10,1,1,1,1,1,1,1,1
1308,3,26.0,8.0500,7,2,1,10,1,827,709,...,1,10,4,1,1,2,3,2,2,2


In [37]:
# Threshold for removing correlated variables
threshold = 0.9


def highlight(value):
    """Return the CSS background style for a single correlation value.

    Values strictly greater than the module-level ``threshold`` are shown in
    pink; everything else (including NaN, which never compares greater) is
    shown in pale green.
    """
    return 'background-color: pink' if value > threshold else 'background-color: palegreen'

# Absolute value correlation matrix
corr_matrix = features.corr().abs()
# Keep only the strict upper triangle (k=1 skips the diagonal) so every pair of
# features appears once; all other cells become NaN.
# The mask is built with the builtin bool: the np.bool alias was deprecated in
# NumPy 1.20 and removed in 1.24, where it raises AttributeError.
upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
upper = corr_matrix.where(upper_mask)
upper.style.applymap(highlight)

Unnamed: 0,Pclass,Age,Fare,Deck,Embarked,Sex,Title,Family_Size,LastName,Pclass.COUNT(df),Pclass.MODE(df.Age),Pclass.MODE(df.Deck),Pclass.MODE(df.Embarked),Pclass.MODE(df.Family_Size),Pclass.MODE(df.Fare),Pclass.MODE(df.LastName),Pclass.MODE(df.Sex),Pclass.MODE(df.Title),Pclass.NUM_UNIQUE(df.Age),Pclass.NUM_UNIQUE(df.Deck),Pclass.NUM_UNIQUE(df.Embarked),Pclass.NUM_UNIQUE(df.Family_Size),Pclass.NUM_UNIQUE(df.Fare),Pclass.NUM_UNIQUE(df.LastName),Pclass.NUM_UNIQUE(df.Sex),Pclass.NUM_UNIQUE(df.Title),Sex.COUNT(df),Sex.MODE(df.Age),Sex.MODE(df.Deck),Sex.MODE(df.Embarked),Sex.MODE(df.Family_Size),Sex.MODE(df.Fare),Sex.MODE(df.LastName),Sex.MODE(df.Pclass),Sex.MODE(df.Title),Sex.NUM_UNIQUE(df.Age),Sex.NUM_UNIQUE(df.Deck),Sex.NUM_UNIQUE(df.Embarked),Sex.NUM_UNIQUE(df.Family_Size),Sex.NUM_UNIQUE(df.Fare),Sex.NUM_UNIQUE(df.LastName),Sex.NUM_UNIQUE(df.Pclass),Sex.NUM_UNIQUE(df.Title),Age.COUNT(df),Age.MODE(df.Deck),Age.MODE(df.Embarked),Age.MODE(df.Family_Size),Age.MODE(df.Fare),Age.MODE(df.LastName),Age.MODE(df.Pclass),Age.MODE(df.Sex),Age.MODE(df.Title),Age.NUM_UNIQUE(df.Deck),Age.NUM_UNIQUE(df.Embarked),Age.NUM_UNIQUE(df.Family_Size),Age.NUM_UNIQUE(df.Fare),Age.NUM_UNIQUE(df.LastName),Age.NUM_UNIQUE(df.Pclass),Age.NUM_UNIQUE(df.Sex),Age.NUM_UNIQUE(df.Title),Fare.COUNT(df),Fare.MODE(df.Age),Fare.MODE(df.Deck),Fare.MODE(df.Embarked),Fare.MODE(df.Family_Size),Fare.MODE(df.LastName),Fare.MODE(df.Pclass),Fare.MODE(df.Sex),Fare.MODE(df.Title),Fare.NUM_UNIQUE(df.Age),Fare.NUM_UNIQUE(df.Deck),Fare.NUM_UNIQUE(df.Embarked),Fare.NUM_UNIQUE(df.Family_Size),Fare.NUM_UNIQUE(df.LastName),Fare.NUM_UNIQUE(df.Pclass),Fare.NUM_UNIQUE(df.Sex),Fare.NUM_UNIQUE(df.Title),Embarked.COUNT(df),Embarked.MODE(df.Age),Embarked.MODE(df.Deck),Embarked.MODE(df.Family_Size),Embarked.MODE(df.Fare),Embarked.MODE(df.LastName),Embarked.MODE(df.Pclass),Embarked.MODE(df.Sex),Embarked.MODE(df.Title),Embarked.NUM_UNIQUE(df.Age),Embarked.NUM_UNIQUE(df.Deck),Embarked.NUM_UNIQUE(df.Family_Size),Embarked.NUM_UNIQUE(df.F
are),Embarked.NUM_UNIQUE(df.LastName),Embarked.NUM_UNIQUE(df.Pclass),Embarked.NUM_UNIQUE(df.Sex),Embarked.NUM_UNIQUE(df.Title),Title.COUNT(df),Title.MODE(df.Age),Title.MODE(df.Deck),Title.MODE(df.Embarked),Title.MODE(df.Family_Size),Title.MODE(df.Fare),Title.MODE(df.LastName),Title.MODE(df.Pclass),Title.MODE(df.Sex),Title.NUM_UNIQUE(df.Age),Title.NUM_UNIQUE(df.Deck),Title.NUM_UNIQUE(df.Embarked),Title.NUM_UNIQUE(df.Family_Size),Title.NUM_UNIQUE(df.Fare),Title.NUM_UNIQUE(df.LastName),Title.NUM_UNIQUE(df.Pclass),Title.NUM_UNIQUE(df.Sex),LastName.COUNT(df),LastName.MODE(df.Age),LastName.MODE(df.Deck),LastName.MODE(df.Embarked),LastName.MODE(df.Family_Size),LastName.MODE(df.Fare),LastName.MODE(df.Pclass),LastName.MODE(df.Sex),LastName.MODE(df.Title),LastName.NUM_UNIQUE(df.Age),LastName.NUM_UNIQUE(df.Deck),LastName.NUM_UNIQUE(df.Embarked),LastName.NUM_UNIQUE(df.Family_Size),LastName.NUM_UNIQUE(df.Fare),LastName.NUM_UNIQUE(df.Pclass),LastName.NUM_UNIQUE(df.Sex),LastName.NUM_UNIQUE(df.Title),Deck.COUNT(df),Deck.MODE(df.Age),Deck.MODE(df.Embarked),Deck.MODE(df.Family_Size),Deck.MODE(df.Fare),Deck.MODE(df.LastName),Deck.MODE(df.Pclass),Deck.MODE(df.Sex),Deck.MODE(df.Title),Deck.NUM_UNIQUE(df.Age),Deck.NUM_UNIQUE(df.Embarked),Deck.NUM_UNIQUE(df.Family_Size),Deck.NUM_UNIQUE(df.Fare),Deck.NUM_UNIQUE(df.LastName),Deck.NUM_UNIQUE(df.Pclass),Deck.NUM_UNIQUE(df.Sex),Deck.NUM_UNIQUE(df.Title),Family_Size.COUNT(df),Family_Size.MODE(df.Age),Family_Size.MODE(df.Deck),Family_Size.MODE(df.Embarked),Family_Size.MODE(df.Fare),Family_Size.MODE(df.LastName),Family_Size.MODE(df.Pclass),Family_Size.MODE(df.Sex),Family_Size.MODE(df.Title),Family_Size.NUM_UNIQUE(df.Age),Family_Size.NUM_UNIQUE(df.Deck),Family_Size.NUM_UNIQUE(df.Embarked),Family_Size.NUM_UNIQUE(df.Fare),Family_Size.NUM_UNIQUE(df.LastName),Family_Size.NUM_UNIQUE(df.Pclass),Family_Size.NUM_UNIQUE(df.Sex),Family_Size.NUM_UNIQUE(df.Title),Title_Sex.COUNT(df),Title_Sex.MODE(df.Age),Title_Sex.MODE(df.Deck),Title_Sex.MODE(df.Embarked),Ti
tle_Sex.MODE(df.Family_Size),Title_Sex.MODE(df.Fare),Title_Sex.MODE(df.LastName),Title_Sex.MODE(df.Pclass),Title_Sex.MODE(df.Title),Title_Sex.NUM_UNIQUE(df.Age),Title_Sex.NUM_UNIQUE(df.Deck),Title_Sex.NUM_UNIQUE(df.Embarked),Title_Sex.NUM_UNIQUE(df.Family_Size),Title_Sex.NUM_UNIQUE(df.Fare),Title_Sex.NUM_UNIQUE(df.LastName),Title_Sex.NUM_UNIQUE(df.Pclass),Title_Sex.NUM_UNIQUE(df.Title),Sex_LastName.COUNT(df),Sex_LastName.MODE(df.Age),Sex_LastName.MODE(df.Deck),Sex_LastName.MODE(df.Embarked),Sex_LastName.MODE(df.Family_Size),Sex_LastName.MODE(df.Fare),Sex_LastName.MODE(df.Pclass),Sex_LastName.MODE(df.Sex),Sex_LastName.MODE(df.Title),Sex_LastName.NUM_UNIQUE(df.Age),Sex_LastName.NUM_UNIQUE(df.Deck),Sex_LastName.NUM_UNIQUE(df.Embarked),Sex_LastName.NUM_UNIQUE(df.Family_Size),Sex_LastName.NUM_UNIQUE(df.Fare),Sex_LastName.NUM_UNIQUE(df.Pclass),Sex_LastName.NUM_UNIQUE(df.Sex),Sex_LastName.NUM_UNIQUE(df.Title),Title_LastName.COUNT(df),Title_LastName.MODE(df.Age),Title_LastName.MODE(df.Deck),Title_LastName.MODE(df.Embarked),Title_LastName.MODE(df.Family_Size),Title_LastName.MODE(df.Fare),Title_LastName.MODE(df.Pclass),Title_LastName.MODE(df.Sex),Title_LastName.MODE(df.Title),Title_LastName.NUM_UNIQUE(df.Age),Title_LastName.NUM_UNIQUE(df.Deck),Title_LastName.NUM_UNIQUE(df.Embarked),Title_LastName.NUM_UNIQUE(df.Family_Size),Title_LastName.NUM_UNIQUE(df.Fare),Title_LastName.NUM_UNIQUE(df.Pclass),Title_LastName.NUM_UNIQUE(df.Sex),Title_LastName.NUM_UNIQUE(df.Title)
Pclass,,0.453936,0.558742,0.733576,0.185479,0.124617,0.006818,0.050027,0.047069,0.881788,0.971843,0.884911,,,0.974015,1.0,,,0.915201,0.884911,,0.915201,0.489974,0.881688,,0.956785,0.124617,0.124617,,,,0.124617,0.124617,,0.124617,0.124617,0.124617,,,0.124617,0.124617,,0.124617,0.338058,0.23934,0.094161,0.078451,0.253074,0.267332,0.513723,0.032889,0.202727,0.189637,0.267021,0.316218,0.309427,0.335621,0.169033,0.162008,0.063779,0.243477,0.341949,0.727084,0.195392,0.026092,0.048291,0.979956,0.262791,0.008093,0.166846,0.347534,0.037259,0.036343,0.236393,0.152096,0.080238,0.173154,0.05371,,,,0.210715,0.185479,0.269658,,0.230491,0.086073,0.136571,0.05026,0.003645,0.053378,,,0.156814,0.130858,0.178472,0.130476,0.121239,0.055997,0.152144,0.084479,0.238482,0.122209,0.128818,0.035635,0.147998,0.17703,0.104719,0.123989,0.162527,0.097832,0.113054,0.368873,0.703976,0.166952,0.059378,0.539622,0.954679,0.174806,0.083874,0.075786,0.17377,0.141028,0.013431,0.053301,0.097167,0.101218,0.022556,0.708953,0.349957,,0.464585,0.661131,0.691098,0.746672,0.430755,0.052133,0.665622,0.450529,0.665454,0.700999,0.709091,0.6597,0.052133,0.436071,0.113383,0.087918,,,0.072337,0.043927,0.231638,0.1837,0.083935,0.000491,0.112288,0.11561,0.072578,0.126642,0.139546,,0.073354,0.124617,0.124617,,,,0.124617,0.124617,,0.124617,0.124617,0.124617,,,0.124617,0.124617,,0.124617,0.113054,0.368873,0.703976,0.166952,0.059378,0.539622,0.954679,0.174806,0.083874,0.075786,0.17377,0.141028,0.013431,0.053301,0.097167,0.101218,0.022556,0.113054,0.368873,0.703976,0.166952,0.059378,0.539622,0.954679,0.174806,0.083874,0.075786,0.17377,0.141028,0.013431,0.053301,0.097167,0.101218,0.022556
Age,,,0.198427,0.337997,0.050197,0.070794,0.227646,0.21747,0.00218,0.364253,0.459151,0.437266,,,0.459438,0.453936,,,0.384663,0.437266,,0.384663,0.155837,0.364193,,0.456529,0.070794,0.070794,,,,0.070794,0.070794,,0.070794,0.070794,0.070794,,,0.070794,0.070794,,0.070794,0.208856,0.408593,0.131926,0.378266,0.244406,0.119011,0.710172,0.171935,0.569608,0.177267,0.323587,0.446981,0.264518,0.209577,0.165927,0.245304,0.184459,0.008723,0.507736,0.318784,0.055477,0.217998,0.016627,0.449115,0.035384,0.193067,0.01966,0.221112,0.06427,0.038672,0.008001,0.087671,0.069684,0.021985,0.003775,,,,0.061198,0.050197,0.088353,,0.105021,0.056334,0.074155,0.005125,0.023114,0.003905,,,0.081092,0.180061,0.550041,0.128881,0.115151,0.137064,0.038692,0.06008,0.290434,0.073172,0.175455,0.077122,0.148318,0.179979,0.23625,0.197716,0.175239,0.085977,0.200143,0.694443,0.334156,0.038455,0.214002,0.193096,0.436978,0.040135,0.081012,0.185275,0.125763,0.105269,0.009862,0.005167,0.080585,0.068404,0.160228,0.309608,0.215848,,0.205774,0.286156,0.299546,0.34551,0.175073,0.079548,0.288394,0.196729,0.293247,0.304349,0.309538,0.283576,0.079548,0.177351,0.150541,0.212887,,,0.093795,0.204121,0.077455,0.002979,0.21418,0.213259,0.187101,0.219299,0.186193,0.137162,0.132059,,0.187508,0.070794,0.070794,,,,0.070794,0.070794,,0.070794,0.070794,0.070794,,,0.070794,0.070794,,0.070794,0.200143,0.694443,0.334156,0.038455,0.214002,0.193096,0.436978,0.040135,0.081012,0.185275,0.125763,0.105269,0.009862,0.005167,0.080585,0.068404,0.160228,0.200143,0.694443,0.334156,0.038455,0.214002,0.193096,0.436978,0.040135,0.081012,0.185275,0.125763,0.105269,0.009862,0.005167,0.080585,0.068404,0.160228
Fare,,,,0.548587,0.238182,0.185746,0.03508,0.226654,0.036241,0.385678,0.596473,0.600116,,,0.595611,0.558742,,,0.419923,0.600116,,0.419923,0.075977,0.38558,,0.600576,0.185746,0.185746,,,,0.185746,0.185746,,0.185746,0.185746,0.185746,,,0.185746,0.185746,,0.185746,0.172161,0.144272,0.067521,0.012898,0.177037,0.112988,0.257946,0.053751,0.04396,0.07121,0.130954,0.097841,0.162862,0.172961,0.063956,0.014127,0.032293,0.292726,0.044964,0.651841,0.25011,0.256332,0.090871,0.549882,0.3964,0.14826,0.276294,0.0456,0.140617,0.038528,0.301858,0.081501,0.070176,0.126018,0.140276,,,,0.254343,0.238182,0.286406,,0.129921,0.018135,0.030532,0.137473,0.098534,0.140007,,,0.050848,0.187926,0.001398,0.044088,0.054013,0.108105,0.143533,0.13356,0.153082,0.186062,0.156455,0.070368,0.028284,0.041054,0.154689,0.185498,0.033303,0.030289,0.141254,0.084697,0.527878,0.227962,0.215972,0.958838,0.536809,0.236023,0.12947,0.146824,0.089313,0.051129,0.002366,0.055063,0.009277,0.246179,0.226831,0.498129,0.229703,,0.345833,0.599086,0.54532,0.532882,0.343228,0.022831,0.434171,0.303978,0.420806,0.488332,0.498889,0.545871,0.022831,0.202234,0.279708,0.167054,,,0.251896,0.169725,0.175952,0.205018,0.187713,0.259033,0.154609,0.14751,0.276149,0.27736,0.039363,,0.277918,0.185746,0.185746,,,,0.185746,0.185746,,0.185746,0.185746,0.185746,,,0.185746,0.185746,,0.185746,0.141254,0.084697,0.527878,0.227962,0.215972,0.958838,0.536809,0.236023,0.12947,0.146824,0.089313,0.051129,0.002366,0.055063,0.009277,0.246179,0.226831,0.141254,0.084697,0.527878,0.227962,0.215972,0.958838,0.536809,0.236023,0.12947,0.146824,0.089313,0.051129,0.002366,0.055063,0.009277,0.246179,0.226831
Deck,,,,,0.231926,0.119538,0.02844,0.011856,0.053863,0.483053,0.794756,0.81091,,,0.793172,0.733576,,,0.531406,0.81091,,0.531406,0.056674,0.482914,,0.802869,0.119538,0.119538,,,,0.119538,0.119538,,0.119538,0.119538,0.119538,,,0.119538,0.119538,,0.119538,0.228242,0.258778,0.122233,0.048738,0.171493,0.137851,0.360376,0.005649,0.131785,0.119364,0.180528,0.184939,0.228318,0.229054,0.128619,0.101142,0.019295,0.303806,0.246748,0.854314,0.236474,0.011364,0.022244,0.718707,0.280414,0.066152,0.292297,0.07784,0.049813,0.122118,0.285397,0.00107,0.113207,0.028699,0.13138,,,,0.24885,0.231926,0.283259,,0.138646,0.008255,0.040316,0.128531,0.089086,0.131107,,,0.060514,0.127339,0.13691,0.193935,0.131173,0.075941,0.170078,0.069434,0.195927,0.117312,0.125945,0.019557,0.135401,0.140381,0.10714,0.122388,0.123529,0.064628,0.038782,0.263658,0.949494,0.211212,0.004874,0.538074,0.695946,0.179754,0.119241,0.011421,0.187241,0.085947,0.0068,0.029946,0.036655,0.137801,0.079607,0.939768,0.467616,,0.567997,0.863526,0.932903,0.965689,0.595913,0.001514,0.895245,0.677031,0.874494,0.931775,0.940006,0.942206,0.001514,0.518422,0.135905,0.025124,,,0.105709,0.057036,0.20281,0.180118,0.016821,0.056775,0.036891,0.046258,0.106493,0.144458,0.084428,,0.107663,0.119538,0.119538,,,,0.119538,0.119538,,0.119538,0.119538,0.119538,,,0.119538,0.119538,,0.119538,0.038782,0.263658,0.949494,0.211212,0.004874,0.538074,0.695946,0.179754,0.119241,0.011421,0.187241,0.085947,0.0068,0.029946,0.036655,0.137801,0.079607,0.038782,0.263658,0.949494,0.211212,0.004874,0.538074,0.695946,0.179754,0.119241,0.011421,0.187241,0.085947,0.0068,0.029946,0.036655,0.137801,0.079607
Embarked,,,,,,0.09796,0.047471,0.067598,0.031779,0.062117,0.230932,0.264302,,,0.229368,0.185479,,,0.083079,0.264302,,0.083079,0.096602,0.062059,,0.240004,0.09796,0.09796,,,,0.09796,0.09796,,0.09796,0.09796,0.09796,,,0.09796,0.09796,,0.09796,0.043062,0.093954,0.19262,0.024795,0.011388,0.081584,0.055465,0.053942,0.017781,0.002002,0.03136,0.020771,0.025769,0.04116,0.059444,0.027538,0.014829,0.213263,0.055107,0.264322,0.946639,0.054757,0.144375,0.180808,0.228209,0.032534,0.255189,0.107915,0.062595,0.137168,0.199514,0.090728,0.027091,0.059179,0.907005,,,,0.995386,1.0,0.93539,,0.195261,0.650057,0.517901,0.902485,0.832326,0.906574,,,0.457938,0.089727,0.020897,0.056625,0.119803,0.031688,0.066893,0.078365,0.076691,0.099001,0.073875,0.030746,0.038533,0.039355,0.073406,0.088743,0.034615,0.011417,0.087685,0.039584,0.216077,0.963635,0.066643,0.227458,0.178227,0.101618,0.066488,0.096207,0.002464,0.055161,0.040969,0.054258,0.019002,0.021554,0.040376,0.20327,0.120075,,0.139311,0.208635,0.218922,0.2224,0.149434,0.038533,0.184519,0.153306,0.181534,0.1996,0.203343,0.217882,0.038533,0.090094,0.035928,0.077064,,,0.012537,0.00494,0.121642,0.083231,0.099691,0.044595,0.110179,0.114793,0.006853,0.046339,0.103142,,0.004661,0.09796,0.09796,,,,0.09796,0.09796,,0.09796,0.09796,0.09796,,,0.09796,0.09796,,0.09796,0.087685,0.039584,0.216077,0.963635,0.066643,0.227458,0.178227,0.101618,0.066488,0.096207,0.002464,0.055161,0.040969,0.054258,0.019002,0.021554,0.040376,0.087685,0.039584,0.216077,0.963635,0.066643,0.227458,0.178227,0.101618,0.066488,0.096207,0.002464,0.055161,0.040969,0.054258,0.019002,0.021554,0.040376
Sex,,,,,,,0.034649,0.188583,0.037205,0.112826,0.119639,0.107371,,,0.119966,0.124617,,,0.116562,0.107371,,0.116562,0.066494,0.112815,,0.117418,1.0,1.0,,,,1.0,1.0,,1.0,1.0,1.0,,,1.0,1.0,,1.0,0.143371,0.02403,0.01923,0.056678,0.045627,0.070839,0.008608,0.358809,0.044409,0.192846,0.017324,0.15433,0.0307,0.144115,0.066043,0.175026,0.095477,0.173152,0.144624,0.155791,0.112159,0.190287,0.052422,0.123205,0.461574,0.208091,0.174995,0.061448,0.08697,0.022491,0.191596,0.043733,0.097398,0.101997,0.118703,,,,0.090707,0.09796,0.066564,,0.088651,0.117546,0.111374,0.11894,0.120827,0.118726,,,0.107878,0.777325,0.08742,0.007087,0.017359,0.251801,0.337659,0.847117,0.523192,0.998335,0.511761,0.588418,0.039275,0.063756,0.550851,0.770035,0.065681,0.037831,0.123005,0.135756,0.110088,0.095413,0.172405,0.167479,0.119409,0.692815,0.167309,0.126995,0.067229,0.034637,0.041713,0.005319,0.002678,0.225166,0.229255,0.136314,0.033414,,0.12366,0.136955,0.15423,0.12807,0.127466,0.083285,0.126374,0.106749,0.126669,0.1358,0.136477,0.125232,0.083285,0.112995,0.282998,0.115534,,,0.215149,0.227549,0.190687,0.233039,0.107876,0.229693,0.088957,0.119537,0.271883,0.284117,0.044904,,0.27296,1.0,1.0,,,,1.0,1.0,,1.0,1.0,1.0,,,1.0,1.0,,1.0,0.123005,0.135756,0.110088,0.095413,0.172405,0.167479,0.119409,0.692815,0.167309,0.126995,0.067229,0.034637,0.041713,0.005319,0.002678,0.225166,0.229255,0.123005,0.135756,0.110088,0.095413,0.172405,0.167479,0.119409,0.692815,0.167309,0.126995,0.067229,0.034637,0.041713,0.005319,0.002678,0.225166,0.229255
Title,,,,,,,,0.10117,0.016156,0.033054,0.026144,0.044613,,,0.025401,0.006818,,,0.02714,0.044613,,0.02714,0.068866,0.03307,,0.03061,0.034649,0.034649,,,,0.034649,0.034649,,0.034649,0.034649,0.034649,,,0.034649,0.034649,,0.034649,0.08856,0.065386,0.080062,0.29426,0.018284,0.006362,0.058048,0.117327,0.394855,0.183544,0.098332,0.033835,0.131551,0.093971,0.153265,0.019131,0.140375,0.064954,0.098641,0.023408,0.025257,0.116034,0.013412,0.003863,0.02286,0.358076,0.078254,0.03731,0.043088,0.038473,0.062508,0.036045,0.005735,0.043855,0.078785,,,,0.039112,0.047471,0.014403,,0.092477,0.095328,0.097162,0.079385,0.086538,0.078843,,,0.097161,0.282884,0.417375,0.265767,0.447729,0.00594,0.02357,0.125755,0.256809,0.043054,0.431518,0.33399,0.290027,0.474136,0.485545,0.309881,0.37854,0.412077,0.08314,0.095121,0.023262,0.048192,0.09745,0.028758,0.008512,0.04003,0.674384,0.07878,0.040124,0.024991,0.02107,0.006093,0.044099,0.007661,0.060642,0.011671,0.05333,,0.012864,0.009683,0.006875,0.010796,0.016158,0.00117,0.014811,0.015412,0.008298,0.01163,0.011696,0.03372,0.00117,0.025749,0.010116,0.144081,,,0.01518,0.07505,0.15482,0.106755,0.116076,0.073726,0.102816,0.117245,0.042136,0.000118,0.091602,,0.039962,0.034649,0.034649,,,,0.034649,0.034649,,0.034649,0.034649,0.034649,,,0.034649,0.034649,,0.034649,0.08314,0.095121,0.023262,0.048192,0.09745,0.028758,0.008512,0.04003,0.674384,0.07878,0.040124,0.024991,0.02107,0.006093,0.044099,0.007661,0.060642,0.08314,0.095121,0.023262,0.048192,0.09745,0.028758,0.008512,0.04003,0.674384,0.07878,0.040124,0.024991,0.02107,0.006093,0.044099,0.007661,0.060642
Family_Size,,,,,,,,,0.018183,0.058912,0.041225,0.029656,,,0.041621,0.050027,,,0.05843,0.029656,,0.05843,0.051864,0.058913,,0.038741,0.188583,0.188583,,,,0.188583,0.188583,,0.188583,0.188583,0.188583,,,0.188583,0.188583,,0.188583,0.13068,0.009192,0.024632,0.400447,0.08479,0.034324,0.063779,0.211615,0.321877,0.120462,0.109852,0.149196,0.161014,0.136646,0.100915,0.080708,0.1065,0.278407,0.387968,0.025859,0.063847,0.92879,0.21177,0.052239,0.259661,0.465477,0.24996,0.189938,0.186896,0.142487,0.345909,0.045961,0.183738,0.271247,0.093083,,,,0.060048,0.067598,0.036553,,0.08719,0.101271,0.099546,0.093501,0.098081,0.093123,,,0.098024,0.349937,0.248195,0.016256,0.025208,0.392948,0.292588,0.032756,0.141834,0.189565,0.349527,0.044842,0.034309,0.034892,0.35048,0.351223,0.037147,0.006632,0.812458,0.384581,0.002428,0.069389,0.959446,0.233694,0.066231,0.180793,0.274656,0.721893,0.009615,0.046134,0.183217,0.070048,0.059855,0.556446,0.736225,0.012761,0.018252,,0.02837,0.041584,0.026653,0.00829,0.016081,0.035835,0.005438,0.002557,0.002091,0.012084,0.012972,0.030576,0.035835,0.006093,0.783629,0.802639,,,0.802589,0.613712,0.090891,0.303004,0.787103,0.939822,0.765114,0.872084,0.862085,0.746817,0.691232,,0.862431,0.188583,0.188583,,,,0.188583,0.188583,,0.188583,0.188583,0.188583,,,0.188583,0.188583,,0.188583,0.812458,0.384581,0.002428,0.069389,0.959446,0.233694,0.066231,0.180793,0.274656,0.721893,0.009615,0.046134,0.183217,0.070048,0.059855,0.556446,0.736225,0.812458,0.384581,0.002428,0.069389,0.959446,0.233694,0.066231,0.180793,0.274656,0.721893,0.009615,0.046134,0.183217,0.070048,0.059855,0.556446,0.736225
LastName,,,,,,,,,,0.045422,0.043787,0.037784,,,0.043965,0.047069,,,0.046425,0.037784,,0.046425,0.030302,0.045419,,0.04262,0.037205,0.037205,,,,0.037205,0.037205,,0.037205,0.037205,0.037205,,,0.037205,0.037205,,0.037205,0.092783,0.013455,0.027648,0.007505,0.022827,0.154175,0.061868,0.006972,0.00356,0.041689,0.030933,0.024634,0.059173,0.090794,0.011163,0.05057,0.061033,0.00861,0.01425,0.039999,0.040554,0.001165,0.383985,0.042876,0.02784,0.033116,0.011181,0.037617,0.005446,0.010779,0.013153,0.001137,0.021846,0.026272,0.025416,,,,0.032409,0.031779,0.032587,,0.001731,0.014509,0.009536,0.025195,0.021965,0.025395,,,0.007359,0.034386,0.004402,0.018378,0.012393,0.018425,0.013978,0.038632,0.025552,0.037251,0.025394,0.013867,0.000914,0.003486,0.02623,0.033874,0.007629,0.00045,0.074815,0.017013,0.050838,0.018712,0.046311,0.016312,0.033968,0.089444,0.028727,0.112194,0.070416,0.011044,0.045961,0.099665,0.002831,0.065134,0.092802,0.035676,0.047348,,0.026375,0.040396,0.03329,0.046796,0.028487,0.074298,0.030983,0.026163,0.028205,0.034998,0.035607,0.038967,0.074298,0.003692,0.041313,0.024426,,,0.003276,0.091817,0.038554,0.091157,0.022515,0.049731,0.044031,0.074581,0.043762,0.039077,0.083631,,0.048146,0.037205,0.037205,,,,0.037205,0.037205,,0.037205,0.037205,0.037205,,,0.037205,0.037205,,0.037205,0.074815,0.017013,0.050838,0.018712,0.046311,0.016312,0.033968,0.089444,0.028727,0.112194,0.070416,0.011044,0.045961,0.099665,0.002831,0.065134,0.092802,0.074815,0.017013,0.050838,0.018712,0.046311,0.016312,0.033968,0.089444,0.028727,0.112194,0.070416,0.011044,0.045961,0.099665,0.002831,0.065134,0.092802
Pclass.COUNT(df),,,,,,,,,,,0.745825,0.560631,,,0.752056,0.881788,,,0.997086,0.560631,,0.997086,0.843204,1.0,,0.706529,0.112826,0.112826,,,,0.112826,0.112826,,0.112826,0.112826,0.112826,,,0.112826,0.112826,,0.112826,0.340792,0.162309,0.045378,0.06234,0.209899,0.282017,0.423769,0.064038,0.162052,0.170669,0.242664,0.281838,0.295861,0.336805,0.101273,0.095838,0.099216,0.08436,0.263251,0.471765,0.073497,0.04429,0.054611,0.872488,0.169778,0.023063,0.025749,0.474576,0.049399,0.056616,0.089641,0.227365,0.035591,0.298324,0.052601,,,,0.086652,0.062117,0.149583,,0.241585,0.156198,0.189127,0.055365,0.091688,0.052867,,,0.201529,0.124863,0.155491,0.082662,0.07681,0.058759,0.134888,0.07979,0.222561,0.111317,0.122435,0.041611,0.125347,0.156833,0.099944,0.118134,0.154467,0.081242,0.120342,0.286429,0.470999,0.055898,0.065073,0.375687,0.844234,0.149508,0.024166,0.080989,0.146406,0.11116,0.004268,0.053115,0.11079,0.079089,0.010131,0.480024,0.226746,,0.299091,0.424235,0.450701,0.498569,0.28359,0.056798,0.456075,0.290266,0.456482,0.476032,0.480029,0.430538,0.056798,0.318078,0.103644,0.096883,,,0.040749,0.072506,0.201913,0.158763,0.090329,0.00842,0.127389,0.118361,0.067045,0.116118,0.152032,,0.06807,0.112826,0.112826,,,,0.112826,0.112826,,0.112826,0.112826,0.112826,,,0.112826,0.112826,,0.112826,0.120342,0.286429,0.470999,0.055898,0.065073,0.375687,0.844234,0.149508,0.024166,0.080989,0.146406,0.11116,0.004268,0.053115,0.11079,0.079089,0.010131,0.120342,0.286429,0.470999,0.055898,0.065073,0.375687,0.844234,0.149508,0.024166,0.080989,0.146406,0.11116,0.004268,0.053115,0.11079,0.079089,0.010131


In [38]:
# Flag every column that has at least one correlation above `threshold`
# (NaN entries compare as False, so blank cells in `upper` never trigger a drop).
# NOTE(review): `upper` is presumably the upper triangle of the correlation matrix — confirm.
exceeds_threshold = (upper > threshold).any(axis=0)
collinear_features = upper.columns[exceeds_threshold.values].tolist()

# Keep only the features that were not flagged as collinear.
features_filtered = features.drop(columns=collinear_features)
#features_positive = features_filtered.loc[:, features_filtered.ge(0).all()]
n_features_kept = features_filtered.shape[1]
print('The number of features that passed the collinearity threshold: ', n_features_kept)

The number of features that passed the collinearity threshold:  107


In [39]:
# Option 0: every generated feature; option 1: features left after the collinearity filter.
FE_option0, FE_option1 = features.columns, features_filtered.columns
print(*(len(option) for option in (FE_option0, FE_option1)))

213 107


## FS by the SelectFromModel with LinearSVC

In [40]:
# Fit an L1-penalised linear SVM and let SelectFromModel keep the features
# whose coefficients pass its default importance threshold.
lsvc = LinearSVC(C=0.01, penalty="l1", dual=False).fit(train, target)
model = SelectFromModel(lsvc, prefit=True)
X_new = model.transform(train)

# Compute the boolean support mask once (it was previously re-evaluated for
# every column) and use it to recover the names of the retained columns.
selected_columns = train.columns[model.get_support()].tolist()
X_selected_df = pd.DataFrame(X_new, columns=selected_columns)
X_selected_df.shape

(891, 39)

In [44]:
# Boolean mask of the features kept by `model` (True = kept).
# NOTE(review): `model` is rebound by the Lasso cell further down, so the mask
# shown depends on which selector cell ran last.
model.get_support()

array([False,  True,  True, False, False, False, False, False,  True,
        True,  True, False, False, False, False, False, False, False,
        True, False, False, False, False,  True, False, False,  True,
       False, False, False, False, False, False, False, False, False,
       False, False, False,  True,  True, False, False,  True, False,
       False, False, False,  True, False, False, False, False, False,
       False, False, False, False, False, False,  True,  True, False,
       False, False,  True, False, False, False, False, False, False,
       False, False, False, False, False,  True, False, False, False,
       False, False, False, False, False,  True, False, False, False,
        True, False, False, False,  True,  True, False, False, False,
        True, False, False, False,  True, False, False, False,  True,
        True, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False,

In [45]:
# Option 2: column names of the current X_selected_df.
# X_selected_df is reassigned by the Lasso cell below, so this cell has to run
# right after the LinearSVC selection for FE_option2 to hold the LinearSVC subset.
FE_option2 = X_selected_df.columns
FE_option2

Index(['Age', 'Fare', 'LastName', 'Pclass.COUNT(df)', 'Pclass.MODE(df.Age)',
       'Pclass.NUM_UNIQUE(df.Age)', 'Pclass.NUM_UNIQUE(df.LastName)',
       'Sex.COUNT(df)', 'Sex.NUM_UNIQUE(df.Fare)',
       'Sex.NUM_UNIQUE(df.LastName)', 'Age.COUNT(df)', 'Age.MODE(df.LastName)',
       'Fare.COUNT(df)', 'Fare.MODE(df.Age)', 'Fare.MODE(df.LastName)',
       'Embarked.COUNT(df)', 'Embarked.NUM_UNIQUE(df.Age)',
       'Embarked.NUM_UNIQUE(df.LastName)', 'Title.COUNT(df)',
       'Title.MODE(df.Age)', 'Title.MODE(df.Fare)', 'Title.NUM_UNIQUE(df.Age)',
       'Title.NUM_UNIQUE(df.Fare)', 'Title.NUM_UNIQUE(df.LastName)',
       'Deck.COUNT(df)', 'Deck.MODE(df.Fare)', 'Deck.NUM_UNIQUE(df.Age)',
       'Deck.NUM_UNIQUE(df.LastName)', 'Family_Size.COUNT(df)',
       'Family_Size.MODE(df.Age)', 'Family_Size.MODE(df.Fare)',
       'Family_Size.NUM_UNIQUE(df.Age)', 'Family_Size.NUM_UNIQUE(df.LastName)',
       'Title_Sex.COUNT(df)', 'Title_Sex.NUM_UNIQUE(df.Fare)',
       'Title_Sex.NUM_UNIQUE(df.La

## FS by the SelectFromModel with Lasso

In [46]:
# Fit a cross-validated Lasso (5 folds) and let SelectFromModel keep the
# features whose coefficients pass its default importance threshold.
lasso = LassoCV(cv=5).fit(train, target)
model = SelectFromModel(lasso, prefit=True)
X_new = model.transform(train)

# Compute the boolean support mask once (it was previously re-evaluated for
# every column) and use it to recover the names of the retained columns.
selected_columns = train.columns[model.get_support()].tolist()
X_selected_df = pd.DataFrame(X_new, columns=selected_columns)

In [47]:
# Option 3: column names of the current X_selected_df.
# This is the Lasso subset only if the Lasso selection cell was the last one
# to assign X_selected_df.
FE_option3 = X_selected_df.columns
FE_option3

Index(['Age', 'Fare', 'LastName', 'Pclass.COUNT(df)',
       'Pclass.NUM_UNIQUE(df.Fare)', 'Sex.COUNT(df)', 'Age.COUNT(df)',
       'Age.MODE(df.LastName)', 'Fare.COUNT(df)', 'Fare.MODE(df.Age)',
       'Fare.MODE(df.LastName)', 'Embarked.COUNT(df)', 'Title.COUNT(df)',
       'Title.MODE(df.Age)', 'Title.MODE(df.Fare)',
       'Title.NUM_UNIQUE(df.Fare)', 'Title.NUM_UNIQUE(df.LastName)',
       'Deck.COUNT(df)', 'Deck.MODE(df.Fare)', 'Family_Size.MODE(df.Age)',
       'Family_Size.MODE(df.Fare)', 'Family_Size.NUM_UNIQUE(df.Age)',
       'Family_Size.NUM_UNIQUE(df.LastName)'],
      dtype='object')

In [48]:
# Display the feature matrix produced by the most recently run selector cell.
X_selected_df

Unnamed: 0,Age,Fare,LastName,Pclass.COUNT(df),Pclass.NUM_UNIQUE(df.Fare),Sex.COUNT(df),Age.COUNT(df),Age.MODE(df.LastName),Fare.COUNT(df),Fare.MODE(df.Age),...,Title.MODE(df.Age),Title.MODE(df.Fare),Title.NUM_UNIQUE(df.Fare),Title.NUM_UNIQUE(df.LastName),Deck.COUNT(df),Deck.MODE(df.Fare),Family_Size.MODE(df.Age),Family_Size.MODE(df.Fare),Family_Size.NUM_UNIQUE(df.Age),Family_Size.NUM_UNIQUE(df.LastName)
0,22.0,7.2500,100.0,709.0,133.0,843.0,43.0,21.0,18.0,22.0,...,26.0,8.05,215.0,650.0,1014.0,8.050,26.0,26.0000,61.0,131.0
1,38.0,71.2833,182.0,323.0,107.0,466.0,14.0,9.0,2.0,38.0,...,31.0,26.00,140.0,194.0,94.0,263.000,26.0,26.0000,61.0,131.0
2,26.0,7.9250,329.0,709.0,133.0,466.0,166.0,69.0,23.0,20.0,...,18.0,7.75,129.0,219.0,1014.0,8.050,26.0,8.0500,79.0,701.0
3,35.0,53.1000,267.0,323.0,107.0,466.0,23.0,1.0,6.0,18.0,...,31.0,26.00,140.0,194.0,94.0,263.000,26.0,26.0000,61.0,131.0
4,35.0,8.0500,15.0,709.0,133.0,843.0,23.0,1.0,60.0,26.0,...,26.0,8.05,215.0,650.0,1014.0,8.050,26.0,8.0500,79.0,701.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,27.0,13.0000,535.0,277.0,48.0,843.0,30.0,9.0,59.0,30.0,...,27.0,13.00,5.0,8.0,1014.0,8.050,26.0,8.0500,79.0,701.0
887,19.0,30.0000,294.0,323.0,107.0,466.0,29.0,7.0,7.0,19.0,...,18.0,7.75,129.0,219.0,65.0,262.375,26.0,8.0500,79.0,701.0
888,18.0,23.4500,383.0,709.0,133.0,466.0,87.0,69.0,4.0,6.0,...,18.0,7.75,129.0,219.0,1014.0,8.050,24.0,19.2583,26.0,15.0
889,26.0,30.0000,69.0,323.0,107.0,843.0,166.0,69.0,7.0,19.0,...,26.0,8.05,215.0,650.0,94.0,263.000,26.0,8.0500,79.0,701.0


## FS by the SelectKBest with Chi-2

In [49]:
# Adapted from https://towardsdatascience.com/feature-selection-techniques-in-machine-learning-with-python-f24e7da3f36e
# with k='all', so that every feature receives a chi-squared score.
bestfeatures = SelectKBest(score_func=chi2, k='all')
fit = bestfeatures.fit(train, target)

# One single-column frame for the feature names and one for their scores.
dfcolumns = pd.DataFrame(train.columns)
dfscores = pd.DataFrame(fit.scores_)

# Put names and scores side by side under readable column labels.
featureScores = pd.DataFrame({
    'Feature': dfcolumns.iloc[:, 0],
    'Score': dfscores.iloc[:, 0],
})

# List the features from the highest chi-squared score to the lowest.
print(featureScores.nlargest(len(dfcolumns), 'Score'))

                               Feature         Score
94                     Title.COUNT(df)  35488.726510
108      Title.NUM_UNIQUE(df.LastName)  29591.581330
128                     Deck.COUNT(df)  18118.376846
141       Deck.NUM_UNIQUE(df.LastName)  12352.761769
26                       Sex.COUNT(df)  12015.112823
..                                 ...           ...
37         Sex.NUM_UNIQUE(df.Embarked)      0.000000
13         Pclass.MODE(df.Family_Size)      0.000000
160     Family_Size.NUM_UNIQUE(df.Sex)      0.000000
38      Sex.NUM_UNIQUE(df.Family_Size)      0.000000
173  Title_Sex.NUM_UNIQUE(df.Embarked)      0.000000

[201 rows x 2 columns]


In [50]:
# Option 4: features whose chi-squared score is strictly above 1000.
high_score_mask = featureScores['Score'] > 1000
FE_option4 = featureScores.loc[high_score_mask, 'Feature']
len(FE_option4)

23

In [51]:
# Option 5: features whose chi-squared score is strictly above 100.
moderate_score_mask = featureScores['Score'] > 100
FE_option5 = featureScores.loc[moderate_score_mask, 'Feature']
len(FE_option5)

49

## FS by the Recursive Feature Elimination (RFE) with Logistic Regression

In [52]:
# Recursive feature elimination around a logistic regression: the estimator is
# refit repeatedly and 10 features are dropped per round (step=10) until 50
# remain; verbose=5 prints one progress line per round.
# NOTE(review): the selector is fitted on X_norm while the resulting mask is
# later applied to train's columns — this assumes both share the same column
# order; confirm where X_norm is built.
rfe_selector = RFE(estimator=LogisticRegression(), n_features_to_select=50, step=10, verbose=5)
rfe_selector.fit(X_norm, target)

Fitting estimator with 213 features.
Fitting estimator with 203 features.
Fitting estimator with 193 features.
Fitting estimator with 183 features.
Fitting estimator with 173 features.
Fitting estimator with 163 features.
Fitting estimator with 153 features.
Fitting estimator with 143 features.
Fitting estimator with 133 features.
Fitting estimator with 123 features.
Fitting estimator with 113 features.
Fitting estimator with 103 features.
Fitting estimator with 93 features.
Fitting estimator with 83 features.
Fitting estimator with 73 features.
Fitting estimator with 63 features.
Fitting estimator with 53 features.


RFE(estimator=LogisticRegression(), n_features_to_select=50, step=10, verbose=5)

In [53]:
# Boolean mask of the features retained by the RFE selector (True = kept).
rfe_selector.get_support()

array([False,  True,  True, False, False, False, False,  True, False,
       False, False, False, False, False, False,  True, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
        True, False, False, False, False, False, False, False, False,
        True, False, False,  True, False, False, False, False, False,
       False, False, False, False,  True, False,  True,  True,  True,
        True,  True,  True,  True, False, False, False,  True, False,
        True, False,  True, False, False, False, False, False, False,
       False,  True, False, False, False, False, False, False, False,
       False, False, False, False,  True,  True,  True, False,  True,
       False,  True, False, False, False,  True, False,  True,  True,
        True,  True, False, False, False, False, False, False, False,
       False, False, False,  True, False,  True, False, False, False,
       False, False,

In [54]:
# Translate the RFE mask into the names of the retained columns of `train`.
rfe_support = rfe_selector.get_support()
rfe_feature = train.columns[rfe_support].tolist()
print(len(rfe_feature), 'selected features')

50 selected features


In [55]:
# Option 6: the feature names kept by RFE with logistic regression.
FE_option6 = rfe_feature

## FS by the SelectFromModel with Random Forest (listed as "RFE with Random Forest" in the summary)

In [56]:
# Embedded selection: fit a 200-tree random forest and keep the features whose
# importance is at least 1.25x the median importance. This is a single fit via
# SelectFromModel; no recursive elimination takes place here.
# NOTE(review): no random_state is set on the forest, so the selected subset
# (and its size) can change from run to run.
embeded_rf_selector = SelectFromModel(RandomForestClassifier(n_estimators=200), threshold='1.25*median')
embeded_rf_selector.fit(train, target)

SelectFromModel(estimator=RandomForestClassifier(n_estimators=200),
                threshold='1.25*median')

In [57]:
# Translate the random-forest selector mask into the names of the retained columns of `train`.
embeded_rf_support = embeded_rf_selector.get_support()
embeded_rf_feature = train.columns[embeded_rf_support].tolist()
print(len(embeded_rf_feature), 'selected features')

89 selected features


In [58]:
# Option 7: the feature names kept by the random-forest SelectFromModel step.
FE_option7 = embeded_rf_feature

# Modeling

## The Random Forest Classifiers for 8 options of selected feature sets

In [59]:
def RF(features_set, file):
    """Tune a random forest on one feature subset and report its training accuracy.

    Reads the notebook-level ``train``, ``test`` and ``target`` objects.

    Parameters
    ----------
    features_set : list-like of column labels
        Columns to select from both ``train`` and ``test``.
    file : str
        Intended CSV name for the predictions. Currently unused because the
        export line below is commented out.

    Returns
    -------
    float
        Accuracy of the tuned model on the training data itself, in percent,
        rounded to two decimals. This is a resubstitution score, not a
        hold-out estimate; the cross-validated accuracy of the best
        configuration is available as ``best_score_`` on the fitted search.
    """
    train_fe = train[features_set]
    test_fe = test[features_set]

    # A single fit is enough: GridSearchCV cross-validates every candidate and
    # then (refit=True by default) refits the best one on the full training
    # data. The former second `.fit(...)` call repeated the whole search.
    random_forest = GridSearchCV(
        estimator=RandomForestClassifier(),
        param_grid={'n_estimators': [100, 500]},
        cv=5,
    ).fit(train_fe, target)

    # Predictions are only consumed by the disabled export below.
    Y_pred = random_forest.predict(test_fe).astype(int)
    #pd.DataFrame({'Survived': Y_pred}, index=testdf.index).reset_index().to_csv(file, index=False)

    acc_random_forest = round(random_forest.score(train_fe, target) * 100, 2)
    return acc_random_forest

In [60]:
# Each feature option paired with the submission file name passed to RF.
feature_set_runs = [
    (FE_option0, 'survived_FT.csv'),
    (FE_option1, 'survived_FE1_Pearson.csv'),
    (FE_option2, 'survived_FE2_LinSVC.csv'),
    (FE_option3, 'survived_FE3_Lasso.csv'),
    (FE_option4, 'survived_FE4_Chi2_1000.csv'),
    (FE_option5, 'survived_FE5_Chi2_100.csv'),
    (FE_option6, 'survived_FE6_RFE_LogR.csv'),
    (FE_option7, 'survived_FE7_RFE_RF.csv'),
]

# Run the tuning in option order and keep one accuracy per option.
acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7 = [
    RF(feature_set, output_file) for feature_set, output_file in feature_set_runs
]

In [61]:
# Human-readable label for each feature option, in option order.
model_labels = [
    'FT',
    'FT + Pearson correlation',
    'FT + SelectFromModel with LinearSVC',
    'FT + SelectFromModel with Lasso',
    'FT + SelectKBest with Chi-2 with Score > 1000',
    'FT + SelectKBest with Chi-2 with Score > 100',
    'FT + RFE with Logistic Regression',
    'FT + RFE with Random Forest',
]
# Accuracy returned by RF for the matching option.
model_scores = [acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7]

models = pd.DataFrame({'Model': model_labels, 'acc': model_scores})

In [66]:
# Summary table of the eight feature options.
# `acc` is what RF returns: accuracy scored on the same training rows the model
# was fitted on, so values near 100 do not measure generalization.
models

Unnamed: 0,Model,acc
0,FT,99.89
1,FT + Pearson correlation,99.89
2,FT + SelectFromModel with LinearSVC,99.89
3,FT + SelectFromModel with Lasso,99.89
4,FT + SelectKBest with Chi-2 with Score > 1000,98.88
5,FT + SelectKBest with Chi-2 with Score > 100,99.89
6,FT + RFE with Logistic Regression,98.88
7,FT + RFE with Random Forest,99.89
