In [144]:
%matplotlib inline

In [145]:
import pandas as pd

In [146]:
import featuretools as ft

In [147]:
raw = 'raw'
processed = 'processed'
features = 'features'

In [148]:
df_train = pd.read_csv(f'{raw}/train.csv')
df_test = pd.read_csv(f'{raw}/test.csv')

df_train['src'] = 'train'
df_test['src'] = 'test'
union = pd.concat([df_train, df_test], sort=False)
union.reset_index(drop=True, inplace=True)

In [149]:
es = ft.EntitySet(id="titanic")

In [150]:
es.entity_from_dataframe(entity_id="Passengers",
                         index='PassengerId',
                         dataframe=union[['PassengerId', 'Pclass', 'Age', 'SibSp', 'Fare', 'Embarked', 'Sex']])

Entityset: titanic
  Entities:
    Passengers [Rows: 1309, Columns: 7]
  Relationships:
    No relationships

In [151]:
es = es.normalize_entity(base_entity_id='Passengers', new_entity_id='Pclass', index='Pclass')

In [152]:
es.add_interesting_values(verbose=True)

2020-02-11 19:47:21,605 featuretools.entityset - INFO    Variable Embarked: Marking S as an interesting value
2020-02-11 19:47:21,607 featuretools.entityset - INFO    Variable Embarked: Marking C as an interesting value
2020-02-11 19:47:21,608 featuretools.entityset - INFO    Variable Embarked: Marking Q as an interesting value
2020-02-11 19:47:21,613 featuretools.entityset - INFO    Variable Sex: Marking male as an interesting value
2020-02-11 19:47:21,615 featuretools.entityset - INFO    Variable Sex: Marking female as an interesting value


In [153]:
feature_matrix, feature_defs = ft.dfs(entityset=es,
                          max_depth=3,
                          target_entity="Passengers",
                          where_primitives=["count"], trans_primitives=[],
                          verbose=3)

Built 29 features
Elapsed: 00:00 | Progress: 100%|██████████


In [154]:
feature_defs

[<Feature: Pclass>,
 <Feature: Age>,
 <Feature: SibSp>,
 <Feature: Fare>,
 <Feature: Embarked>,
 <Feature: Sex>,
 <Feature: Pclass.SUM(Passengers.SibSp)>,
 <Feature: Pclass.SUM(Passengers.Fare)>,
 <Feature: Pclass.SUM(Passengers.Age)>,
 <Feature: Pclass.STD(Passengers.SibSp)>,
 <Feature: Pclass.STD(Passengers.Fare)>,
 <Feature: Pclass.STD(Passengers.Age)>,
 <Feature: Pclass.MAX(Passengers.SibSp)>,
 <Feature: Pclass.MAX(Passengers.Fare)>,
 <Feature: Pclass.MAX(Passengers.Age)>,
 <Feature: Pclass.SKEW(Passengers.SibSp)>,
 <Feature: Pclass.SKEW(Passengers.Fare)>,
 <Feature: Pclass.SKEW(Passengers.Age)>,
 <Feature: Pclass.MIN(Passengers.SibSp)>,
 <Feature: Pclass.MIN(Passengers.Fare)>,
 <Feature: Pclass.MIN(Passengers.Age)>,
 <Feature: Pclass.MEAN(Passengers.SibSp)>,
 <Feature: Pclass.MEAN(Passengers.Fare)>,
 <Feature: Pclass.MEAN(Passengers.Age)>,
 <Feature: Pclass.COUNT(Passengers)>,
 <Feature: Pclass.NUM_UNIQUE(Passengers.Sex)>,
 <Feature: Pclass.NUM_UNIQUE(Passengers.Embarked)>,
 <Feat

In [161]:
feature_columns = [column for column in feature_matrix.columns if column not in [
    'Sex', 'Pclass.MODE(Passengers.Sex)', 'Pclass.MODE(Passengers.Embarked)', 'Embarked', 'Fare', 'Age'] + list(union.columns.values)]

In [162]:
t = feature_matrix[feature_columns].isnull().sum()
t[t > 0]

Series([], dtype: int64)

In [163]:
t = feature_matrix[feature_columns].isna().sum().sum()
t[t > 0]

array([], dtype=int64)

In [164]:
feature_matrix[feature_columns].head(1)

Unnamed: 0_level_0,Pclass.SUM(Passengers.SibSp),Pclass.SUM(Passengers.Fare),Pclass.SUM(Passengers.Age),Pclass.STD(Passengers.SibSp),Pclass.STD(Passengers.Fare),Pclass.STD(Passengers.Age),Pclass.MAX(Passengers.SibSp),Pclass.MAX(Passengers.Fare),Pclass.MAX(Passengers.Age),Pclass.SKEW(Passengers.SibSp),...,Pclass.SKEW(Passengers.Age),Pclass.MIN(Passengers.SibSp),Pclass.MIN(Passengers.Fare),Pclass.MIN(Passengers.Age),Pclass.MEAN(Passengers.SibSp),Pclass.MEAN(Passengers.Fare),Pclass.MEAN(Passengers.Age),Pclass.COUNT(Passengers),Pclass.NUM_UNIQUE(Passengers.Sex),Pclass.NUM_UNIQUE(Passengers.Embarked)
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,403,9418.4452,12433.0,1.299681,11.494358,11.958202,8,69.55,74.0,3.420424,...,0.424489,0,0.0,0.17,0.568406,13.302889,24.816367,709,2,3


In [167]:
union = union.merge(feature_matrix[feature_columns], right_index=True, left_on='PassengerId')

In [179]:
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression

trn_idx = union['src'] == 'train'
trn_idx

target_column = 'Survived'
cl = LogisticRegression(penalty="l2", solver='lbfgs', max_iter=1000)
selector = SelectFromModel(cl, max_features=10)
selector.fit(union[trn_idx][feature_columns], union[trn_idx][target_column])

SelectFromModel(estimator=LogisticRegression(C=1.0, class_weight=None,
                                             dual=False, fit_intercept=True,
                                             intercept_scaling=1, l1_ratio=None,
                                             max_iter=1000, multi_class='warn',
                                             n_jobs=None, penalty='l2',
                                             random_state=None, solver='lbfgs',
                                             tol=0.0001, verbose=0,
                                             warm_start=False),
                max_features=10, norm_order=1, prefit=False, threshold=None)

In [178]:
ft_support = embeded_lr_selector.get_support()
ft_support = union[trn_idx][feature_columns].loc[:,ft_support].columns.tolist()
ft_support

['Pclass.SUM(Passengers.SibSp)',
 'Pclass.MAX(Passengers.Fare)',
 'Pclass.MAX(Passengers.Age)',
 'Pclass.COUNT(Passengers)']

In [181]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
kfold = KFold(random_state=42, n_splits=3)
cross_val_score(cl, union[trn_idx][feature_columns], union[trn_idx][target_column], cv=kfold)

array([0.63973064, 0.65993266, 0.7003367 ])

In [189]:
from sklearn.ensemble import GradientBoostingClassifier
cl = GradientBoostingClassifier()
cross_val_score(cl, union[trn_idx][ft_support], union[trn_idx][target_column], cv=kfold)

array([0.63973064, 0.65993266, 0.7003367 ])

In [191]:
union[trn_idx][ft_support].describe()

Unnamed: 0,Pclass.SUM(Passengers.SibSp),Pclass.MAX(Passengers.Fare),Pclass.MAX(Passengers.Age),Pclass.COUNT(Passengers)
count,891.0,891.0,891.0,891.0
mean,278.771044,177.706125,74.628507,526.212121
std,138.128231,189.403904,3.413133,203.211261
min,109.0,69.55,70.0,277.0
25%,141.0,69.55,74.0,323.0
50%,403.0,69.55,74.0,709.0
75%,403.0,73.5,74.0,709.0
max,403.0,512.3292,80.0,709.0


In [193]:
df_train = union[trn_idx]
df_test = union[~trn_idx]

In [194]:
df_train[['PassengerId'] + feature_columns].to_csv(f'{features}/ftools_all_train.csv', index=False)
df_test[['PassengerId'] + feature_columns].to_csv(f'{features}/ftools_all_test.csv', index=False)

In [195]:
df_train[['PassengerId'] + ft_support].to_csv(f'{features}/ftools_selected_train.csv', index=False)
df_test[['PassengerId'] + ft_support].to_csv(f'{features}/ftools_selected_test.csv', index=False)

In [196]:
ls -la features/ft*

-rw-r--r--  1 ivan.sobolev  staff   99434 Feb 11 19:56 features/ftools_all_test.csv
-rw-r--r--  1 ivan.sobolev  staff  210516 Feb 11 19:56 features/ftools_all_train.csv
-rw-r--r--  1 ivan.sobolev  staff   10273 Feb 11 19:56 features/ftools_selected_test.csv
-rw-r--r--  1 ivan.sobolev  staff   20970 Feb 11 19:56 features/ftools_selected_train.csv


In [197]:
!head -n1 features/ftools_all_test.csv

PassengerId,Pclass.SUM(Passengers.SibSp),Pclass.SUM(Passengers.Fare),Pclass.SUM(Passengers.Age),Pclass.STD(Passengers.SibSp),Pclass.STD(Passengers.Fare),Pclass.STD(Passengers.Age),Pclass.MAX(Passengers.SibSp),Pclass.MAX(Passengers.Fare),Pclass.MAX(Passengers.Age),Pclass.SKEW(Passengers.SibSp),Pclass.SKEW(Passengers.Fare),Pclass.SKEW(Passengers.Age),Pclass.MIN(Passengers.SibSp),Pclass.MIN(Passengers.Fare),Pclass.MIN(Passengers.Age),Pclass.MEAN(Passengers.SibSp),Pclass.MEAN(Passengers.Fare),Pclass.MEAN(Passengers.Age),Pclass.COUNT(Passengers),Pclass.NUM_UNIQUE(Passengers.Sex),Pclass.NUM_UNIQUE(Passengers.Embarked)


In [199]:
!head -n2 features/ftools_selected_train.csv

PassengerId,Pclass.SUM(Passengers.SibSp),Pclass.MAX(Passengers.Fare),Pclass.MAX(Passengers.Age),Pclass.COUNT(Passengers)
1,403,69.55,74.0,709
