## Import libraries

In [1]:
import gc
import pickle
import numpy as np
import pandas as pd
from tqdm import tqdm
import seaborn as sns
import featuretools as ft
import matplotlib.pyplot as plt
from sklearn.preprocessing import QuantileTransformer

## Load source datasets

In [2]:
# Read the competition's train and test CSVs and report their shapes.
train_csv_path = "../input/tabular-playground-series-may-2021/train.csv"
test_csv_path = "../input/tabular-playground-series-may-2021/test.csv"

train_df = pd.read_csv(train_csv_path)
test_df = pd.read_csv(test_csv_path)

shape_report = "train_df: {} \ntest_df: {}"
print(shape_report.format(train_df.shape, test_df.shape))

train_df: (100000, 52) 
test_df: (50000, 51)


In [3]:
# Use the 'id' column as the row index of both frames, then preview train.
train_df, test_df = (frame.set_index('id') for frame in (train_df, test_df))
train_df.head()

Unnamed: 0_level_0,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,...,feature_41,feature_42,feature_43,feature_44,feature_45,feature_46,feature_47,feature_48,feature_49,target
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,0,1,0,1,0,0,0,0,0,...,0,0,21,0,0,0,0,0,0,Class_2
1,0,0,0,0,2,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Class_1
2,0,0,0,0,0,0,0,0,0,2,...,0,1,0,0,0,0,13,2,0,Class_1
3,0,0,0,0,0,0,0,3,0,0,...,0,0,0,0,0,0,0,1,0,Class_4
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,Class_2


## Data Cleansing

In [4]:
# Overwrite specific values in selected test columns.
# Each entry is (column, value to replace, replacement value).
# NOTE(review): presumably these are values that occur only in the test set
# and are mapped to a neighbouring value seen in train — confirm.
test_value_fixes = [
    ('feature_3', 25, 26),
    ('feature_4', 36, 37),
    ('feature_21', 31, 36),
    ('feature_25', 24, 23),
    ('feature_34', 26, 25),
    ('feature_49', 21, 20),
]

for fix_col, stray_value, new_value in test_value_fixes:
    test_df.loc[test_df[fix_col] == stray_value, fix_col] = new_value

test_df.shape

(50000, 50)

In [5]:
# Drop every training row that holds one of the listed values in the given
# column. Filters are applied one column at a time, so a row is removed as
# soon as any single column matches.
# NOTE(review): presumably these are rare/outlier values (including the
# negative ones) — confirm the selection criterion.
train_values_to_drop = {
    'feature_5': [10],
    'feature_6': [26, 27],
    'feature_7': [30, 31],
    'feature_9': [17],
    'feature_10': [16],
    'feature_11': [12],
    'feature_15': [20],
    'feature_16': [18],
    'feature_23': [18, 19],
    'feature_27': [29],
    'feature_28': [23],
    'feature_29': [13],
    'feature_33': [24],
    'feature_32': [26, 27],
    'feature_35': [43, -2, 38, 39],
    'feature_38': [65, 55, -8, -3, -2, 63],
    'feature_39': [65, 66, -5, -3, -2, 63],
    'feature_42': [37, -2, -1],
    'feature_43': [33, 31],
}

for drop_col, bad_values in train_values_to_drop.items():
    train_df = train_df[~train_df[drop_col].isin(bad_values)]

train_df.shape

(99918, 51)

## Extract target labels

In [6]:
# Encode the string class labels as integers 0..3 (in label order) and show
# how many training rows fall into each encoded class.
class_labels = ['Class_1', 'Class_2', 'Class_3', 'Class_4']
class_map = {label: code for code, label in enumerate(class_labels)}

train_df['target'] = train_df['target'].map(class_map)
train_df.groupby('target').size().reset_index(name='Count')

Unnamed: 0,target,Count
0,0,8481
1,1,57446
2,2,21408
3,3,12583


In [7]:
# Pull the encoded target out as a NumPy array and remove it from the
# feature frame so that only predictors remain in train_df.
train_y = train_df['target'].to_numpy()
train_df = train_df.drop(columns=['target'])
print("train_y: {}".format(train_y.shape))

train_y: (99918,)


## Feature Engineering

In [8]:
# Turn 'id' back into an ordinary column on both frames.
train_df.reset_index(inplace=True)
test_df.reset_index(inplace=True)

# cat_cols: the id plus the five features used as group-by keys in the
# aggregation cells below; num_cols: every other column of train_df.
cat_cols = ['id','feature_2','feature_13','feature_22','feature_36','feature_44']
num_cols = [col for col in train_df.columns if col not in cat_cols]

# Stack train on top of test (train rows first; the later positional split
# relies on that order). DataFrame.append was deprecated in pandas 1.4 and
# removed in 2.0, so pd.concat is used instead. ignore_index=True gives the
# stacked frame a clean 0..n-1 index rather than repeating the row labels of
# both inputs; the merges that follow rebuild the index anyway.
combined_df = pd.concat([train_df, test_df], sort=False, ignore_index=True)
combined_df.head()

Unnamed: 0,id,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,...,feature_40,feature_41,feature_42,feature_43,feature_44,feature_45,feature_46,feature_47,feature_48,feature_49
0,0,0,0,1,0,1,0,0,0,0,...,3,0,0,21,0,0,0,0,0,0
1,1,0,0,0,0,2,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,13,2,0
3,3,0,0,0,0,0,0,0,3,0,...,0,0,0,0,0,0,0,0,1,0
4,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [9]:
# Aggregation spec for groupby().agg(): every column in num_cols gets its
# count, sum and mean computed per group.
fet_engg_dict = {col: ['count', 'sum', 'mean'] for col in num_cols}

fet_engg_dict

{'feature_0': ['count', 'sum', 'mean'],
 'feature_1': ['count', 'sum', 'mean'],
 'feature_3': ['count', 'sum', 'mean'],
 'feature_4': ['count', 'sum', 'mean'],
 'feature_5': ['count', 'sum', 'mean'],
 'feature_6': ['count', 'sum', 'mean'],
 'feature_7': ['count', 'sum', 'mean'],
 'feature_8': ['count', 'sum', 'mean'],
 'feature_9': ['count', 'sum', 'mean'],
 'feature_10': ['count', 'sum', 'mean'],
 'feature_11': ['count', 'sum', 'mean'],
 'feature_12': ['count', 'sum', 'mean'],
 'feature_14': ['count', 'sum', 'mean'],
 'feature_15': ['count', 'sum', 'mean'],
 'feature_16': ['count', 'sum', 'mean'],
 'feature_17': ['count', 'sum', 'mean'],
 'feature_18': ['count', 'sum', 'mean'],
 'feature_19': ['count', 'sum', 'mean'],
 'feature_20': ['count', 'sum', 'mean'],
 'feature_21': ['count', 'sum', 'mean'],
 'feature_23': ['count', 'sum', 'mean'],
 'feature_24': ['count', 'sum', 'mean'],
 'feature_25': ['count', 'sum', 'mean'],
 'feature_26': ['count', 'sum', 'mean'],
 'feature_27': ['count', 

In [10]:
# Per-group statistics keyed on feature_2: apply the aggregations listed in
# fet_engg_dict, flatten the (column, stat) labels into 'f2_<column>_<stat>',
# and attach the result to every row of combined_df that shares the key.
group_key = 'feature_2'
group_stats = combined_df.groupby([group_key]).agg(fet_engg_dict)
group_stats.columns = ['f2_' + '_'.join(label) for label in group_stats.columns]
combined_df = combined_df.merge(group_stats, on=[group_key], how='left')

# Release the intermediate table before moving on.
del group_stats
gc.collect()

combined_df.head()

Unnamed: 0,id,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,...,f2_feature_46_mean,f2_feature_47_count,f2_feature_47_sum,f2_feature_47_mean,f2_feature_48_count,f2_feature_48_sum,f2_feature_48_mean,f2_feature_49_count,f2_feature_49_sum,f2_feature_49_mean
0,0,0,0,1,0,1,0,0,0,0,...,0.502647,5855,2467,0.421349,5855,6147,1.049872,5855,3537,0.604099
1,1,0,0,0,0,2,1,0,0,0,...,0.519344,140094,54276,0.387426,140094,135874,0.969877,140094,77772,0.555142
2,2,0,0,0,0,0,0,0,0,0,...,0.519344,140094,54276,0.387426,140094,135874,0.969877,140094,77772,0.555142
3,3,0,0,0,0,0,0,0,3,0,...,0.519344,140094,54276,0.387426,140094,135874,0.969877,140094,77772,0.555142
4,4,0,0,0,0,0,0,0,0,0,...,0.519344,140094,54276,0.387426,140094,135874,0.969877,140094,77772,0.555142


In [11]:
# Per-group statistics keyed on feature_13: apply the aggregations listed in
# fet_engg_dict, flatten the (column, stat) labels into 'f13_<column>_<stat>',
# and attach the result to every row of combined_df that shares the key.
group_key = 'feature_13'
group_stats = combined_df.groupby([group_key]).agg(fet_engg_dict)
group_stats.columns = ['f13_' + '_'.join(label) for label in group_stats.columns]
combined_df = combined_df.merge(group_stats, on=[group_key], how='left')

# Release the intermediate table before moving on.
del group_stats
gc.collect()

combined_df.head()

Unnamed: 0,id,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,...,f13_feature_46_mean,f13_feature_47_count,f13_feature_47_sum,f13_feature_47_mean,f13_feature_48_count,f13_feature_48_sum,f13_feature_48_mean,f13_feature_49_count,f13_feature_49_sum,f13_feature_49_mean
0,0,0,0,1,0,1,0,0,0,0,...,0.519328,142253,55271,0.38854,142253,138229,0.971712,142253,79366,0.557921
1,1,0,0,0,0,2,1,0,0,0,...,0.519328,142253,55271,0.38854,142253,138229,0.971712,142253,79366,0.557921
2,2,0,0,0,0,0,0,0,0,0,...,0.519328,142253,55271,0.38854,142253,138229,0.971712,142253,79366,0.557921
3,3,0,0,0,0,0,0,0,3,0,...,0.570321,4643,1819,0.391773,4643,4836,1.041568,4643,2755,0.593366
4,4,0,0,0,0,0,0,0,0,0,...,0.519328,142253,55271,0.38854,142253,138229,0.971712,142253,79366,0.557921


In [12]:
# Per-group statistics keyed on feature_22: apply the aggregations listed in
# fet_engg_dict, flatten the (column, stat) labels into 'f22_<column>_<stat>',
# and attach the result to every row of combined_df that shares the key.
group_key = 'feature_22'
group_stats = combined_df.groupby([group_key]).agg(fet_engg_dict)
group_stats.columns = ['f22_' + '_'.join(label) for label in group_stats.columns]
combined_df = combined_df.merge(group_stats, on=[group_key], how='left')

# Release the intermediate table before moving on.
del group_stats
gc.collect()

combined_df.head()

Unnamed: 0,id,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,...,f22_feature_46_mean,f22_feature_47_count,f22_feature_47_sum,f22_feature_47_mean,f22_feature_48_count,f22_feature_48_sum,f22_feature_48_mean,f22_feature_49_count,f22_feature_49_sum,f22_feature_49_mean
0,0,0,0,1,0,1,0,0,0,0,...,0.513429,130577,50583,0.387381,130577,126534,0.969037,130577,72512,0.55532
1,1,0,0,0,0,2,1,0,0,0,...,0.513429,130577,50583,0.387381,130577,126534,0.969037,130577,72512,0.55532
2,2,0,0,0,0,0,0,0,0,0,...,0.580414,3625,1451,0.400276,3625,3603,0.993931,3625,2258,0.622897
3,3,0,0,0,0,0,0,0,3,0,...,0.513429,130577,50583,0.387381,130577,126534,0.969037,130577,72512,0.55532
4,4,0,0,0,0,0,0,0,0,0,...,0.513429,130577,50583,0.387381,130577,126534,0.969037,130577,72512,0.55532


In [13]:
# Per-group statistics keyed on feature_36: apply the aggregations listed in
# fet_engg_dict, flatten the (column, stat) labels into 'f36_<column>_<stat>',
# and attach the result to every row of combined_df that shares the key.
group_key = 'feature_36'
group_stats = combined_df.groupby([group_key]).agg(fet_engg_dict)
group_stats.columns = ['f36_' + '_'.join(label) for label in group_stats.columns]
combined_df = combined_df.merge(group_stats, on=[group_key], how='left')

# Release the intermediate table before moving on.
del group_stats
gc.collect()

combined_df.head()

Unnamed: 0,id,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,...,f36_feature_46_mean,f36_feature_47_count,f36_feature_47_sum,f36_feature_47_mean,f36_feature_48_count,f36_feature_48_sum,f36_feature_48_mean,f36_feature_49_count,f36_feature_49_sum,f36_feature_49_mean
0,0,0,0,1,0,1,0,0,0,0,...,0.518943,136541,52794,0.386653,136541,133044,0.974389,136541,76210,0.558147
1,1,0,0,0,0,2,1,0,0,0,...,0.518943,136541,52794,0.386653,136541,133044,0.974389,136541,76210,0.558147
2,2,0,0,0,0,0,0,0,0,0,...,0.532458,9905,3870,0.390712,9905,9756,0.984957,9905,5563,0.561636
3,3,0,0,0,0,0,0,0,3,0,...,0.518943,136541,52794,0.386653,136541,133044,0.974389,136541,76210,0.558147
4,4,0,0,0,0,0,0,0,0,0,...,0.518943,136541,52794,0.386653,136541,133044,0.974389,136541,76210,0.558147


In [14]:
# Per-group statistics keyed on feature_44: apply the aggregations listed in
# fet_engg_dict, flatten the (column, stat) labels into 'f44_<column>_<stat>',
# and attach the result to every row of combined_df that shares the key.
group_key = 'feature_44'
group_stats = combined_df.groupby([group_key]).agg(fet_engg_dict)
group_stats.columns = ['f44_' + '_'.join(label) for label in group_stats.columns]
combined_df = combined_df.merge(group_stats, on=[group_key], how='left')

# Release the intermediate table before moving on.
del group_stats
gc.collect()

combined_df.head()

Unnamed: 0,id,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,...,f44_feature_46_mean,f44_feature_47_count,f44_feature_47_sum,f44_feature_47_mean,f44_feature_48_count,f44_feature_48_sum,f44_feature_48_mean,f44_feature_49_count,f44_feature_49_sum,f44_feature_49_mean
0,0,0,0,1,0,1,0,0,0,0,...,0.519129,139790,53956,0.385979,139790,136112,0.973689,139790,77816,0.556664
1,1,0,0,0,0,2,1,0,0,0,...,0.519129,139790,53956,0.385979,139790,136112,0.973689,139790,77816,0.556664
2,2,0,0,0,0,0,0,0,0,0,...,0.519129,139790,53956,0.385979,139790,136112,0.973689,139790,77816,0.556664
3,3,0,0,0,0,0,0,0,3,0,...,0.519129,139790,53956,0.385979,139790,136112,0.973689,139790,77816,0.556664
4,4,0,0,0,0,0,0,0,0,0,...,0.519129,139790,53956,0.385979,139790,136112,0.973689,139790,77816,0.556664


## FeatureTools

In [15]:
def _feature_cols(*numbers):
    """Return ['id', 'feature_<n>', ...] for the given feature numbers, in order."""
    return ['id'] + ['feature_{}'.format(number) for number in numbers]


# Five column groups, each starting with 'id'; every group is passed to its
# own featuretools run in the cells below.
col_list_1 = _feature_cols(13, 36, 22, 2, 44)
col_list_2 = _feature_cols(0, 5, 12, 11, 18, 29, 37, 10, 17, 9, 20, 16, 23)
col_list_3 = _feature_cols(15, 49, 26, 25, 28, 40, 33, 3, 34, 47, 6, 45, 7, 32, 46)
col_list_4 = _feature_cols(41, 1, 27, 43, 24, 21, 8, 4, 42)
col_list_5 = _feature_cols(30, 35, 48, 31, 14, 19, 39, 38)

In [16]:
# Run featuretools deep feature synthesis on the col_list_1 columns, using
# arithmetic and percentile transform primitives, capped at 200 features.
numeric_primitives = ['add_numeric', 'subtract_numeric',
                      'multiply_numeric', 'divide_numeric',
                      'percentile']

es = ft.EntitySet(id='TPS')
es.entity_from_dataframe(entity_id='May', dataframe=combined_df[col_list_1], index='id')
feature_matrix1, feature_names = ft.dfs(
    entityset=es,
    target_entity='May',
    trans_primitives=numeric_primitives,
    max_depth=2,
    max_features=200,
    verbose=3,
    n_jobs=1,
)

# Keep only the generated features: drop the input columns (already present
# in combined_df), turn +/-inf into NaN and fill every NaN with 0.
source_cols = [col for col in col_list_1 if col != 'id']
feature_matrix1 = (
    feature_matrix1
    .drop(columns=source_cols)
    .replace([np.inf, -np.inf], np.nan)
    .fillna(0)
)
feature_matrix1.head()

Built 60 features
Elapsed: 00:01 | Progress: 100%|██████████


Unnamed: 0_level_0,feature_13 + feature_2,feature_13 + feature_22,feature_13 + feature_36,feature_13 + feature_44,feature_2 + feature_22,feature_2 + feature_36,feature_2 + feature_44,feature_22 + feature_36,feature_22 + feature_44,feature_36 + feature_44,...,feature_13 - feature_2,feature_13 - feature_22,feature_13 - feature_36,feature_13 - feature_44,feature_2 - feature_22,feature_2 - feature_36,feature_2 - feature_44,feature_22 - feature_36,feature_22 - feature_44,feature_36 - feature_44
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1,0,0,0,1,1,1,0,0,0,...,-1,0,0,0,1,1,1,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,2,1,0,2,1,0,3,2,1,...,0,-2,-1,0,-2,-1,0,1,2,1
3,1,1,1,1,0,0,0,0,0,0,...,1,1,1,1,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [17]:
# Run featuretools deep feature synthesis on the col_list_2 columns, using
# arithmetic and percentile transform primitives, capped at 200 features.
numeric_primitives = ['add_numeric', 'subtract_numeric',
                      'multiply_numeric', 'divide_numeric',
                      'percentile']

es = ft.EntitySet(id='TPS')
es.entity_from_dataframe(entity_id='May', dataframe=combined_df[col_list_2], index='id')
feature_matrix2, feature_names = ft.dfs(
    entityset=es,
    target_entity='May',
    trans_primitives=numeric_primitives,
    max_depth=2,
    max_features=200,
    verbose=3,
    n_jobs=1,
)

# Keep only the generated features: drop the input columns (already present
# in combined_df), turn +/-inf into NaN and fill every NaN with 0.
source_cols = [col for col in col_list_2 if col != 'id']
feature_matrix2 = (
    feature_matrix2
    .drop(columns=source_cols)
    .replace([np.inf, -np.inf], np.nan)
    .fillna(0)
)
feature_matrix2.head()

Built 200 features
Elapsed: 00:00 | Progress:  30%|███       

  trans_primitives: ['multiply_numeric', 'percentile', 'subtract_numeric']
This may be caused by a using a value of max_depth that is too small, not setting interesting values, or it may indicate no compatible variable types for the primitive were found in the data.


Elapsed: 00:02 | Progress: 100%|██████████


Unnamed: 0_level_0,feature_0 + feature_10,feature_0 + feature_11,feature_0 + feature_12,feature_0 + feature_16,feature_0 + feature_17,feature_0 + feature_18,feature_0 + feature_20,feature_0 + feature_23,feature_0 + feature_29,feature_0 + feature_37,...,feature_23 / feature_12,feature_23 / feature_16,feature_23 / feature_17,feature_23 / feature_18,feature_23 / feature_20,feature_23 / feature_29,feature_23 / feature_37,feature_23 / feature_5,feature_23 / feature_9,feature_29 / feature_0
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,0,0,0,6,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,0,0,0,3,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0,0,1,0,0,0,0,1,0,7,...,1.0,0.0,0.0,0.0,0.0,0.0,0.142857,0.0,0.5,0.0
3,0,0,0,1,0,3,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0,0,0,9,3,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
# Run featuretools deep feature synthesis on the col_list_3 columns, using
# arithmetic and percentile transform primitives, capped at 200 features.
numeric_primitives = ['add_numeric', 'subtract_numeric',
                      'multiply_numeric', 'divide_numeric',
                      'percentile']

es = ft.EntitySet(id='TPS')
es.entity_from_dataframe(entity_id='May', dataframe=combined_df[col_list_3], index='id')
feature_matrix3, feature_names = ft.dfs(
    entityset=es,
    target_entity='May',
    trans_primitives=numeric_primitives,
    max_depth=2,
    max_features=200,
    verbose=3,
    n_jobs=1,
)

# Keep only the generated features: drop the input columns (already present
# in combined_df), turn +/-inf into NaN and fill every NaN with 0.
source_cols = [col for col in col_list_3 if col != 'id']
feature_matrix3 = (
    feature_matrix3
    .drop(columns=source_cols)
    .replace([np.inf, -np.inf], np.nan)
    .fillna(0)
)
feature_matrix3.head()

Built 200 features
Elapsed: 00:00 | Progress:  33%|███▎      

  trans_primitives: ['multiply_numeric', 'percentile', 'subtract_numeric']
This may be caused by a using a value of max_depth that is too small, not setting interesting values, or it may indicate no compatible variable types for the primitive were found in the data.


Elapsed: 00:01 | Progress: 100%|██████████


Unnamed: 0_level_0,feature_15 + feature_25,feature_15 + feature_26,feature_15 + feature_28,feature_15 + feature_3,feature_15 + feature_32,feature_15 + feature_33,feature_15 + feature_34,feature_15 + feature_40,feature_15 + feature_45,feature_15 + feature_46,...,feature_32 / feature_15,feature_32 / feature_25,feature_32 / feature_26,feature_32 / feature_28,feature_32 / feature_3,feature_32 / feature_33,feature_32 / feature_34,feature_32 / feature_40,feature_32 / feature_45,feature_32 / feature_46
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,0,0,0,0,0,1,3,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,3,3,3,3,5,3,3,3,3,3,...,0.666667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0,1,2,0,0,19,2,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,11,11,12,11,11,11,12,11,11,11,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [19]:
# Run featuretools deep feature synthesis on the col_list_4 columns, using
# arithmetic and percentile transform primitives, capped at 200 features.
numeric_primitives = ['add_numeric', 'subtract_numeric',
                      'multiply_numeric', 'divide_numeric',
                      'percentile']

es = ft.EntitySet(id='TPS')
es.entity_from_dataframe(entity_id='May', dataframe=combined_df[col_list_4], index='id')
feature_matrix4, feature_names = ft.dfs(
    entityset=es,
    target_entity='May',
    trans_primitives=numeric_primitives,
    max_depth=2,
    max_features=200,
    verbose=3,
    n_jobs=1,
)

# Keep only the generated features: drop the input columns (already present
# in combined_df), turn +/-inf into NaN and fill every NaN with 0.
source_cols = [col for col in col_list_4 if col != 'id']
feature_matrix4 = (
    feature_matrix4
    .drop(columns=source_cols)
    .replace([np.inf, -np.inf], np.nan)
    .fillna(0)
)
feature_matrix4.head()

Built 198 features
Elapsed: 00:02 | Progress: 100%|██████████


Unnamed: 0_level_0,feature_1 + feature_21,feature_1 + feature_24,feature_1 + feature_27,feature_1 + feature_4,feature_1 + feature_41,feature_1 + feature_42,feature_1 + feature_43,feature_1 + feature_8,feature_21 + feature_24,feature_21 + feature_27,...,feature_4 - feature_41,feature_4 - feature_42,feature_4 - feature_43,feature_4 - feature_8,feature_41 - feature_42,feature_41 - feature_43,feature_41 - feature_8,feature_42 - feature_43,feature_42 - feature_8,feature_43 - feature_8
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,0,0,1,0,0,21,0,0,0,...,1,1,-20,1,0,-21,0,-21,0,21
1,0,0,0,2,0,0,0,0,0,0,...,2,2,2,2,0,0,0,0,0,0
2,5,1,0,0,0,1,0,0,6,5,...,0,-1,0,0,-1,0,0,1,1,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [20]:
# Run featuretools deep feature synthesis on the col_list_5 columns, using
# arithmetic and percentile transform primitives, capped at 200 features.
numeric_primitives = ['add_numeric', 'subtract_numeric',
                      'multiply_numeric', 'divide_numeric',
                      'percentile']

es = ft.EntitySet(id='TPS')
es.entity_from_dataframe(entity_id='May', dataframe=combined_df[col_list_5], index='id')
feature_matrix5, feature_names = ft.dfs(
    entityset=es,
    target_entity='May',
    trans_primitives=numeric_primitives,
    max_depth=2,
    max_features=200,
    verbose=3,
    n_jobs=1,
)

# Keep only the generated features: drop the input columns (already present
# in combined_df), turn +/-inf into NaN and fill every NaN with 0.
source_cols = [col for col in col_list_5 if col != 'id']
feature_matrix5 = (
    feature_matrix5
    .drop(columns=source_cols)
    .replace([np.inf, -np.inf], np.nan)
    .fillna(0)
)
feature_matrix5.head()

Built 156 features
Elapsed: 00:02 | Progress: 100%|██████████


Unnamed: 0_level_0,feature_14 + feature_19,feature_14 + feature_30,feature_14 + feature_31,feature_14 + feature_35,feature_14 + feature_38,feature_14 + feature_39,feature_14 + feature_48,feature_19 + feature_30,feature_19 + feature_31,feature_19 + feature_35,...,feature_31 - feature_35,feature_31 - feature_38,feature_31 - feature_39,feature_31 - feature_48,feature_35 - feature_38,feature_35 - feature_39,feature_35 - feature_48,feature_38 - feature_39,feature_38 - feature_48,feature_39 - feature_48
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,1,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2,1,1,1,4,1,3,1,1,1,...,0,-3,0,-2,-3,0,-2,3,1,-2
3,8,8,7,7,11,7,8,2,1,1,...,0,-4,0,-1,-4,0,-1,4,3,-1
4,2,2,3,2,3,2,3,0,1,0,...,1,0,1,0,-1,0,-1,1,0,-1


In [21]:
# Join the five featuretools matrices side by side on 'id'. Inner joins are
# chained left to right, starting from feature_matrix1.
feature_df = feature_matrix1
for next_matrix in (feature_matrix2, feature_matrix3, feature_matrix4, feature_matrix5):
    feature_df = pd.merge(
        feature_df,
        next_matrix,
        how='inner',
        on='id',
        sort=False,
        suffixes=('_x', '_y'),
    )

print("feature_df: {}".format(feature_df.shape))
feature_df.head()

feature_df: (149918, 764)


Unnamed: 0_level_0,feature_13 + feature_2,feature_13 + feature_22,feature_13 + feature_36,feature_13 + feature_44,feature_2 + feature_22,feature_2 + feature_36,feature_2 + feature_44,feature_22 + feature_36,feature_22 + feature_44,feature_36 + feature_44,...,feature_31 - feature_35,feature_31 - feature_38,feature_31 - feature_39,feature_31 - feature_48,feature_35 - feature_38,feature_35 - feature_39,feature_35 - feature_48,feature_38 - feature_39,feature_38 - feature_48,feature_39 - feature_48
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1,0,0,0,1,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,2,1,0,2,1,0,3,2,1,...,0,-3,0,-2,-3,0,-2,3,1,-2
3,1,1,1,1,0,0,0,0,0,0,...,0,-4,0,-1,-4,0,-1,4,3,-1
4,0,0,0,0,0,0,0,0,0,0,...,1,0,1,0,-1,0,-1,1,0,-1


In [22]:
# Index combined_df by 'id' and inner-join the featuretools features onto it,
# so every row carries its original, aggregated and synthesized columns.
combined_df = combined_df.set_index('id').merge(
    feature_df,
    how='inner',
    on='id',
    sort=False,
    suffixes=('_x', '_y'),
)

combined_df.head()

Unnamed: 0_level_0,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,...,feature_31 - feature_35,feature_31 - feature_38,feature_31 - feature_39,feature_31 - feature_48,feature_35 - feature_38,feature_35 - feature_39,feature_35 - feature_48,feature_38 - feature_39,feature_38 - feature_48,feature_39 - feature_48
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,0,1,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,2,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,2,...,0,-3,0,-2,-3,0,-2,3,1,-2
3,0,0,0,0,0,0,0,3,0,0,...,0,-4,0,-1,-4,0,-1,4,3,-1
4,0,0,0,0,0,0,0,0,0,0,...,1,0,1,0,-1,0,-1,1,0,-1


In [23]:
# Sanity check: list any rows that still contain NaN or +/-inf (an empty
# result means the frame is clean). The axis is passed by keyword because
# DataFrame.any() no longer accepts it positionally in pandas 2.x.
combined_df[combined_df.isin([np.nan, np.inf, -np.inf]).any(axis=1)].head()

Unnamed: 0_level_0,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,...,feature_31 - feature_35,feature_31 - feature_38,feature_31 - feature_39,feature_31 - feature_48,feature_35 - feature_38,feature_35 - feature_39,feature_35 - feature_48,feature_38 - feature_39,feature_38 - feature_48,feature_39 - feature_48
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1


In [24]:
# Remove every column that has at most two distinct (non-null) values,
# i.e. constant and binary columns alike.
distinct_counts = combined_df.nunique()
col_filter_list = distinct_counts[distinct_counts <= 2].index.tolist()

combined_df = combined_df.drop(columns=col_filter_list)
combined_df.head()

Unnamed: 0_level_0,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,...,feature_31 - feature_35,feature_31 - feature_38,feature_31 - feature_39,feature_31 - feature_48,feature_35 - feature_38,feature_35 - feature_39,feature_35 - feature_48,feature_38 - feature_39,feature_38 - feature_48,feature_39 - feature_48
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,0,1,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,2,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,2,...,0,-3,0,-2,-3,0,-2,3,1,-2
3,0,0,0,0,0,0,0,3,0,0,...,0,-4,0,-1,-4,0,-1,4,3,-1
4,0,0,0,0,0,0,0,0,0,0,...,1,0,1,0,-1,0,-1,1,0,-1


In [25]:
# Split the combined frame back by position: the first len(train_y) rows are
# the training rows, the remainder is the test set.
n_train_rows = train_y.shape[0]
train_df = combined_df.iloc[:n_train_rows].copy()
test_df = combined_df.iloc[n_train_rows:].copy()
train_df.shape, test_df.shape

((99918, 1489), (50000, 1489))

## Quantile Transformation

In [26]:
# Keep independent copies of both frames before the quantile transformation
# below overwrites columns of train_df / test_df.
train_df1, test_df1 = train_df.copy(), test_df.copy()
train_df1.shape, test_df1.shape

((99918, 1489), (50000, 1489))

In [27]:
# Columns left untouched by the quantile transformation; every other column
# of train_df ends up in num_cols and is transformed.
cat_cols = [
    'feature_0', 'feature_2', 'feature_5', 'feature_9', 'feature_10',
    'feature_11', 'feature_12', 'feature_13', 'feature_17', 'feature_18',
    'feature_22', 'feature_29', 'feature_36', 'feature_37', 'feature_44',
]

excluded_cols = set(cat_cols)
num_cols = [col for col in train_df.columns if col not in excluded_cols]

len(cat_cols), len(num_cols)

(15, 1474)

In [28]:
# Map each column in num_cols to a normal distribution. A fresh transformer
# is fitted per column on the training values only and then applied to both
# the training and the test values of that column.
for col in tqdm(num_cols):
    # scikit-learn expects 2-D input, hence the single-column reshape.
    train_column = train_df[col].values.reshape(-1, 1)
    test_column = test_df[col].values.reshape(-1, 1)

    transformer = QuantileTransformer(
        n_quantiles=1000,
        random_state=10,
        output_distribution="normal",
    )
    transformer.fit(train_column)

    train_df[col] = transformer.transform(train_column).ravel()
    test_df[col] = transformer.transform(test_column).ravel()

100%|██████████| 1474/1474 [04:52<00:00,  5.04it/s]


In [29]:
# Re-attach the encoded labels to the quantile-transformed training frame.
train_df['target'] = np.ravel(train_y)
train_df.head()

Unnamed: 0_level_0,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,...,feature_31 - feature_38,feature_31 - feature_39,feature_31 - feature_48,feature_35 - feature_38,feature_35 - feature_39,feature_35 - feature_48,feature_38 - feature_39,feature_38 - feature_48,feature_39 - feature_48,target
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,-5.199338,1,-5.199338,1.3402,0,-5.199338,-5.199338,-5.199338,0,...,0.353487,-0.251016,-0.025094,0.487893,-0.143512,0.100535,-0.630254,-0.403356,0.24584,1
1,0,-5.199338,0,-5.199338,1.60221,1,-5.199338,-5.199338,-5.199338,0,...,0.353487,-0.251016,-0.025094,0.487893,-0.143512,0.100535,-0.630254,-0.403356,0.24584,0
2,0,-5.199338,0,-5.199338,-5.199338,0,-5.199338,-5.199338,-5.199338,2,...,-0.651824,-0.251016,-1.207532,-0.595437,-0.143512,-1.159515,0.555462,0.15366,-1.116369,0
3,0,-5.199338,0,-5.199338,-5.199338,0,-5.199338,1.093272,-5.199338,0,...,-0.830234,-0.251016,-0.857097,-0.779924,-0.143512,-0.797044,0.74304,0.622621,-0.736442,3
4,0,-5.199338,0,-5.199338,-5.199338,0,-5.199338,-5.199338,-5.199338,0,...,0.353487,0.661162,-0.025094,-0.114415,-0.143512,-0.797044,0.048948,-0.403356,-0.736442,1


In [30]:
# Re-attach the encoded labels to the untransformed copy of the training frame.
train_df1['target'] = np.ravel(train_y)
train_df1.head()

Unnamed: 0_level_0,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,...,feature_31 - feature_38,feature_31 - feature_39,feature_31 - feature_48,feature_35 - feature_38,feature_35 - feature_39,feature_35 - feature_48,feature_38 - feature_39,feature_38 - feature_48,feature_39 - feature_48,target
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,0,1,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,0,0,0,0,2,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,2,...,-3,0,-2,-3,0,-2,3,1,-2,0
3,0,0,0,0,0,0,0,3,0,0,...,-4,0,-1,-4,0,-1,4,3,-1,3
4,0,0,0,0,0,0,0,0,0,0,...,0,1,0,-1,0,-1,1,0,-1,1


## Save the processed datasets

In [31]:
# Bundle the quantile-transformed train/test frames and pickle them to disk.
data_dict = {}
data_dict['train_df'] = train_df
data_dict['test_df'] = test_df

# The context manager closes the handle even if pickle.dump raises, so a
# failed dump cannot leave the file open.
with open("./TPS_May_Dataset_w_Quantile.txt", 'wb') as file:
    pickle.dump(data_dict, file)

In [32]:
# Bundle the untransformed train/test frames and pickle them to disk.
data_dict = {}
data_dict['train_df'] = train_df1
data_dict['test_df'] = test_df1

# The context manager closes the handle even if pickle.dump raises, so a
# failed dump cannot leave the file open.
with open("./TPS_May_Dataset_wo_Quantile.txt", 'wb') as file:
    pickle.dump(data_dict, file)