## Import libraries

In [1]:
import gc
import pickle
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.preprocessing import QuantileTransformer

## Load source datasets

In [2]:
# Read the competition train and test CSVs, then report the shape of each frame.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/tabular-playground-series-may-2021/train.csv",
        "../input/tabular-playground-series-may-2021/test.csv",
    )
)
shape_report = "train_df: {} \ntest_df: {}"
print(shape_report.format(train_df.shape, test_df.shape))

train_df: (100000, 52) 
test_df: (50000, 51)


## Data Cleansing

In [3]:
# Recode specific test-set values in place.
# Key: (column, value to look for); value: what it is replaced with.
# NOTE(review): presumably these are levels that never occur in train and are
# snapped to a neighbouring level - confirm against the train value counts.
test_value_fixes = {
    ('feature_3', 25): 26,
    ('feature_4', 36): 37,
    ('feature_21', 31): 36,
    ('feature_25', 24): 23,
    ('feature_34', 26): 25,
    ('feature_49', 21): 20,
}

for (fix_col, old_value), new_value in test_value_fixes.items():
    test_df.loc[test_df[fix_col] == old_value, fix_col] = new_value

# Only values are rewritten, so the shape is unchanged.
test_df.shape

(50000, 51)

In [4]:
# Drop every training row that carries one of the listed values in the given
# feature. The table replaces a long run of copy-pasted
# `train_df = train_df[train_df[col] != value]` lines: the retained rows are the
# same, but the frame is filtered (and copied) once instead of once per value.
# NOTE(review): presumably these are rare / out-of-range levels - confirm.
train_values_to_drop = {
    'feature_5': [10],
    'feature_6': [26, 27],
    'feature_7': [30, 31],
    'feature_9': [17],
    'feature_10': [16],
    'feature_11': [12],
    'feature_15': [20],
    'feature_16': [18],
    'feature_23': [18, 19],
    'feature_27': [29],
    'feature_28': [23],
    'feature_29': [13],
    'feature_33': [24],
    'feature_32': [26, 27],
    'feature_35': [43, -2, 38, 39],
    'feature_38': [65, 55, -8, -3, -2, 63],
    'feature_39': [65, 66, -5, -3, -2, 63],
    'feature_42': [37, -2, -1],
    'feature_43': [33, 31],
}

# A row is dropped as soon as any one of its features hits a listed value.
drop_mask = pd.Series(False, index=train_df.index)
for drop_col, drop_values in train_values_to_drop.items():
    drop_mask |= train_df[drop_col].isin(drop_values)

train_df = train_df[~drop_mask]

train_df.shape

(99918, 52)

## Extract target labels

In [5]:
# Translate the string class labels into integer ids 0..3.
class_map = dict(zip(['Class_1', 'Class_2', 'Class_3', 'Class_4'], range(4)))

encoded_target = train_df['target'].map(class_map)
train_df['target'] = encoded_target

# Number of rows per encoded class.
train_df.groupby(['target']).size().reset_index(name='Count')

Unnamed: 0,target,Count
0,0,8481
1,1,57446
2,2,21408
3,3,12583


In [6]:
# Detach the label column from the feature frame and keep it as a NumPy array.
train_y = train_df.pop('target').values
print("train_y: {}".format(train_y.shape))

train_y: (99918,)


## Feature Engineering

In [7]:
# 'id' plus the features that are used as grouping keys in the aggregation
# cells; every other column is aggregated as a numeric feature.
cat_cols = ['id','feature_2','feature_13','feature_22','feature_36','feature_44']
num_cols = [col for col in train_df.columns if col not in cat_cols]

# Stack train on top of test so feature engineering sees both. The train rows
# come first, which the later positional split relies on.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat
# with the same arguments is the direct equivalent. ignore_index=False keeps
# each frame's own row labels, so labels repeat across the two parts.
combined_df = pd.concat([train_df, test_df], sort=False, ignore_index=False)
combined_df.head()

Unnamed: 0,id,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,...,feature_40,feature_41,feature_42,feature_43,feature_44,feature_45,feature_46,feature_47,feature_48,feature_49
0,0,0,0,1,0,1,0,0,0,0,...,3,0,0,21,0,0,0,0,0,0
1,1,0,0,0,0,2,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,13,2,0
3,3,0,0,0,0,0,0,0,3,0,...,0,0,0,0,0,0,0,0,1,0
4,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [8]:
# Aggregation spec for groupby().agg(): every numeric column gets the same
# five summary statistics (each column owns its own list object).
group_stats = ('count', 'sum', 'mean', 'min', 'max')
fet_engg_dict = {col: list(group_stats) for col in num_cols}

fet_engg_dict

{'feature_0': ['count', 'sum', 'mean', 'min', 'max'],
 'feature_1': ['count', 'sum', 'mean', 'min', 'max'],
 'feature_3': ['count', 'sum', 'mean', 'min', 'max'],
 'feature_4': ['count', 'sum', 'mean', 'min', 'max'],
 'feature_5': ['count', 'sum', 'mean', 'min', 'max'],
 'feature_6': ['count', 'sum', 'mean', 'min', 'max'],
 'feature_7': ['count', 'sum', 'mean', 'min', 'max'],
 'feature_8': ['count', 'sum', 'mean', 'min', 'max'],
 'feature_9': ['count', 'sum', 'mean', 'min', 'max'],
 'feature_10': ['count', 'sum', 'mean', 'min', 'max'],
 'feature_11': ['count', 'sum', 'mean', 'min', 'max'],
 'feature_12': ['count', 'sum', 'mean', 'min', 'max'],
 'feature_14': ['count', 'sum', 'mean', 'min', 'max'],
 'feature_15': ['count', 'sum', 'mean', 'min', 'max'],
 'feature_16': ['count', 'sum', 'mean', 'min', 'max'],
 'feature_17': ['count', 'sum', 'mean', 'min', 'max'],
 'feature_18': ['count', 'sum', 'mean', 'min', 'max'],
 'feature_19': ['count', 'sum', 'mean', 'min', 'max'],
 'feature_20': ['co

In [9]:
# Summary statistics of every numeric feature per feature_2 level, attached to
# each row through a left merge on feature_2.
temp = combined_df.groupby(['feature_2']).agg(fet_engg_dict)
# Flatten the (feature, statistic) column pairs into 'f2_<feature>_<statistic>'.
temp.columns = ['f2_' + feature + '_' + stat for feature, stat in temp.columns]
combined_df = combined_df.merge(temp, on=['feature_2'], how='left')

# Release the intermediate aggregate table.
del temp
gc.collect()

combined_df.head()

Unnamed: 0,id,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,...,f2_feature_48_count,f2_feature_48_sum,f2_feature_48_mean,f2_feature_48_min,f2_feature_48_max,f2_feature_49_count,f2_feature_49_sum,f2_feature_49_mean,f2_feature_49_min,f2_feature_49_max
0,0,0,0,1,0,1,0,0,0,0,...,5855,6147,1.049872,0,39,5855,3537,0.604099,0,18
1,1,0,0,0,0,2,1,0,0,0,...,140094,135874,0.969877,0,46,140094,77772,0.555142,0,20
2,2,0,0,0,0,0,0,0,0,0,...,140094,135874,0.969877,0,46,140094,77772,0.555142,0,20
3,3,0,0,0,0,0,0,0,3,0,...,140094,135874,0.969877,0,46,140094,77772,0.555142,0,20
4,4,0,0,0,0,0,0,0,0,0,...,140094,135874,0.969877,0,46,140094,77772,0.555142,0,20


In [10]:
# Summary statistics of every numeric feature per feature_13 level, attached to
# each row through a left merge on feature_13.
temp = combined_df.groupby(['feature_13']).agg(fet_engg_dict)
# Flatten the (feature, statistic) column pairs into 'f13_<feature>_<statistic>'.
temp.columns = ['f13_' + feature + '_' + stat for feature, stat in temp.columns]
combined_df = combined_df.merge(temp, on=['feature_13'], how='left')

# Release the intermediate aggregate table.
del temp
gc.collect()

combined_df.head()

Unnamed: 0,id,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,...,f13_feature_48_count,f13_feature_48_sum,f13_feature_48_mean,f13_feature_48_min,f13_feature_48_max,f13_feature_49_count,f13_feature_49_sum,f13_feature_49_mean,f13_feature_49_min,f13_feature_49_max
0,0,0,0,1,0,1,0,0,0,0,...,142253,138229,0.971712,0,46,142253,79366,0.557921,0,20
1,1,0,0,0,0,2,1,0,0,0,...,142253,138229,0.971712,0,46,142253,79366,0.557921,0,20
2,2,0,0,0,0,0,0,0,0,0,...,142253,138229,0.971712,0,46,142253,79366,0.557921,0,20
3,3,0,0,0,0,0,0,0,3,0,...,4643,4836,1.041568,0,38,4643,2755,0.593366,0,16
4,4,0,0,0,0,0,0,0,0,0,...,142253,138229,0.971712,0,46,142253,79366,0.557921,0,20


In [11]:
# Summary statistics of every numeric feature per feature_22 level, attached to
# each row through a left merge on feature_22.
temp = combined_df.groupby(['feature_22']).agg(fet_engg_dict)
# Flatten the (feature, statistic) column pairs into 'f22_<feature>_<statistic>'.
temp.columns = ['f22_' + feature + '_' + stat for feature, stat in temp.columns]
combined_df = combined_df.merge(temp, on=['feature_22'], how='left')

# Release the intermediate aggregate table.
del temp
gc.collect()

combined_df.head()

Unnamed: 0,id,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,...,f22_feature_48_count,f22_feature_48_sum,f22_feature_48_mean,f22_feature_48_min,f22_feature_48_max,f22_feature_49_count,f22_feature_49_sum,f22_feature_49_mean,f22_feature_49_min,f22_feature_49_max
0,0,0,0,1,0,1,0,0,0,0,...,130577,126534,0.969037,0,46,130577,72512,0.55532,0,20
1,1,0,0,0,0,2,1,0,0,0,...,130577,126534,0.969037,0,46,130577,72512,0.55532,0,20
2,2,0,0,0,0,0,0,0,0,0,...,3625,3603,0.993931,0,39,3625,2258,0.622897,0,18
3,3,0,0,0,0,0,0,0,3,0,...,130577,126534,0.969037,0,46,130577,72512,0.55532,0,20
4,4,0,0,0,0,0,0,0,0,0,...,130577,126534,0.969037,0,46,130577,72512,0.55532,0,20


In [12]:
# Summary statistics of every numeric feature per feature_36 level, attached to
# each row through a left merge on feature_36.
temp = combined_df.groupby(['feature_36']).agg(fet_engg_dict)
# Flatten the (feature, statistic) column pairs into 'f36_<feature>_<statistic>'.
temp.columns = ['f36_' + feature + '_' + stat for feature, stat in temp.columns]
combined_df = combined_df.merge(temp, on=['feature_36'], how='left')

# Release the intermediate aggregate table.
del temp
gc.collect()

combined_df.head()

Unnamed: 0,id,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,...,f36_feature_48_count,f36_feature_48_sum,f36_feature_48_mean,f36_feature_48_min,f36_feature_48_max,f36_feature_49_count,f36_feature_49_sum,f36_feature_49_mean,f36_feature_49_min,f36_feature_49_max
0,0,0,0,1,0,1,0,0,0,0,...,136541,133044,0.974389,0,46,136541,76210,0.558147,0,20
1,1,0,0,0,0,2,1,0,0,0,...,136541,133044,0.974389,0,46,136541,76210,0.558147,0,20
2,2,0,0,0,0,0,0,0,0,0,...,9905,9756,0.984957,0,39,9905,5563,0.561636,0,18
3,3,0,0,0,0,0,0,0,3,0,...,136541,133044,0.974389,0,46,136541,76210,0.558147,0,20
4,4,0,0,0,0,0,0,0,0,0,...,136541,133044,0.974389,0,46,136541,76210,0.558147,0,20


In [13]:
# Summary statistics of every numeric feature per feature_44 level, attached to
# each row through a left merge on feature_44.
temp = combined_df.groupby(['feature_44']).agg(fet_engg_dict)
# Flatten the (feature, statistic) column pairs into 'f44_<feature>_<statistic>'.
temp.columns = ['f44_' + feature + '_' + stat for feature, stat in temp.columns]
combined_df = combined_df.merge(temp, on=['feature_44'], how='left')

# Release the intermediate aggregate table.
del temp
gc.collect()

combined_df.head()

Unnamed: 0,id,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,...,f44_feature_48_count,f44_feature_48_sum,f44_feature_48_mean,f44_feature_48_min,f44_feature_48_max,f44_feature_49_count,f44_feature_49_sum,f44_feature_49_mean,f44_feature_49_min,f44_feature_49_max
0,0,0,0,1,0,1,0,0,0,0,...,139790,136112,0.973689,0,46,139790,77816,0.556664,0,20
1,1,0,0,0,0,2,1,0,0,0,...,139790,136112,0.973689,0,46,139790,77816,0.556664,0,20
2,2,0,0,0,0,0,0,0,0,0,...,139790,136112,0.973689,0,46,139790,77816,0.556664,0,20
3,3,0,0,0,0,0,0,0,3,0,...,139790,136112,0.973689,0,46,139790,77816,0.556664,0,20
4,4,0,0,0,0,0,0,0,0,0,...,139790,136112,0.973689,0,46,139790,77816,0.556664,0,20


In [14]:
# Add square, cube and cube-root versions of every original column.
# The arithmetic is applied to the whole column at once instead of through
# `.apply(lambda x: ...)`, which calls Python once per element; the resulting
# values are the same.
# NOTE(review): train_df.columns still contains 'id' at this point, so
# id_sq / id_cb / id_cbrt are generated as well. Kept as-is so the output
# column set does not change - confirm whether that is intended.
for col in tqdm(train_df.columns):
    base = combined_df[col]
    combined_df[col+'_sq'] = base ** 2
    combined_df[col+'_cb'] = base ** 3
    # np.cbrt is defined for negative inputs, so no column needs to be skipped.
    combined_df[col+'_cbrt'] = np.cbrt(base)

combined_df.head()

100%|██████████| 51/51 [00:33<00:00,  1.51it/s]


Unnamed: 0,id,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,...,feature_46_cbrt,feature_47_sq,feature_47_cb,feature_47_cbrt,feature_48_sq,feature_48_cb,feature_48_cbrt,feature_49_sq,feature_49_cb,feature_49_cbrt
0,0,0,0,1,0,1,0,0,0,0,...,0.0,0,0,0.0,0,0,0.0,0,0,0.0
1,1,0,0,0,0,2,1,0,0,0,...,0.0,0,0,0.0,0,0,0.0,0,0,0.0
2,2,0,0,0,0,0,0,0,0,0,...,0.0,169,2197,2.351335,4,8,1.259921,0,0,0.0
3,3,0,0,0,0,0,0,0,3,0,...,0.0,0,0,0.0,1,1,1.0,0,0,0.0
4,4,0,0,0,0,0,0,0,0,0,...,0.0,0,0,0.0,1,1,1.0,0,0,0.0


In [15]:
# Columns left out of the log / square-root transforms.
# NOTE(review): presumably the columns that contain negative values, for which
# sqrt (and log1p below -1) is undefined - confirm against the data.
col_fltr_list = ['feature_14','feature_19','feature_30','feature_31','feature_32',
                 'feature_35','feature_38','feature_39','feature_42']
col_list = [col for col in train_df.columns if col not in col_fltr_list]

# Add log(1 + x) and sqrt(x) versions of the remaining original columns.
# The NumPy ufuncs run on the whole column at once instead of through
# `.apply(lambda x: ...)`; the resulting values are the same.
for col in tqdm(col_list):
    base = combined_df[col]
    combined_df[col+'_log'] = np.log1p(base)
    combined_df[col+'_sqrt'] = np.sqrt(base)

combined_df.head()

100%|██████████| 42/42 [00:32<00:00,  1.28it/s]


Unnamed: 0,id,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,...,feature_45_log,feature_45_sqrt,feature_46_log,feature_46_sqrt,feature_47_log,feature_47_sqrt,feature_48_log,feature_48_sqrt,feature_49_log,feature_49_sqrt
0,0,0,0,1,0,1,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,0,0,0,0,2,1,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,2.639057,3.605551,1.098612,1.414214,0.0,0.0
3,3,0,0,0,0,0,0,0,3,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.693147,1.0,0.0,0.0
4,4,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.693147,1.0,0.0,0.0


In [16]:
# Move 'id' out of the feature columns and into the row index.
# This mutates combined_df in place, so the cell cannot be run twice: on a
# second run the 'id' column no longer exists.
combined_df.set_index('id', inplace=True)

# Sanity check: show rows that contain NaN or +/-inf in any column (an empty
# frame means the engineered features are all finite).
# The axis is passed by keyword: positional `any(1)` is deprecated in
# pandas 1.5 and rejected by pandas 2.0.
combined_df[combined_df.isin([np.nan, np.inf, -np.inf]).any(axis=1)].head()

Unnamed: 0_level_0,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,...,feature_45_log,feature_45_sqrt,feature_46_log,feature_46_sqrt,feature_47_log,feature_47_sqrt,feature_48_log,feature_48_sqrt,feature_49_log,feature_49_sqrt
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1


In [17]:
# Collect every column that has at most two distinct values and drop them all
# from combined_df in place.
col_filter_list = [
    name
    for name in combined_df.columns
    if combined_df[name].nunique() <= 2
]

combined_df.drop(columns=col_filter_list, inplace=True)
combined_df.head()

Unnamed: 0_level_0,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,...,feature_45_log,feature_45_sqrt,feature_46_log,feature_46_sqrt,feature_47_log,feature_47_sqrt,feature_48_log,feature_48_sqrt,feature_49_log,feature_49_sqrt
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,0,1,0,1,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,0,0,0,2,1,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0,0,0,0,0,0,0,0,0,2,...,0.0,0.0,0.0,0.0,2.639057,3.605551,1.098612,1.414214,0.0,0.0
3,0,0,0,0,0,0,0,3,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.693147,1.0,0.0,0.0
4,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.693147,1.0,0.0,0.0


In [18]:
# Split the stacked frame back into its two parts by position: the first
# len(train_y) rows are the training rows, the remainder is the test set.
n_train = train_y.shape[0]
train_df = combined_df.iloc[:n_train].copy()
test_df = combined_df.iloc[n_train:].copy()

# The combined frame is no longer needed once both copies exist.
del combined_df
gc.collect()

train_df.shape, test_df.shape

((99918, 1188), (50000, 1188))

## Quantile Transformation

In [19]:
# Columns excluded from the quantile transformation; every other column of
# train_df is treated as numeric and transformed.
cat_cols = [
    'feature_0', 'feature_2', 'feature_5', 'feature_9', 'feature_10',
    'feature_11', 'feature_12', 'feature_13', 'feature_17', 'feature_18',
    'feature_22', 'feature_29', 'feature_36', 'feature_37', 'feature_44',
]

excluded_names = set(cat_cols)
num_cols = [name for name in train_df.columns if name not in excluded_names]

len(cat_cols), len(num_cols)

(15, 1173)

In [20]:
# Map each numeric column onto a normal distribution. A fresh transformer is
# fitted per column on the training values only, and the fitted mapping is then
# applied to both the training and the test column.
for col in tqdm(num_cols):
    scaler = QuantileTransformer(output_distribution="normal",
                                 n_quantiles=1000,
                                 random_state=10)

    # scikit-learn expects 2-D input: one column, one row per sample.
    train_column = train_df[col].values.reshape(-1, 1)
    test_column = test_df[col].values.reshape(-1, 1)

    scaler.fit(train_column)

    # Flatten the (n, 1) results back to 1-D before writing them into the frames.
    train_df[col] = scaler.transform(train_column).ravel()
    test_df[col] = scaler.transform(test_column).ravel()

100%|██████████| 1173/1173 [02:39<00:00,  7.38it/s]


In [21]:
# Put the encoded labels back onto the transformed training frame as a flat
# 1-D 'target' column.
train_df['target'] = train_y.reshape(-1)
train_df.head()

Unnamed: 0_level_0,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,...,feature_45_sqrt,feature_46_log,feature_46_sqrt,feature_47_log,feature_47_sqrt,feature_48_log,feature_48_sqrt,feature_49_log,feature_49_sqrt,target
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,-5.199338,1,-5.199338,1.3402,0,-5.199338,-5.199338,-5.199338,0,...,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,1
1,0,-5.199338,0,-5.199338,1.60221,1,-5.199338,-5.199338,-5.199338,0,...,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,0
2,0,-5.199338,0,-5.199338,-5.199338,0,-5.199338,-5.199338,-5.199338,2,...,-5.199338,-5.199338,-5.199338,2.747453,2.747453,1.068522,0.949547,-5.199338,-5.199338,0
3,0,-5.199338,0,-5.199338,-5.199338,0,-5.199338,1.093272,-5.199338,0,...,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,0.480839,0.672129,-5.199338,-5.199338,3
4,0,-5.199338,0,-5.199338,-5.199338,0,-5.199338,-5.199338,-5.199338,0,...,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,0.480839,0.672129,-5.199338,-5.199338,1


## Save the processed datasets

In [22]:
# Bundle both processed frames into one dict and pickle it to disk.
data_dict = {
    'train_df': train_df,
    'test_df': test_df,
}

# The context manager closes the handle even if pickle.dump raises.
# Despite the .txt extension, the file holds a binary pickle and has to be
# read back with pickle.load on a file opened in 'rb' mode.
with open("./TPS_May_Dataset.txt", 'wb') as file:
    pickle.dump(data_dict, file)