## Import libraries

In [1]:
import gc
import pickle
import numpy as np
import pandas as pd
from tqdm import tqdm
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import QuantileTransformer

## Load source datasets

In [2]:
# Read the competition train and test CSV files, in that order.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/tabular-playground-series-may-2021/train.csv",
        "../input/tabular-playground-series-may-2021/test.csv",
    )
)

# Report the (rows, columns) shape of each frame.
shape_report = "train_df: {} \ntest_df: {}".format(train_df.shape, test_df.shape)
print(shape_report)

train_df: (100000, 52) 
test_df: (50000, 51)


In [3]:
# Use the 'id' column as the row index of both frames.
train_df = train_df.set_index('id')
test_df = test_df.set_index('id')
train_df.head()

Unnamed: 0_level_0,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,...,feature_41,feature_42,feature_43,feature_44,feature_45,feature_46,feature_47,feature_48,feature_49,target
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,0,1,0,1,0,0,0,0,0,...,0,0,21,0,0,0,0,0,0,Class_2
1,0,0,0,0,2,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Class_1
2,0,0,0,0,0,0,0,0,0,2,...,0,1,0,0,0,0,13,2,0,Class_1
3,0,0,0,0,0,0,0,3,0,0,...,0,0,0,0,0,0,0,1,0,Class_4
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,Class_2


## Data Cleansing

In [4]:
# Test-set recodes, one (column, old value, new value) triple per rule:
# every cell of `column` equal to `old value` is overwritten with `new value`.
value_recodes = [
    ('feature_3', 25, 26),
    ('feature_4', 36, 37),
    ('feature_21', 31, 36),
    ('feature_25', 24, 23),
    ('feature_34', 26, 25),
    ('feature_49', 21, 20),
]

for column, old_value, new_value in value_recodes:
    test_df.loc[test_df[column] == old_value, column] = new_value

test_df.shape

(50000, 50)

In [5]:
# Values whose rows are removed from the training set, keyed by feature column.
# A row is dropped as soon as any listed column holds one of its listed values.
rows_to_drop = {
    'feature_5': [10],
    'feature_6': [26, 27],
    'feature_7': [30, 31],
    'feature_9': [17],
    'feature_10': [16],
    'feature_11': [12],
    'feature_15': [20],
    'feature_16': [18],
    'feature_23': [18, 19],
    'feature_27': [29],
    'feature_28': [23],
    'feature_29': [13],
    'feature_33': [24],
    'feature_32': [26, 27],
    'feature_35': [43, -2, 38, 39],
    'feature_38': [65, 55, -8, -3, -2, 63],
    'feature_39': [65, 66, -5, -3, -2, 63],
    'feature_42': [37, -2, -1],
    'feature_43': [33, 31],
}

for column, unwanted_values in rows_to_drop.items():
    # Keep only the rows whose value in this column is not on the list.
    train_df = train_df[~train_df[column].isin(unwanted_values)]

train_df.shape

(99918, 51)

## Extract target labels

In [6]:
# Encode the four class labels as consecutive integers starting at 0.
class_map = dict(zip(['Class_1', 'Class_2', 'Class_3', 'Class_4'], range(4)))

train_df['target'] = train_df['target'].map(class_map)

# Number of training rows per encoded class, shown as a small table.
class_sizes = train_df.groupby(['target']).size()
class_sizes.reset_index().rename(columns={0: 'Count'})

Unnamed: 0,target,Count
0,0,8481
1,1,57446
2,2,21408
3,3,12583


In [7]:
# Remove the label column from the feature frame and keep its values as an array.
train_y = train_df.pop('target').values
print("train_y: {}".format(train_y.shape))

train_y: (99918,)


In [8]:
# Columns treated as categorical grouping keys in the feature-engineering cells.
cat_cols = ['feature_2','feature_13','feature_22','feature_36','feature_44']

# Every remaining column, in its original order, is treated as numeric.
is_categorical = train_df.columns.isin(cat_cols)
num_cols = train_df.columns[~is_categorical].tolist()

## Feature Engineering

In [9]:
# Stack the training rows on top of the test rows, keeping each frame's own
# index values. pd.concat replaces DataFrame.append, which was deprecated in
# pandas 1.4 and removed in pandas 2.0.
combined_df = pd.concat([train_df, test_df], sort=False, ignore_index=False)
combined_df.head()

Unnamed: 0_level_0,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,...,feature_40,feature_41,feature_42,feature_43,feature_44,feature_45,feature_46,feature_47,feature_48,feature_49
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,0,1,0,1,0,0,0,0,0,...,3,0,0,21,0,0,0,0,0,0
1,0,0,0,0,2,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,2,...,0,0,1,0,0,0,0,13,2,0
3,0,0,0,0,0,0,0,3,0,0,...,0,0,0,0,0,0,0,0,1,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [10]:
# Aggregation spec for groupby().agg(): every numeric column gets the same five
# summary statistics (each column receives its own list object).
fet_engg_dict = {
    col: ['count', 'sum', 'min', 'max', 'mean']
    for col in num_cols
}

fet_engg_dict

{'feature_0': ['count', 'sum', 'min', 'max', 'mean'],
 'feature_1': ['count', 'sum', 'min', 'max', 'mean'],
 'feature_3': ['count', 'sum', 'min', 'max', 'mean'],
 'feature_4': ['count', 'sum', 'min', 'max', 'mean'],
 'feature_5': ['count', 'sum', 'min', 'max', 'mean'],
 'feature_6': ['count', 'sum', 'min', 'max', 'mean'],
 'feature_7': ['count', 'sum', 'min', 'max', 'mean'],
 'feature_8': ['count', 'sum', 'min', 'max', 'mean'],
 'feature_9': ['count', 'sum', 'min', 'max', 'mean'],
 'feature_10': ['count', 'sum', 'min', 'max', 'mean'],
 'feature_11': ['count', 'sum', 'min', 'max', 'mean'],
 'feature_12': ['count', 'sum', 'min', 'max', 'mean'],
 'feature_14': ['count', 'sum', 'min', 'max', 'mean'],
 'feature_15': ['count', 'sum', 'min', 'max', 'mean'],
 'feature_16': ['count', 'sum', 'min', 'max', 'mean'],
 'feature_17': ['count', 'sum', 'min', 'max', 'mean'],
 'feature_18': ['count', 'sum', 'min', 'max', 'mean'],
 'feature_19': ['count', 'sum', 'min', 'max', 'mean'],
 'feature_20': ['co

In [11]:
# Summary statistics of every numeric column, computed per level of feature_2.
f2_stats = combined_df.groupby(['feature_2']).agg(fet_engg_dict)

# Flatten the (column, statistic) pairs into names such as "f2_feature_48_count".
f2_stats.columns = f2_stats.columns.map(lambda pair: 'f2_' + '_'.join(pair))

# Attach each row's group statistics by matching on its feature_2 value.
combined_df = combined_df.merge(f2_stats, on=['feature_2'], how='left')

# Release the intermediate table before the next aggregation.
del f2_stats
gc.collect()

combined_df.head()

Unnamed: 0,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,...,f2_feature_48_count,f2_feature_48_sum,f2_feature_48_min,f2_feature_48_max,f2_feature_48_mean,f2_feature_49_count,f2_feature_49_sum,f2_feature_49_min,f2_feature_49_max,f2_feature_49_mean
0,0,0,1,0,1,0,0,0,0,0,...,5855,6147,0,39,1.049872,5855,3537,0,18,0.604099
1,0,0,0,0,2,1,0,0,0,0,...,140094,135874,0,46,0.969877,140094,77772,0,20,0.555142
2,0,0,0,0,0,0,0,0,0,2,...,140094,135874,0,46,0.969877,140094,77772,0,20,0.555142
3,0,0,0,0,0,0,0,3,0,0,...,140094,135874,0,46,0.969877,140094,77772,0,20,0.555142
4,0,0,0,0,0,0,0,0,0,0,...,140094,135874,0,46,0.969877,140094,77772,0,20,0.555142


In [12]:
# Summary statistics of every numeric column, computed per level of feature_13.
f13_stats = combined_df.groupby(['feature_13']).agg(fet_engg_dict)

# Flatten the (column, statistic) pairs into names such as "f13_feature_48_count".
f13_stats.columns = f13_stats.columns.map(lambda pair: 'f13_' + '_'.join(pair))

# Attach each row's group statistics by matching on its feature_13 value.
combined_df = combined_df.merge(f13_stats, on=['feature_13'], how='left')

# Release the intermediate table before the next aggregation.
del f13_stats
gc.collect()

combined_df.head()

Unnamed: 0,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,...,f13_feature_48_count,f13_feature_48_sum,f13_feature_48_min,f13_feature_48_max,f13_feature_48_mean,f13_feature_49_count,f13_feature_49_sum,f13_feature_49_min,f13_feature_49_max,f13_feature_49_mean
0,0,0,1,0,1,0,0,0,0,0,...,142253,138229,0,46,0.971712,142253,79366,0,20,0.557921
1,0,0,0,0,2,1,0,0,0,0,...,142253,138229,0,46,0.971712,142253,79366,0,20,0.557921
2,0,0,0,0,0,0,0,0,0,2,...,142253,138229,0,46,0.971712,142253,79366,0,20,0.557921
3,0,0,0,0,0,0,0,3,0,0,...,4643,4836,0,38,1.041568,4643,2755,0,16,0.593366
4,0,0,0,0,0,0,0,0,0,0,...,142253,138229,0,46,0.971712,142253,79366,0,20,0.557921


In [13]:
# Summary statistics of every numeric column, computed per level of feature_22.
f22_stats = combined_df.groupby(['feature_22']).agg(fet_engg_dict)

# Flatten the (column, statistic) pairs into names such as "f22_feature_48_count".
f22_stats.columns = f22_stats.columns.map(lambda pair: 'f22_' + '_'.join(pair))

# Attach each row's group statistics by matching on its feature_22 value.
combined_df = combined_df.merge(f22_stats, on=['feature_22'], how='left')

# Release the intermediate table before the next aggregation.
del f22_stats
gc.collect()

combined_df.head()

Unnamed: 0,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,...,f22_feature_48_count,f22_feature_48_sum,f22_feature_48_min,f22_feature_48_max,f22_feature_48_mean,f22_feature_49_count,f22_feature_49_sum,f22_feature_49_min,f22_feature_49_max,f22_feature_49_mean
0,0,0,1,0,1,0,0,0,0,0,...,130577,126534,0,46,0.969037,130577,72512,0,20,0.55532
1,0,0,0,0,2,1,0,0,0,0,...,130577,126534,0,46,0.969037,130577,72512,0,20,0.55532
2,0,0,0,0,0,0,0,0,0,2,...,3625,3603,0,39,0.993931,3625,2258,0,18,0.622897
3,0,0,0,0,0,0,0,3,0,0,...,130577,126534,0,46,0.969037,130577,72512,0,20,0.55532
4,0,0,0,0,0,0,0,0,0,0,...,130577,126534,0,46,0.969037,130577,72512,0,20,0.55532


In [14]:
# Summary statistics of every numeric column, computed per level of feature_36.
f36_stats = combined_df.groupby(['feature_36']).agg(fet_engg_dict)

# Flatten the (column, statistic) pairs into names such as "f36_feature_48_count".
f36_stats.columns = f36_stats.columns.map(lambda pair: 'f36_' + '_'.join(pair))

# Attach each row's group statistics by matching on its feature_36 value.
combined_df = combined_df.merge(f36_stats, on=['feature_36'], how='left')

# Release the intermediate table before the next aggregation.
del f36_stats
gc.collect()

combined_df.head()

Unnamed: 0,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,...,f36_feature_48_count,f36_feature_48_sum,f36_feature_48_min,f36_feature_48_max,f36_feature_48_mean,f36_feature_49_count,f36_feature_49_sum,f36_feature_49_min,f36_feature_49_max,f36_feature_49_mean
0,0,0,1,0,1,0,0,0,0,0,...,136541,133044,0,46,0.974389,136541,76210,0,20,0.558147
1,0,0,0,0,2,1,0,0,0,0,...,136541,133044,0,46,0.974389,136541,76210,0,20,0.558147
2,0,0,0,0,0,0,0,0,0,2,...,9905,9756,0,39,0.984957,9905,5563,0,18,0.561636
3,0,0,0,0,0,0,0,3,0,0,...,136541,133044,0,46,0.974389,136541,76210,0,20,0.558147
4,0,0,0,0,0,0,0,0,0,0,...,136541,133044,0,46,0.974389,136541,76210,0,20,0.558147


In [15]:
# Summary statistics of every numeric column, computed per level of feature_44.
f44_stats = combined_df.groupby(['feature_44']).agg(fet_engg_dict)

# Flatten the (column, statistic) pairs into names such as "f44_feature_48_count".
f44_stats.columns = f44_stats.columns.map(lambda pair: 'f44_' + '_'.join(pair))

# Attach each row's group statistics by matching on its feature_44 value.
combined_df = combined_df.merge(f44_stats, on=['feature_44'], how='left')

# Release the intermediate table once it has been merged.
del f44_stats
gc.collect()

combined_df.head()

Unnamed: 0,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,...,f44_feature_48_count,f44_feature_48_sum,f44_feature_48_min,f44_feature_48_max,f44_feature_48_mean,f44_feature_49_count,f44_feature_49_sum,f44_feature_49_min,f44_feature_49_max,f44_feature_49_mean
0,0,0,1,0,1,0,0,0,0,0,...,139790,136112,0,46,0.973689,139790,77816,0,20,0.556664
1,0,0,0,0,2,1,0,0,0,0,...,139790,136112,0,46,0.973689,139790,77816,0,20,0.556664
2,0,0,0,0,0,0,0,0,0,2,...,139790,136112,0,46,0.973689,139790,77816,0,20,0.556664
3,0,0,0,0,0,0,0,3,0,0,...,139790,136112,0,46,0.973689,139790,77816,0,20,0.556664
4,0,0,0,0,0,0,0,0,0,0,...,139790,136112,0,46,0.973689,139790,77816,0,20,0.556664


In [16]:
# Sanity check: show the first rows that contain NaN or +/-inf in any column.
# `axis` is passed by keyword because the positional form `.any(1)` is
# deprecated and no longer accepted by pandas 2.x.
combined_df[combined_df.isin([np.nan, np.inf, -np.inf]).any(axis=1)].head()

Unnamed: 0,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,...,f44_feature_48_count,f44_feature_48_sum,f44_feature_48_min,f44_feature_48_max,f44_feature_48_mean,f44_feature_49_count,f44_feature_49_sum,f44_feature_49_min,f44_feature_49_max,f44_feature_49_mean


In [17]:
# Count the distinct (non-null) values of every column in one pass.
distinct_counts = combined_df.nunique()

# Columns with at most two distinct values are removed, in column order.
col_filter_list = distinct_counts[distinct_counts <= 2].index.tolist()

combined_df.drop(columns=col_filter_list, inplace=True)
combined_df.head()

Unnamed: 0,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,...,f44_feature_47_max,f44_feature_47_mean,f44_feature_48_count,f44_feature_48_sum,f44_feature_48_max,f44_feature_48_mean,f44_feature_49_count,f44_feature_49_sum,f44_feature_49_max,f44_feature_49_mean
0,0,0,1,0,1,0,0,0,0,0,...,26,0.385979,139790,136112,46,0.973689,139790,77816,20,0.556664
1,0,0,0,0,2,1,0,0,0,0,...,26,0.385979,139790,136112,46,0.973689,139790,77816,20,0.556664
2,0,0,0,0,0,0,0,0,0,2,...,26,0.385979,139790,136112,46,0.973689,139790,77816,20,0.556664
3,0,0,0,0,0,0,0,3,0,0,...,26,0.385979,139790,136112,46,0.973689,139790,77816,20,0.556664
4,0,0,0,0,0,0,0,0,0,0,...,26,0.385979,139790,136112,46,0.973689,139790,77816,20,0.556664


In [18]:
# The first len(train_y) rows of the combined frame are the training rows and
# the rest are the test rows; split by position and take independent copies.
n_train = train_y.shape[0]
train_df = combined_df.iloc[:n_train].copy()
test_df = combined_df.iloc[n_train:].copy()
train_df.shape, test_df.shape

((99918, 951), (50000, 951))

## Quantile Transformation

In [19]:
# The first 50 columns are left untransformed; every other column is scaled.
# NOTE(review): this presumes the first 50 columns are still the original
# feature_* columns after the low-cardinality drop - confirm.
cat_cols = train_df.columns[:50]
num_cols = train_df.columns[~train_df.columns.isin(cat_cols)].tolist()
len(cat_cols), len(num_cols)

(50, 901)

In [20]:
for col in tqdm(num_cols):
    # A fresh transformer per column, fitted on the training values only and
    # then applied to both the training and the test values.
    transformer = QuantileTransformer(output_distribution="normal",
                                      n_quantiles=1000,
                                      random_state=10)

    # The transformer works on 2-D input, so each column becomes an
    # (n_rows, 1) array.
    train_column = train_df[col].values.reshape(-1, 1)
    test_column = test_df[col].values.reshape(-1, 1)
    transformer.fit(train_column)

    # Flatten the transformed arrays back to 1-D before storing them.
    train_df[col] = transformer.transform(train_column).ravel()
    test_df[col] = transformer.transform(test_column).ravel()

100%|██████████| 901/901 [02:33<00:00,  5.86it/s]


In [21]:
# Put the encoded labels back on the processed training frame as 'target'.
target_labels = train_y.ravel()
train_df['target'] = target_labels
train_df.head()

Unnamed: 0,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,...,f44_feature_47_mean,f44_feature_48_count,f44_feature_48_sum,f44_feature_48_max,f44_feature_48_mean,f44_feature_49_count,f44_feature_49_sum,f44_feature_49_max,f44_feature_49_mean,target
0,0,0,1,0,1,0,0,0,0,0,...,-2.241016,5.199338,5.199338,5.199338,-2.273052,5.199338,5.199338,5.199338,-0.076604,1
1,0,0,0,0,2,1,0,0,0,0,...,-2.241016,5.199338,5.199338,5.199338,-2.273052,5.199338,5.199338,5.199338,-0.076604,0
2,0,0,0,0,0,0,0,0,0,2,...,-2.241016,5.199338,5.199338,5.199338,-2.273052,5.199338,5.199338,5.199338,-0.076604,0
3,0,0,0,0,0,0,0,3,0,0,...,-2.241016,5.199338,5.199338,5.199338,-2.273052,5.199338,5.199338,5.199338,-0.076604,3
4,0,0,0,0,0,0,0,0,0,0,...,-2.241016,5.199338,5.199338,5.199338,-2.273052,5.199338,5.199338,5.199338,-0.076604,1


## Save the processed datasets

In [22]:
# Bundle both processed frames into one dictionary keyed by frame name.
data_dict = {
    'train_df': train_df,
    'test_df': test_df,
}

# NOTE: despite the ".txt" extension the file is a binary pickle. The context
# manager closes the handle even if pickle.dump raises part-way through.
with open("./TPS_May_Dataset.txt", 'wb') as file:
    pickle.dump(data_dict, file)