## Import libraries

In [1]:
import gc
import pickle
import numpy as np
import pandas as pd
from tensorflow.keras.models import Model, load_model
from tensorflow_addons.optimizers import AdamW, Lookahead

## Load autoencoder model

In [2]:
# Restore the saved autoencoder from its HDF5 file.
autoencoder = load_model('../input/tps-may-dae-model/DAE_model.h5')

# Build a sub-model that shares the autoencoder's input but stops at the layer
# named 'Embedding', so `predict` returns that layer's activations.
embedding_layer = autoencoder.get_layer('Embedding')
feature_model = Model(inputs=autoencoder.input, outputs=embedding_layer.output)
feature_model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 322)]             0         
_________________________________________________________________
dense (Dense)                (None, 512)               165376    
_________________________________________________________________
batch_normalization (BatchNo (None, 512)               2048      
_________________________________________________________________
activation (Activation)      (None, 512)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 1024)              525312    
_________________________________________________________________
batch_normalization_1 (Batch (None, 1024)              4096      
_________________________________________________________________
Embedding (Activation)       (None, 1024)              0     

## Load source datasets

In [3]:
# Read the raw competition CSVs and use the `id` column as the row index of
# each frame.
train_df = pd.read_csv("../input/tabular-playground-series-may-2021/train.csv").set_index('id')
test_df = pd.read_csv("../input/tabular-playground-series-may-2021/test.csv").set_index('id')

# Report (rows, columns) of both frames as a quick sanity check.
print("train_df: {} \ntest_df: {}".format(*(frame.shape for frame in (train_df, test_df))))

train_df: (100000, 51) 
test_df: (50000, 50)


In [4]:
# Test-set values to overwrite, as {column: (old_value, new_value)}.
# NOTE(review): the rationale for these specific values is not recorded in the
# notebook (presumably rare values nudged to a neighbouring one) - confirm.
TEST_VALUE_REMAP = {
    'feature_3': (25, 26),
    'feature_4': (36, 37),
    'feature_21': (31, 36),
    'feature_25': (24, 23),
    'feature_34': (26, 25),
    'feature_49': (21, 20),
}

# Training rows holding ANY of these values are dropped, as {column: [values]}.
# NOTE(review): presumably outlier values; the selection criteria are not
# shown here - confirm before changing the lists.
TRAIN_DROP_VALUES = {
    'feature_5': [10],
    'feature_6': [26, 27],
    'feature_7': [30, 31],
    'feature_9': [17],
    'feature_10': [16],
    'feature_11': [12],
    'feature_15': [20],
    'feature_16': [18],
    'feature_23': [18, 19],
    'feature_27': [29],
    'feature_28': [23],
    'feature_29': [13],
    'feature_33': [24],
    'feature_32': [26, 27],
    'feature_35': [43, -2, 38, 39],
    'feature_38': [65, 55, -8, -3, -2, 63],
    'feature_39': [65, 66, -5, -3, -2, 63],
    'feature_42': [37, -2, -1],
    'feature_43': [33, 31],
}

# Overwrite the listed test values in place; test rows are never dropped.
for column, (old_value, new_value) in TEST_VALUE_REMAP.items():
    test_df.loc[test_df[column] == old_value, column] = new_value

# Accumulate one boolean mask over all columns instead of re-filtering (and
# re-copying) the whole frame once per value. A positional NumPy mask is used
# so no index alignment is involved.
drop_row = np.zeros(len(train_df), dtype=bool)
for column, values in TRAIN_DROP_VALUES.items():
    drop_row |= train_df[column].isin(values).values

# `.copy()` makes the filtered result an independent frame, so later column
# assignments on `train_df` do not write into a slice of the unfiltered data.
train_df = train_df.loc[~drop_row].copy()

train_df.shape, test_df.shape

((99918, 51), (50000, 50))

In [5]:
# Integer code for each class label: the label's position in this tuple.
class_labels = ('Class_1', 'Class_2', 'Class_3', 'Class_4')
class_map = {label: code for code, label in enumerate(class_labels)}

# Replace the string labels with their integer codes. `Series.map` with a dict
# yields NaN for any value that is not a key, so this cell is meant to run once
# on the original string labels.
train_df['target'] = train_df['target'].map(class_map)

## Prepare data for model training

In [6]:
# NOTE(security): unpickling can execute arbitrary code. Only load this file
# because it is produced by our own preprocessing notebook; never point this at
# an untrusted pickle.
# Stream straight from the file handle instead of reading the whole file into a
# bytes object first, so no extra copy of the raw pickle stays alive in the
# kernel after loading.
with open("../input/tps-may-data-preprocess-v2/TPS_May_Dataset.txt", 'rb') as handle:
    processed_data = pickle.load(handle)

# The pickle holds a dict with the preprocessed train and test frames.
train_df1 = processed_data['train_df']
test_df1 = processed_data['test_df']
print("train_df1: {} \ntest_df1: {}".format(train_df1.shape, test_df1.shape))

# Drop the container dict (the frames stay referenced by the names above) and
# ask the garbage collector to reclaim what it can.
del processed_data
gc.collect()

train_df1: (99918, 323) 
test_df1: (50000, 322)


62

In [7]:
# Run every preprocessed column except 'target' through the embedding model
# (train first, then test) and wrap the activations in frames that reuse the
# preprocessed frames' indexes.
feature_cols = train_df1.columns != 'target'
Xtrain_embed_df = pd.DataFrame(
    feature_model.predict(train_df1.loc[:, feature_cols].values, verbose=1),
    index=train_df1.index,
)
Xtest_embed_df = pd.DataFrame(
    feature_model.predict(test_df1.values, verbose=1),
    index=test_df1.index,
)

# Give the original frames the preprocessed frames' indexes (assigned
# positionally, so both are assumed to be in the same row order - TODO confirm),
# then attach the embedding columns by merging on 'id'.
# NOTE(review): the merge key 'id' must resolve in both frames; presumably the
# preprocessed index is named 'id' - verify against the preprocessing notebook.
train_df.index = train_df1.index
train_df = pd.merge(train_df, Xtrain_embed_df, on='id', sort=False)

test_df.index = test_df1.index
test_df = pd.merge(test_df, Xtest_embed_df, on='id', sort=False)

# Attach the target only AFTER the merge: adding it to Xtrain_embed_df earlier
# would make 'target' appear on both sides of the train merge.
for frame in (Xtrain_embed_df, train_df):
    frame['target'] = train_df1['target']

print("\n\nXtrain_embed_df: {} \nXtest_embed_df: {}".format(Xtrain_embed_df.shape, Xtest_embed_df.shape))
print("\n\ntrain_df: {} \ntest_df: {}".format(train_df.shape, test_df.shape))

# No standalone prediction arrays are kept as named variables; just trigger a
# collection pass.
gc.collect()



Xtrain_embed_df: (99918, 1025) 
Xtest_embed_df: (50000, 1024)


train_df: (99918, 1075) 
test_df: (50000, 1074)


1389

## Save the processed datasets

In [8]:
# Bundle the frames that combine the original features with the embedding
# columns (train_df also carries 'target') under the same keys the input pickle
# uses.
data_dict = {
    'train_df': train_df,
    'test_df': test_df,
}

# The context manager guarantees the file is flushed and closed even if
# pickling raises part-way through.
with open("./TPS_May_Dataset_w_Org.txt", 'wb') as file:
    pickle.dump(data_dict, file)

In [9]:
# Bundle the embedding-only frames (Xtrain_embed_df also carries 'target')
# under the same keys the input pickle uses.
data_dict = {
    'train_df': Xtrain_embed_df,
    'test_df': Xtest_embed_df,
}

# The context manager guarantees the file is flushed and closed even if
# pickling raises part-way through.
with open("./TPS_May_Dataset_wo_Org.txt", 'wb') as file:
    pickle.dump(data_dict, file)