## Import libraries

In [1]:
import gc
import pickle
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Model, load_model
from tensorflow_addons.optimizers import AdamW, Lookahead

## Load autoencoder model

In [2]:
# Location of the saved autoencoder checkpoint (.h5 file).
DAE_MODEL_PATH = '../input/mathcothon-th-re-shalt-beest-noise/DAE_model.h5'

# Restore the full autoencoder, then expose the output of its layer named
# 'Embedding' through a sub-model that shares the autoencoder's inputs.
autoencoder = load_model(DAE_MODEL_PATH)
feature_model = Model(
    inputs=autoencoder.input,
    outputs=autoencoder.get_layer('Embedding').output,
)
feature_model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Categorical Features (InputLaye [(None, 29)]         0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 29, 8)        1024        Categorical Features[0][0]       
__________________________________________________________________________________________________
layer_normalization (LayerNorma (None, 29, 8)        16          embedding[0][0]                  
__________________________________________________________________________________________________
conv1d (Conv1D)                 (None, 29, 8)        72          layer_normalization[0][0]        
______________________________________________________________________________________________

## Prepare data for model training

In [3]:
# Read the pickled dict holding the prepared train/test frames.
# NOTE(review): unpickling can execute arbitrary code -- only load files from a trusted source.
# Stream straight from the file handle so the raw pickle bytes are not kept
# alive in a separate variable after the frames have been materialised.
with open("../input/mathcothon-i-shalt-prepareth-data/MathCoThon_Ready_Meatballs.txt", 'rb') as handle:
    processed_data = pickle.load(handle)

# Remove the 'Price' column from the training frame; a non-mutating drop keeps
# this cell safe to re-run and leaves the loaded object untouched.
train_df = processed_data['train_df'].drop(columns=['Price'])
test_df = processed_data['test_df']
print(f"train_df: {train_df.shape} \ntest_df: {test_df.shape}")

# Release the container dict (and the un-dropped training frame it references).
del processed_data
gc.collect()

train_df: (15628, 893) 
test_df: (8245, 893)


0

In [4]:
# Columns passed to the model as its categorical feature group.
cat_cols = [
    'Manufacturer', 'Model', 'Category', 'Leather interior', 'Cylinders',
    'Doors', 'Wheel', 'Color', 'Airbags', 'Turbo_Engine', 'Leap_Year',
    'Hybrid_Car', 'Numbers_in_Model', 'Drive wheels Front', 'Drive wheels Rear',
    'Manual_Gear', 'Automatic_Gear', 'Continuous_Gear', 'Fuel Tank Petrol',
    'Fuel Tank Diesel', 'Fuel Tank Gas',
    'id0', 'id1', 'id2', 'id3', 'id4', 'id5', 'id6', 'id7',
]

# Vectorised columns are identified purely by their name prefix.
vec_cols = [col for col in test_df.columns if col.startswith(('tfidf_', 'countvec_'))]

# Whatever is neither categorical nor vectorised forms the numeric group
# (order follows test_df.columns).
non_numeric = set(cat_cols) | set(vec_cols)
num_cols = [col for col in test_df.columns if col not in non_numeric]

# Cast the categorical columns to integers in both frames.
train_df[cat_cols] = train_df[cat_cols].astype(int)
test_df[cat_cols] = test_df[cat_cols].astype(int)

In [5]:
# Min-max scale every column, fitting on the training frame only and applying
# the fitted scaler to both frames.
# NOTE(review): the scaler is fitted on all columns, including the categorical
# columns that were cast to int earlier, so those are converted to scaled floats
# as well -- confirm this matches how the autoencoder was trained.
feature_cols = train_df.columns

# The scaler works by column position, so put the test frame in the training
# column order first; otherwise a column could be scaled with another
# column's fitted min/max.
test_df = test_df[feature_cols]

scaler = MinMaxScaler().fit(train_df)
temp_x1 = scaler.transform(train_df)
temp_x2 = scaler.transform(test_df)

# Rebuild the frames with each array's own column labels and original index
# (the training frame was previously labelled with the test frame's columns).
train_df = pd.DataFrame(temp_x1, columns=feature_cols, index=train_df.index)
test_df = pd.DataFrame(temp_x2, columns=feature_cols, index=test_df.index)

In [6]:
def _column_groups(frame):
    """Return the [categorical, numeric, vectorised] column groups of `frame`, in that order."""
    return [frame[cat_cols], frame[num_cols], frame[vec_cols]]


# Run both frames through the feature model to obtain their embeddings.
Xtrain_embed = feature_model.predict(_column_groups(train_df), verbose=1)
Xtest_embed = feature_model.predict(_column_groups(test_df), verbose=1)

# Name the embedding dimensions dae_0 .. dae_{k-1} and keep the original row index.
embed_width = Xtrain_embed.shape[1]
col_list = [f'dae_{i}' for i in range(embed_width)]
Xtrain_embed_df = pd.DataFrame(Xtrain_embed, index=train_df.index, columns=col_list)
Xtest_embed_df = pd.DataFrame(Xtest_embed, index=test_df.index, columns=col_list)

print(f"\nXtrain_embed_df: {Xtrain_embed_df.shape} \nXtest_embed_df: {Xtest_embed_df.shape}")

# Drop the raw prediction arrays and both models, then collect garbage.
del Xtrain_embed, Xtest_embed, feature_model, autoencoder
gc.collect()


Xtrain_embed_df: (15628, 256) 
Xtest_embed_df: (8245, 256)


4060

## Save the processed datasets

In [7]:
# Bundle the train/test embedding frames under fixed keys and pickle them to disk.
data_dict = {
    'dae_train_df': Xtrain_embed_df,
    'dae_test_df': Xtest_embed_df,
}

# The context manager closes the handle even if pickling raises part-way through.
with open("./MathCoThon_Denoised_Meatballs.txt", 'wb') as file:
    pickle.dump(data_dict, file)