## Import libraries

In [24]:
import gc
import pickle

import numpy as np
import pandas as pd
import tensorflow as tf
from tqdm import tqdm
from sklearn.metrics import log_loss
from sklearn.model_selection import StratifiedKFold

from tensorflow.keras.regularizers import l2
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.callbacks import ReduceLROnPlateau
from tensorflow_addons.optimizers import AdamW, Lookahead
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.layers import Concatenate, Add
from tensorflow.keras.layers import Activation, Input
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras.layers import Embedding, Reshape

## Prepare data for model training

In [52]:
# Load the training set indexed by `id`, then turn the "Class_N" labels into
# integers: Class_1..Class_8 keep their number and Class_9 is stored as 0.
train_df = pd.read_csv(
    "../input/tabular-playground-series-jun-2021/train.csv",
    index_col="id",
)
class_numbers = (
    train_df['target']
    .str.replace("Class_", "", regex=False)
    .astype('int64')
)
train_df['target'] = class_numbers.where(class_numbers != 9, 0)
train_df.head()

Unnamed: 0_level_0,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,...,feature_66,feature_67,feature_68,feature_69,feature_70,feature_71,feature_72,feature_73,feature_74,target
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,0,6,1,0,0,0,0,7,0,...,0,0,0,0,0,0,2,0,0,6
1,0,0,0,0,0,0,0,0,0,0,...,2,0,0,0,0,0,0,1,0,6
2,0,0,0,0,0,1,0,3,0,0,...,0,0,0,0,1,0,0,0,0,2
3,0,0,7,0,1,5,2,2,0,1,...,0,4,0,2,2,0,4,3,0,8
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2


In [53]:
# Load the test set with `id` as the index, so every remaining column of
# `test_df` is a feature column.
test_df = pd.read_csv(
    "../input/tabular-playground-series-jun-2021/test.csv",
    index_col="id",
)
test_df.head()

Unnamed: 0_level_0,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,...,feature_65,feature_66,feature_67,feature_68,feature_69,feature_70,feature_71,feature_72,feature_73,feature_74
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
200000,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
200001,1,2,0,0,0,0,0,0,0,0,...,3,1,3,0,0,0,0,3,0,0
200002,0,1,7,1,0,0,0,0,6,0,...,3,0,0,0,0,3,0,2,0,0
200003,0,0,0,4,3,1,0,0,0,0,...,0,0,0,1,0,0,0,4,0,0
200004,0,0,5,0,0,0,0,0,0,8,...,0,0,0,0,0,0,0,0,1,0


In [54]:
# A feature goes into `cat_cols` when train and test contain exactly the same
# set of distinct values; those columns are later fed through embeddings.
cat_cols = [
    col
    for col in tqdm(test_df.columns)
    if sorted(train_df[col].unique().tolist()) == sorted(test_df[col].unique().tolist())
]

print(len(cat_cols))

100%|██████████| 75/75 [00:00<00:00, 483.77it/s]

69





In [55]:
# Cast the embedding-bound columns to integers in both frames.
for frame in (train_df, test_df):
    frame[cat_cols] = frame[cat_cols].astype(int)

# Everything that is not in `cat_cols` is fed to the network as a raw
# numeric input.
cat_col_set = set(cat_cols)
num_cols = [col for col in test_df.columns if col not in cat_col_set]
len(cat_cols), len(num_cols)

(69, 6)

In [56]:
# Per-class weights passed to `model.fit(class_weight=...)`: the rarest class
# gets weight 1.0 and every other class gets (rarest count / its own count).
#
# The mapping is keyed by the actual `target` label. The previous version was
# keyed by the positional row index of the grouped frame, which only matched
# the labels because they happen to be the contiguous integers 0..8.
class_counts = train_df['target'].value_counts().sort_index()
rarest_count = class_counts.min()
class_weight = {
    int(label): float(rarest_count / count)
    for label, count in class_counts.items()
}
class_weight

{0: 0.11995928274998044,
 1: 0.3360386049572275,
 2: 0.12541443248332038,
 3: 0.20705500743343697,
 4: 0.6513605442176871,
 5: 1.0,
 6: 0.059138020883596154,
 7: 0.20746157492044148,
 8: 0.059192859764696786}

## Build the model

In [63]:
def dnn_model(data, cat_cols, num_cols):
    """Build the (uncompiled) embedding + dense classifier.

    Each column in `cat_cols` gets its own scalar input and an Embedding
    layer named after the column. The embedded vectors are concatenated with
    one dense numeric input and passed through two
    Dense -> BatchNorm -> ReLU -> Dropout stages and a 9-way softmax.

    Parameters
    ----------
    data : DataFrame-like
        Frame used only to size the embeddings; `data[c]` must provide
        `.nunique()` and `.max()` for every `c` in `cat_cols`.
    cat_cols : iterable of str
        Columns to embed. One model input is created per column, in order.
    num_cols : int
        Width of the numeric input, i.e. the NUMBER of numeric features
        (despite the name, this is a count and not a list of column names).

    Returns
    -------
    tensorflow.keras.Model
        Model named 'DNN_Model' whose inputs are the nested structure
        `[[<one input per cat col>], numeric_input]`; callers must feed data
        in that same layout.
    """
    inputs = []
    outputs = []
    for c in cat_cols:
        num_unique_values = int(data[c].nunique())
        embed_dim = int(min(np.ceil((num_unique_values)/2), 50))
        # An Embedding needs input_dim > the largest index it will look up.
        # Size it from the largest code actually present instead of the old
        # `nunique + 250` guess, which wasted rows for small columns and did
        # not guarantee coverage for sparse, wide-ranging ones.
        # NOTE(review): assumes the column holds non-negative integer codes.
        vocab_size = int(data[c].max()) + 1
        inp = Input(shape=(1,))
        out = Embedding(vocab_size, embed_dim, name=c)(inp)
        # Drop the length-1 sequence axis: (batch, 1, dim) -> (batch, dim).
        out = Reshape(target_shape=(embed_dim, ))(out)
        inputs.append(inp)
        outputs.append(out)

    num_input = Input(shape=(num_cols,))
    outputs.append(num_input)

    x = Concatenate()(outputs)
    x = BatchNormalization()(x)

    x = Dense(units=512, kernel_initializer='he_uniform',
                kernel_regularizer=l2(0.0001))(x)
    x = BatchNormalization()(x)
    x = Activation('relu')(x)
    x = Dropout(rate=0.5)(x)

    x = Dense(units=128, kernel_initializer='he_uniform',
                kernel_regularizer=l2(0.0001))(x)
    x = BatchNormalization()(x)
    x = Activation('relu')(x)
    x = Dropout(rate=0.5)(x)

    x_output = Dense(units=9, activation='softmax',
                     kernel_initializer='he_uniform')(x)

    # Keep the nested input layout: the fit/predict calls pass
    # [[cat arrays...], numeric frame] and rely on it.
    model = Model(inputs=[inputs, num_input], outputs=x_output,
                  name='DNN_Model')
    return model

In [64]:
# Build and compile one model so the architecture can be inspected with
# `summary()`.
model = dnn_model(train_df, cat_cols, len(num_cols))

# AdamW wrapped in Lookahead. `learning_rate` replaces the deprecated `lr`
# keyword alias; the value is unchanged.
base_optimizer = AdamW(learning_rate=1e-2,
                       weight_decay=1e-5,
                       clipvalue=700)
model.compile(loss='categorical_crossentropy',
              optimizer=Lookahead(base_optimizer, sync_period=10))
model.summary()

Model: "DNN_Model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_913 (InputLayer)          [(None, 1)]          0                                            
__________________________________________________________________________________________________
input_914 (InputLayer)          [(None, 1)]          0                                            
__________________________________________________________________________________________________
input_915 (InputLayer)          [(None, 1)]          0                                            
__________________________________________________________________________________________________
input_916 (InputLayer)          [(None, 1)]          0                                            
__________________________________________________________________________________________

In [65]:
FOLD = 10
NUM_SEED = 3
VERBOSE = 1

# Prediction Clipping Thresholds
p_min = 0.025
p_max = 1 - p_min

np.random.seed(3)
seeds = np.random.randint(0, 100, size=NUM_SEED)

oof_score = 0
# Out-of-fold predictions for every training row (summed over seeds) and
# test predictions (summed over every fitted model); both are averaged after
# the loops.
y_pred_meta_dnn = np.zeros((train_df.shape[0], 9))
y_pred_final_dnn = np.zeros((test_df.shape[0], 9))
counter = 0
mini_batch_size = 128
checkpoint_path = './DNN_model.h5'


def make_model_inputs(frame):
    """Arrange `frame` in the nested [[cat columns...], numeric] layout that
    `dnn_model` declares for its inputs."""
    return [[frame[col] for col in cat_cols], frame[num_cols]]


# The test inputs are identical for every fold, so build them once.
test_inputs = make_model_inputs(test_df)

for seed in seeds:
    seed_score = 0

    kfold = StratifiedKFold(n_splits=FOLD, shuffle=True, random_state=seed)

    for idx, (train, val) in enumerate(kfold.split(train_df.loc[:, train_df.columns!='target'], train_df['target'])):
        counter += 1

        # Release graph/layer state left over from the previous fold so the
        # 30 fits do not accumulate memory, and seed TensorFlow so that weight
        # initialisation and dropout are repeatable for a given seed.
        tf.keras.backend.clear_session()
        tf.random.set_seed(int(seed))

        # Positional indexing with an integer array already returns new
        # frames, so no extra `.copy()` is needed.
        train_, val_ = train_df.iloc[train], train_df.iloc[val]
        Ytrain = to_categorical(train_['target'].values, 9)
        Yval = to_categorical(val_['target'], 9)
        val_inputs = make_model_inputs(val_)

        model = dnn_model(train_df, cat_cols, len(num_cols))
        model.compile(loss='categorical_crossentropy',
                      optimizer=Lookahead(AdamW(learning_rate=1e-2,
                                                weight_decay=1e-5,
                                                clipvalue=700),
                                          sync_period=10))

        early = EarlyStopping(monitor="val_loss", mode="min",
                              restore_best_weights=True,
                              patience=7, verbose=VERBOSE)

        reduce_lr = ReduceLROnPlateau(monitor="val_loss", factor=0.1,
                                      min_lr=1e-6, patience=4,
                                      verbose=VERBOSE, mode='min')

        chk_point = ModelCheckpoint(checkpoint_path,
                                    monitor='val_loss', verbose=VERBOSE,
                                    save_best_only=True, mode='min')

        model.fit(
            make_model_inputs(train_), Ytrain,
            batch_size=mini_batch_size,
            class_weight=class_weight,
            epochs=250,
            verbose=VERBOSE,
            workers=5,
            callbacks=[reduce_lr, early, chk_point],
            validation_data=(val_inputs, Yval)
        )

        # Reload the best checkpoint for inference only. `compile=False`
        # skips rebuilding the Lookahead/AdamW optimizer, which `predict`
        # never uses.
        model = load_model(checkpoint_path, compile=False)

        # OOF predictions are clipped before scoring and accumulation; the
        # test predictions are accumulated raw.
        y_pred = model.predict(val_inputs)
        y_pred = np.clip(y_pred, p_min, p_max)
        y_pred_meta_dnn[val] += y_pred
        y_pred_final_dnn += model.predict(test_inputs)

        score = log_loss(Yval, y_pred)
        oof_score += score
        seed_score += score
        print("Seed-{} | Fold-{} | OOF Score: {}".format(seed, idx, score))

        # Free this fold's model before the next one is built.
        del model
        gc.collect()

    print("\nSeed: {} | Aggregate OOF Score: {}\n\n".format(seed, (seed_score / FOLD)))


# Every training row is in the validation split exactly once per seed, so the
# OOF sum is divided by the number of seeds; the test sum received one
# prediction per fitted model, so it is divided by `counter`.
y_pred_meta_dnn = y_pred_meta_dnn / float(NUM_SEED)
y_pred_final_dnn = y_pred_final_dnn / float(counter)
oof_score /= float(counter)
print("Aggregate OOF Score: {}".format(oof_score))

Epoch 1/250

Epoch 00001: val_loss improved from inf to 2.13232, saving model to ./DNN_model.h5
Epoch 2/250
 271/1407 [====>.........................] - ETA: 42s - loss: 0.3183

KeyboardInterrupt: 

In [None]:
# Persist the out-of-fold predictions, the aggregate OOF score and the
# averaged test predictions in one compressed archive.
meta_features = {
    'y_pred_meta_dnn': y_pred_meta_dnn,
    'oof_score': oof_score,
    'y_pred_final_dnn': y_pred_final_dnn,
}
np.savez_compressed('./DNN_Meta_Features.npz', **meta_features)

## Create submission file

In [None]:
# Clip the averaged test probabilities to [p_min, p_max] before writing them.
y_pred_final_dnn = np.clip(y_pred_final_dnn, p_min, p_max)

# Take the ids from the frame that is already loaded instead of re-reading
# test.csv into `test_df`: overwriting the preprocessed `test_df` with the
# raw file made earlier cells fail or misbehave when re-run.
if 'id' in test_df.columns:
    test_ids = test_df['id'].values
else:
    test_ids = test_df.index.values

submit_df = pd.DataFrame({'id': test_ids})
# Model output column k holds Class_k for k = 1..8, while Class_9 was encoded
# as label 0 during training, hence `class_number % 9`.
for class_number in range(1, 10):
    submit_df['Class_{}'.format(class_number)] = y_pred_final_dnn[:, class_number % 9]
submit_df.head()

In [None]:
# Write the submission file; the DataFrame index is left out so the CSV
# contains only `id` and the nine class columns.
submit_df.to_csv(path_or_buf="./DNN_submission.csv", index=False)