## Import libraries

In [1]:
import gc
import pickle
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.metrics import log_loss
from sklearn.model_selection import StratifiedKFold

## Prepare data for model training

In [2]:
# Load the preprocessed train/test frames (presumably produced by the upstream
# "tps-may-data-preprocess" notebook -- verify against that step).
# SECURITY: unpickling can execute arbitrary code embedded in the file; only
# load pickles that come from a trusted source.
with open("../input/tps-may-data-preprocess/TPS_May_Dataset_w_Quantile.txt", 'rb') as handle:
    # Unpickle straight from the file object so the raw byte buffer is never
    # kept alive in a notebook-level variable next to the decoded frames.
    processed_data = pickle.load(handle)

train_df = processed_data['train_df']
test_df = processed_data['test_df']

# Drop the container dict; the two frames stay reachable through their own names.
del processed_data
gc.collect()

0

In [3]:
# Split the training frame into a feature matrix and the target vector, and
# keep a one-hot copy of the target for scoring with log_loss later on.
target_col = train_df['target']
is_feature_col = train_df.columns != 'target'

Xtrain = train_df.loc[:, is_feature_col].values
Ytrain = target_col.values
Ytrain_oh = pd.get_dummies(target_col).values
Xtest = test_df.values

shapes = (Xtrain.shape, Ytrain.shape, Ytrain_oh.shape, Xtest.shape)
print("Xtrain: {} \nYtrain: {} \nYtrain_oh: {} \nXtest: {}".format(*shapes))

# The frames are no longer needed once the arrays above exist; release them.
del train_df, test_df
gc.collect()

Xtrain: (99918, 951) 
Ytrain: (99918,) 
Ytrain_oh: (99918, 4) 
Xtest: (50000, 951)


0

## Build and validate the model

In [4]:
# LightGBM configuration for the 4-class model, written as a single literal so
# the whole setup can be read at a glance.
params = {
    "objective": 'multiclass',
    "metric": 'multi_logloss',
    "boosting": 'gbdt',
    # "device_type": 'gpu',
    "num_class": 4,
    # NOTE(review): the LightGBM parameter docs list is_unbalance as used only
    # in binary and multiclassova applications, so it is probably a no-op with
    # objective='multiclass' -- confirm before relying on it.
    "is_unbalance": True,
    "learning_rate": 0.02,
    "lambda_l2": 0.0256,
    "num_leaves": 52,
    "max_depth": 10,
    "feature_fraction": 0.503,
    "bagging_fraction": 0.741,
    "bagging_freq": 12,
    "bagging_seed": 10,
    "min_data_in_leaf": 10,
    "verbosity": -1,
}

# Upper bound on boosting rounds passed to lgb.train.
num_rounds = 5000

In [5]:
FOLD = 10
NUM_SEED = 3
# Number of target classes, taken from the model config so the prediction
# buffers below always have the same width as the per-class probabilities
# that are accumulated into them.
NUM_CLASS = params['num_class']

# Prediction Clipping Thresholds
p_min = 0.025
p_max = 1 - p_min

# Draw the fold-shuffling seeds reproducibly.
np.random.seed(3)
seeds = np.random.randint(0, 100, size=NUM_SEED)

# Accumulators:
#   oof_score        - sum of per-fold validation log-losses (averaged at the end)
#   y_pred_meta_lgb  - out-of-fold predictions, summed over seeds
#   y_pred_final_lgb - test predictions, summed over every trained model
#   counter          - number of models trained so far (NUM_SEED * FOLD in total)
oof_score = 0
y_pred_meta_lgb = np.zeros((Ytrain.shape[0], NUM_CLASS))
y_pred_final_lgb = np.zeros((Xtest.shape[0], NUM_CLASS))
counter = 0


# NOTE(review): `seed` only controls how the folds are shuffled; the RNG
# settings inside `params` (e.g. bagging_seed) are the same for every seed.
# Confirm that repeated K-fold with a fixed model seed is what is intended.
for seed in seeds:
    seed_score = 0

    kfold = StratifiedKFold(n_splits=FOLD, shuffle=True, random_state=seed)

    for idx, (train, val) in enumerate(kfold.split(Xtrain, Ytrain)):
        counter += 1

        train_x, train_y = Xtrain[train], Ytrain[train]
        # The one-hot target is only needed for the validation rows (scoring).
        val_x, val_y, val_y_oh = Xtrain[val], Ytrain[val], Ytrain_oh[val]

        lgtrain = lgb.Dataset(train_x, label=train_y.ravel())
        lgvalidation = lgb.Dataset(val_x, label=val_y.ravel())

        # NOTE(review): the early_stopping_rounds / verbose_eval keyword
        # arguments were removed from lgb.train in LightGBM 4.0 in favour of
        # callbacks (lgb.early_stopping, lgb.log_evaluation). Left unchanged so
        # the cell keeps running on the LightGBM version it was written for.
        model = lgb.train(params, lgtrain, num_rounds, 
                          valid_sets=[lgtrain, lgvalidation], 
                          early_stopping_rounds=200, verbose_eval=100)

        # Out-of-fold predictions are clipped before being stored and scored;
        # test predictions are accumulated unclipped.
        y_pred = model.predict(val_x, num_iteration=model.best_iteration)
        y_pred = np.clip(y_pred, p_min, p_max)
        y_pred_meta_lgb[val] += y_pred
        y_pred_final_lgb += model.predict(Xtest, num_iteration=model.best_iteration)
        
        score = log_loss(val_y_oh, y_pred)
        oof_score += score
        seed_score += score
        print("Seed-{} | Fold-{} | OOF Score: {}".format(seed, idx, score))
    
    print("\nSeed: {} | Aggregate OOF Score: {}\n\n".format(seed, (seed_score / FOLD)))


# Each training row is predicted once per seed; each test row once per model.
y_pred_meta_lgb = y_pred_meta_lgb / float(NUM_SEED)
y_pred_final_lgb = y_pred_final_lgb / float(counter)
oof_score /= float(counter)
print("Aggregate OOF Score: {}".format(oof_score))

Training until validation scores don't improve for 200 rounds
[100]	training's multi_logloss: 1.07536	valid_1's multi_logloss: 1.09952
[200]	training's multi_logloss: 1.04822	valid_1's multi_logloss: 1.09454
[300]	training's multi_logloss: 1.027	valid_1's multi_logloss: 1.09293
[400]	training's multi_logloss: 1.00849	valid_1's multi_logloss: 1.09261
[500]	training's multi_logloss: 0.991938	valid_1's multi_logloss: 1.09351
Early stopping, best iteration is:
[390]	training's multi_logloss: 1.01024	valid_1's multi_logloss: 1.0925
Seed-24 | Fold-0 | OOF Score: 1.0924997531613356
Training until validation scores don't improve for 200 rounds
[100]	training's multi_logloss: 1.07543	valid_1's multi_logloss: 1.09926
[200]	training's multi_logloss: 1.04818	valid_1's multi_logloss: 1.09387
[300]	training's multi_logloss: 1.02669	valid_1's multi_logloss: 1.09187
[400]	training's multi_logloss: 1.00813	valid_1's multi_logloss: 1.09177
[500]	training's multi_logloss: 0.991727	valid_1's multi_logloss

In [6]:
# Persist the averaged out-of-fold predictions, the aggregate OOF score and the
# averaged test predictions in one compressed archive.
meta_features = {
    'y_pred_meta_lgb': y_pred_meta_lgb,
    'oof_score': oof_score,
    'y_pred_final_lgb': y_pred_final_lgb,
}
np.savez_compressed('./LGB_Meta_Features.npz', **meta_features)

## Create submission file

In [7]:
# Clip the averaged test probabilities to [p_min, p_max] before submitting.
y_pred_final_lgb = y_pred_final_lgb.clip(p_min, p_max)

# The raw test file is re-read only to recover the `id` column.
test_df = pd.read_csv("../input/tabular-playground-series-may-2021/test.csv")

submit_df = pd.DataFrame()
submit_df['id'] = test_df['id']
# One probability column per class, in prediction-column order.
for col_idx, col_name in enumerate(['Class_1', 'Class_2', 'Class_3', 'Class_4']):
    submit_df[col_name] = y_pred_final_lgb[:, col_idx]
submit_df.head()

Unnamed: 0,id,Class_1,Class_2,Class_3,Class_4
0,100000,0.086429,0.594643,0.201267,0.117662
1,100001,0.074733,0.673329,0.167697,0.084242
2,100002,0.080657,0.646302,0.177677,0.095364
3,100003,0.077064,0.552272,0.259954,0.110709
4,100004,0.074398,0.621289,0.200194,0.104119


In [8]:
# Write the submission file without the DataFrame index column.
submission_path = "./LGB_submission.csv"
submit_df.to_csv(submission_path, index=False)