## Import libraries

In [1]:
import gc
import pickle
import numpy as np
import pandas as pd
from xgboost import XGBClassifier
from sklearn.metrics import log_loss
from sklearn.model_selection import StratifiedKFold

## Prepare data for model training

In [2]:
with open("../input/tps-may-data-preprocess/TPS_May_Dataset_w_Quantile.txt", 'rb') as handle: 
    data = handle.read()

processed_data = pickle.loads(data)
train_df = processed_data['train_df']
test_df = processed_data['test_df']

del processed_data
gc.collect()

0

In [3]:
Xtrain = train_df.loc[:, train_df.columns != 'target'].values
Ytrain = train_df['target'].values
Ytrain_oh = pd.get_dummies(train_df['target']).values
Xtest = test_df.values

print("Xtrain: {} \nYtrain: {} \nYtrain_oh: {} \nXtest: {}".format(Xtrain.shape, Ytrain.shape, 
                                                                   Ytrain_oh.shape, Xtest.shape))

del train_df
del test_df
gc.collect()

Xtrain: (99918, 951) 
Ytrain: (99918,) 
Ytrain_oh: (99918, 4) 
Xtest: (50000, 951)


0

## Build and validate the model

In [4]:
FOLD = 10
NUM_SEED = 3

# Prediction Clipping Thresholds
p_min = 0.025
p_max = 1 - p_min

np.random.seed(3)
seeds = np.random.randint(0, 100, size=NUM_SEED)

oof_score = 0
y_pred_meta_xgb = np.zeros((Ytrain.shape[0], 4))
y_pred_final_xgb = np.zeros((Xtest.shape[0], 4))
counter = 0


for sidx, seed in enumerate(seeds):
    seed_score = 0
    
    kfold = StratifiedKFold(n_splits=FOLD, shuffle=True, random_state=seed)

    for idx, (train, val) in enumerate(kfold.split(Xtrain, Ytrain)):
        counter += 1

        train_x, train_y, train_y_oh = Xtrain[train], Ytrain[train], Ytrain_oh[train]
        val_x, val_y, val_y_oh = Xtrain[val], Ytrain[val], Ytrain_oh[val]
        
        model = XGBClassifier(
            objective='multi:softmax',
            eval_metric='mlogloss',
            booster='gbtree',
            sample_type='weighted',
            tree_method='gpu_hist',
            grow_policy='lossguide',
            use_label_encoder=False,
            num_round=5000,
            num_class=4,
            max_depth=10, 
            max_leaves=53,
            learning_rate=0.0782,
            subsample=0.76,
            colsample_bytree=0.5216,
            min_child_weight=10,
            reg_lambda=0.1515,
            verbosity=0
        )

        model.fit(train_x, train_y, eval_set=[(train_x, train_y), (val_x, val_y)], 
                  early_stopping_rounds=200, verbose=50)

        y_pred = model.predict_proba(val_x, iteration_range=(0, model.best_iteration))
        y_pred = np.clip(y_pred, p_min, p_max)
        y_pred_meta_xgb[val] += y_pred
        y_pred_final_xgb += model.predict_proba(Xtest, iteration_range=(0, model.best_iteration))
        
        score = log_loss(val_y_oh, y_pred)
        oof_score += score
        seed_score += score
        print("Seed-{} | Fold-{} | OOF Score: {}".format(seed, idx, score))
    
    print("\nSeed: {} | Aggregate OOF Score: {}\n\n".format(seed, (seed_score / FOLD)))


y_pred_meta_xgb = y_pred_meta_xgb / float(NUM_SEED)
y_pred_final_xgb = y_pred_final_xgb / float(counter)
oof_score /= float(counter)
print("Aggregate OOF Score: {}".format(oof_score))

[0]	validation_0-mlogloss:1.35435	validation_1-mlogloss:1.35482
[50]	validation_0-mlogloss:1.07275	validation_1-mlogloss:1.09797
[99]	validation_0-mlogloss:1.04428	validation_1-mlogloss:1.09338


  "because it will generate extra copies and increase " +


Seed-24 | Fold-0 | OOF Score: 1.093378456416812
[0]	validation_0-mlogloss:1.35434	validation_1-mlogloss:1.35487
[50]	validation_0-mlogloss:1.07254	validation_1-mlogloss:1.09724
[99]	validation_0-mlogloss:1.04418	validation_1-mlogloss:1.09234


  "because it will generate extra copies and increase " +


Seed-24 | Fold-1 | OOF Score: 1.09241167474553
[0]	validation_0-mlogloss:1.35430	validation_1-mlogloss:1.35493
[50]	validation_0-mlogloss:1.07223	validation_1-mlogloss:1.10069
[99]	validation_0-mlogloss:1.04365	validation_1-mlogloss:1.09716


  "because it will generate extra copies and increase " +


Seed-24 | Fold-2 | OOF Score: 1.0971003483438058
[0]	validation_0-mlogloss:1.35435	validation_1-mlogloss:1.35486
[50]	validation_0-mlogloss:1.07268	validation_1-mlogloss:1.09667
[99]	validation_0-mlogloss:1.04419	validation_1-mlogloss:1.09083


  "because it will generate extra copies and increase " +


Seed-24 | Fold-3 | OOF Score: 1.0909582990213478
[0]	validation_0-mlogloss:1.35428	validation_1-mlogloss:1.35475
[50]	validation_0-mlogloss:1.07247	validation_1-mlogloss:1.09849
[99]	validation_0-mlogloss:1.04394	validation_1-mlogloss:1.09486


  "because it will generate extra copies and increase " +


Seed-24 | Fold-4 | OOF Score: 1.0948949756161916
[0]	validation_0-mlogloss:1.35427	validation_1-mlogloss:1.35482
[50]	validation_0-mlogloss:1.07260	validation_1-mlogloss:1.09885
[99]	validation_0-mlogloss:1.04397	validation_1-mlogloss:1.09399


  "because it will generate extra copies and increase " +


Seed-24 | Fold-5 | OOF Score: 1.0940125742054778
[0]	validation_0-mlogloss:1.35426	validation_1-mlogloss:1.35475
[50]	validation_0-mlogloss:1.07250	validation_1-mlogloss:1.09808
[99]	validation_0-mlogloss:1.04410	validation_1-mlogloss:1.09244


  "because it will generate extra copies and increase " +


Seed-24 | Fold-6 | OOF Score: 1.0924605977333026
[0]	validation_0-mlogloss:1.35419	validation_1-mlogloss:1.35485
[50]	validation_0-mlogloss:1.07237	validation_1-mlogloss:1.09975
[99]	validation_0-mlogloss:1.04371	validation_1-mlogloss:1.09528


  "because it will generate extra copies and increase " +


Seed-24 | Fold-7 | OOF Score: 1.0952866780545543
[0]	validation_0-mlogloss:1.35439	validation_1-mlogloss:1.35481
[50]	validation_0-mlogloss:1.07298	validation_1-mlogloss:1.09765
[99]	validation_0-mlogloss:1.04452	validation_1-mlogloss:1.09275


  "because it will generate extra copies and increase " +


Seed-24 | Fold-8 | OOF Score: 1.0927854749506365
[0]	validation_0-mlogloss:1.35431	validation_1-mlogloss:1.35495
[50]	validation_0-mlogloss:1.07208	validation_1-mlogloss:1.09994
[99]	validation_0-mlogloss:1.04345	validation_1-mlogloss:1.09588


  "because it will generate extra copies and increase " +


Seed-24 | Fold-9 | OOF Score: 1.0959185325634657

Seed: 24 | Aggregate OOF Score: 1.0939207611651125


[0]	validation_0-mlogloss:1.35431	validation_1-mlogloss:1.35473
[50]	validation_0-mlogloss:1.07260	validation_1-mlogloss:1.09726
[99]	validation_0-mlogloss:1.04420	validation_1-mlogloss:1.09165


  "because it will generate extra copies and increase " +


Seed-3 | Fold-0 | OOF Score: 1.0916533213797763
[0]	validation_0-mlogloss:1.35421	validation_1-mlogloss:1.35473
[50]	validation_0-mlogloss:1.07257	validation_1-mlogloss:1.09824
[99]	validation_0-mlogloss:1.04412	validation_1-mlogloss:1.09261


  "because it will generate extra copies and increase " +


Seed-3 | Fold-1 | OOF Score: 1.0926950872287429
[0]	validation_0-mlogloss:1.35423	validation_1-mlogloss:1.35476
[50]	validation_0-mlogloss:1.07241	validation_1-mlogloss:1.09929
[99]	validation_0-mlogloss:1.04406	validation_1-mlogloss:1.09439


  "because it will generate extra copies and increase " +


Seed-3 | Fold-2 | OOF Score: 1.0944107535785084
[0]	validation_0-mlogloss:1.35437	validation_1-mlogloss:1.35496
[50]	validation_0-mlogloss:1.07228	validation_1-mlogloss:1.09960
[99]	validation_0-mlogloss:1.04365	validation_1-mlogloss:1.09527


  "because it will generate extra copies and increase " +


Seed-3 | Fold-3 | OOF Score: 1.0953617519836982
[0]	validation_0-mlogloss:1.35428	validation_1-mlogloss:1.35467
[50]	validation_0-mlogloss:1.07241	validation_1-mlogloss:1.09804
[99]	validation_0-mlogloss:1.04384	validation_1-mlogloss:1.09409


  "because it will generate extra copies and increase " +


Seed-3 | Fold-4 | OOF Score: 1.0941121386672803
[0]	validation_0-mlogloss:1.35426	validation_1-mlogloss:1.35493
[50]	validation_0-mlogloss:1.07223	validation_1-mlogloss:1.10009
[99]	validation_0-mlogloss:1.04377	validation_1-mlogloss:1.09645


  "because it will generate extra copies and increase " +


Seed-3 | Fold-5 | OOF Score: 1.0964953173727416
[0]	validation_0-mlogloss:1.35424	validation_1-mlogloss:1.35464
[50]	validation_0-mlogloss:1.07247	validation_1-mlogloss:1.09820
[99]	validation_0-mlogloss:1.04388	validation_1-mlogloss:1.09410


  "because it will generate extra copies and increase " +


Seed-3 | Fold-6 | OOF Score: 1.0940692737546442
[0]	validation_0-mlogloss:1.35430	validation_1-mlogloss:1.35485
[50]	validation_0-mlogloss:1.07262	validation_1-mlogloss:1.09857
[99]	validation_0-mlogloss:1.04428	validation_1-mlogloss:1.09361


  "because it will generate extra copies and increase " +


Seed-3 | Fold-7 | OOF Score: 1.0937307437194335
[0]	validation_0-mlogloss:1.35417	validation_1-mlogloss:1.35498
[50]	validation_0-mlogloss:1.07266	validation_1-mlogloss:1.09881
[99]	validation_0-mlogloss:1.04433	validation_1-mlogloss:1.09415


  "because it will generate extra copies and increase " +


Seed-3 | Fold-8 | OOF Score: 1.0941199998325348
[0]	validation_0-mlogloss:1.35431	validation_1-mlogloss:1.35493
[50]	validation_0-mlogloss:1.07205	validation_1-mlogloss:1.09992
[99]	validation_0-mlogloss:1.04355	validation_1-mlogloss:1.09550


  "because it will generate extra copies and increase " +


Seed-3 | Fold-9 | OOF Score: 1.09545347524225

Seed: 3 | Aggregate OOF Score: 1.094210186275961


[0]	validation_0-mlogloss:1.35427	validation_1-mlogloss:1.35501
[50]	validation_0-mlogloss:1.07239	validation_1-mlogloss:1.09828
[99]	validation_0-mlogloss:1.04385	validation_1-mlogloss:1.09322


  "because it will generate extra copies and increase " +


Seed-56 | Fold-0 | OOF Score: 1.0932599073730773
[0]	validation_0-mlogloss:1.35427	validation_1-mlogloss:1.35489
[50]	validation_0-mlogloss:1.07268	validation_1-mlogloss:1.09980
[99]	validation_0-mlogloss:1.04449	validation_1-mlogloss:1.09477


  "because it will generate extra copies and increase " +


Seed-56 | Fold-1 | OOF Score: 1.0947708842203654
[0]	validation_0-mlogloss:1.35428	validation_1-mlogloss:1.35483
[50]	validation_0-mlogloss:1.07236	validation_1-mlogloss:1.09852
[99]	validation_0-mlogloss:1.04409	validation_1-mlogloss:1.09376


  "because it will generate extra copies and increase " +


Seed-56 | Fold-2 | OOF Score: 1.0937546167128964
[0]	validation_0-mlogloss:1.35421	validation_1-mlogloss:1.35462
[50]	validation_0-mlogloss:1.07252	validation_1-mlogloss:1.09938
[99]	validation_0-mlogloss:1.04390	validation_1-mlogloss:1.09533


  "because it will generate extra copies and increase " +


Seed-56 | Fold-3 | OOF Score: 1.0954785447036794
[0]	validation_0-mlogloss:1.35427	validation_1-mlogloss:1.35479
[50]	validation_0-mlogloss:1.07303	validation_1-mlogloss:1.09528
[99]	validation_0-mlogloss:1.04474	validation_1-mlogloss:1.09015


  "because it will generate extra copies and increase " +


Seed-56 | Fold-4 | OOF Score: 1.0902587033679978
[0]	validation_0-mlogloss:1.35428	validation_1-mlogloss:1.35472
[50]	validation_0-mlogloss:1.07267	validation_1-mlogloss:1.09619
[99]	validation_0-mlogloss:1.04427	validation_1-mlogloss:1.09101


  "because it will generate extra copies and increase " +


Seed-56 | Fold-5 | OOF Score: 1.0910669801811919
[0]	validation_0-mlogloss:1.35424	validation_1-mlogloss:1.35485
[50]	validation_0-mlogloss:1.07224	validation_1-mlogloss:1.09928
[99]	validation_0-mlogloss:1.04361	validation_1-mlogloss:1.09457


  "because it will generate extra copies and increase " +


Seed-56 | Fold-6 | OOF Score: 1.0946208783719924
[0]	validation_0-mlogloss:1.35419	validation_1-mlogloss:1.35484
[50]	validation_0-mlogloss:1.07245	validation_1-mlogloss:1.09981
[99]	validation_0-mlogloss:1.04402	validation_1-mlogloss:1.09432


  "because it will generate extra copies and increase " +


Seed-56 | Fold-7 | OOF Score: 1.0943389854448333
[0]	validation_0-mlogloss:1.35420	validation_1-mlogloss:1.35459
[50]	validation_0-mlogloss:1.07245	validation_1-mlogloss:1.09775
[99]	validation_0-mlogloss:1.04388	validation_1-mlogloss:1.09333


  "because it will generate extra copies and increase " +


Seed-56 | Fold-8 | OOF Score: 1.0933908122731406
[0]	validation_0-mlogloss:1.35430	validation_1-mlogloss:1.35489
[50]	validation_0-mlogloss:1.07227	validation_1-mlogloss:1.10033
[99]	validation_0-mlogloss:1.04411	validation_1-mlogloss:1.09582


  "because it will generate extra copies and increase " +


Seed-56 | Fold-9 | OOF Score: 1.095913453893957

Seed: 56 | Aggregate OOF Score: 1.0936853766543133


Aggregate OOF Score: 1.0939387746984621


In [5]:
np.savez_compressed('./XGB_Meta_Features.npz',
                    y_pred_meta_xgb=y_pred_meta_xgb, 
                    oof_score=oof_score,
                    y_pred_final_xgb=y_pred_final_xgb)

## Create submission file

In [6]:
y_pred_final_xgb = np.clip(y_pred_final_xgb, p_min, p_max)

test_df = pd.read_csv("../input/tabular-playground-series-may-2021/test.csv")
submit_df = pd.DataFrame()
submit_df['id'] = test_df['id']
submit_df['Class_1'] = y_pred_final_xgb[:,0]
submit_df['Class_2'] = y_pred_final_xgb[:,1]
submit_df['Class_3'] = y_pred_final_xgb[:,2]
submit_df['Class_4'] = y_pred_final_xgb[:,3]
submit_df.head()

Unnamed: 0,id,Class_1,Class_2,Class_3,Class_4
0,100000,0.09244,0.594898,0.196043,0.116619
1,100001,0.075105,0.662869,0.174878,0.087149
2,100002,0.086936,0.629473,0.181745,0.101846
3,100003,0.080011,0.541024,0.259991,0.118974
4,100004,0.074768,0.612501,0.205877,0.106854


In [7]:
submit_df.to_csv("./XGB_submission.csv", index=False)