## Import libraries

In [1]:
import gc
import pickle
import numpy as np
import pandas as pd
from tqdm import tqdm
from xgboost import XGBClassifier
from sklearn.metrics import log_loss
from sklearn.model_selection import StratifiedKFold

## Prepare data for model training

In [2]:
with open("../input/tps-may-data-preprocess-v5/TPS_May_Dataset.txt", 'rb') as handle: 
    data = handle.read()

processed_data = pickle.loads(data)
train_df = processed_data['train_df']
test_df = processed_data['test_df']

del processed_data
gc.collect()

20

In [3]:
cat_cols = []
for col in tqdm(train_df.iloc[:,0:50].columns):
    train_ = train_df[col].unique().tolist()
    test_ = test_df[col].unique().tolist()
    
    train_.sort()
    test_.sort()
    
    if train_ == test_:
        cat_cols.append(col)

print(len(cat_cols))
cat_cols

100%|██████████| 50/50 [00:00<00:00, 1104.68it/s]

37





['feature_0',
 'feature_2',
 'feature_3',
 'feature_4',
 'feature_5',
 'feature_6',
 'feature_7',
 'feature_9',
 'feature_10',
 'feature_11',
 'feature_12',
 'feature_13',
 'feature_15',
 'feature_16',
 'feature_17',
 'feature_18',
 'feature_20',
 'feature_21',
 'feature_22',
 'feature_23',
 'feature_25',
 'feature_26',
 'feature_27',
 'feature_28',
 'feature_29',
 'feature_32',
 'feature_33',
 'feature_34',
 'feature_35',
 'feature_36',
 'feature_38',
 'feature_39',
 'feature_42',
 'feature_43',
 'feature_44',
 'feature_46',
 'feature_49']

In [4]:
train_df[cat_cols] = train_df[cat_cols].astype(int)
test_df[cat_cols] = test_df[cat_cols].astype(int)

for col in tqdm(cat_cols):
    dummy_val = pd.get_dummies(train_df[col], prefix='col')
    train_df = pd.concat([train_df, dummy_val], axis=1)
    train_df.drop([col], inplace=True, axis=1)

print("train_df: {}".format(train_df.shape))

for col in tqdm(cat_cols):
    dummy_val = pd.get_dummies(test_df[col], prefix='col')
    test_df = pd.concat([test_df, dummy_val], axis=1)
    test_df.drop([col], inplace=True, axis=1)

print("test_df: {}".format(test_df.shape))

100%|██████████| 37/37 [00:05<00:00,  7.25it/s]
 14%|█▎        | 5/37 [00:00<00:00, 48.12it/s]

train_df: (99918, 912)


100%|██████████| 37/37 [00:01<00:00, 22.07it/s]

test_df: (50000, 911)





In [5]:
Xtrain = train_df.loc[:, train_df.columns != 'target'].values
Ytrain = train_df['target'].values
Ytrain_oh = pd.get_dummies(train_df['target']).values
Xtest = test_df.values

print("Xtrain: {} \nYtrain: {} \nYtrain_oh: {} \nXtest: {}".format(Xtrain.shape, Ytrain.shape, 
                                                                   Ytrain_oh.shape, Xtest.shape))

del train_df
del test_df
gc.collect()

Xtrain: (99918, 911) 
Ytrain: (99918,) 
Ytrain_oh: (99918, 4) 
Xtest: (50000, 911)


100

## Build and validate the model

In [6]:
FOLD = 10
NUM_SEED = 3

# Prediction Clipping Thresholds
p_min = 0.025
p_max = 1 - p_min

np.random.seed(3)
seeds = np.random.randint(0, 100, size=NUM_SEED)

oof_score = 0
y_pred_meta_xgb = np.zeros((Ytrain.shape[0], 4))
y_pred_final_xgb = np.zeros((Xtest.shape[0], 4))
counter = 0


for sidx, seed in enumerate(seeds):
    seed_score = 0
    
    kfold = StratifiedKFold(n_splits=FOLD, shuffle=True, random_state=seed)

    for idx, (train, val) in enumerate(kfold.split(Xtrain, Ytrain)):
        counter += 1

        train_x, train_y, train_y_oh = Xtrain[train], Ytrain[train], Ytrain_oh[train]
        val_x, val_y, val_y_oh = Xtrain[val], Ytrain[val], Ytrain_oh[val]
        
        model = XGBClassifier(
            objective='multi:softmax',
            eval_metric='mlogloss',
            booster='gbtree',
            sample_type='weighted',
            tree_method='gpu_hist',
            grow_policy='lossguide',
            use_label_encoder=False,
            num_round=5000,
            num_class=4,
            #max_depth=10, 
            max_leaves=85,
            learning_rate=0.0982,
            subsample=0.876,
            colsample_bytree=0.52,
            min_child_weight=12,
            reg_lambda=0.001515,
            verbosity=0
        )

        model.fit(train_x, train_y, eval_set=[(train_x, train_y), (val_x, val_y)], 
                  early_stopping_rounds=200, verbose=50)

        y_pred = model.predict_proba(val_x, iteration_range=(0, model.best_iteration))
        y_pred = np.clip(y_pred, p_min, p_max)
        y_pred_meta_xgb[val] += y_pred
        y_pred_final_xgb += model.predict_proba(Xtest, iteration_range=(0, model.best_iteration))
        
        score = log_loss(val_y_oh, y_pred)
        oof_score += score
        seed_score += score
        print("Seed-{} | Fold-{} | OOF Score: {}".format(seed, idx, score))
    
    print("\nSeed: {} | Aggregate OOF Score: {}\n\n".format(seed, (seed_score / FOLD)))


y_pred_meta_xgb = y_pred_meta_xgb / float(NUM_SEED)
y_pred_final_xgb = y_pred_final_xgb / float(counter)
oof_score /= float(counter)
print("Aggregate OOF Score: {}".format(oof_score))

[0]	validation_0-mlogloss:1.34652	validation_1-mlogloss:1.34751
[50]	validation_0-mlogloss:1.06251	validation_1-mlogloss:1.09768
[99]	validation_0-mlogloss:1.03284	validation_1-mlogloss:1.09462


  "because it will generate extra copies and increase " +


Seed-24 | Fold-0 | OOF Score: 1.0946671266952832
[0]	validation_0-mlogloss:1.34668	validation_1-mlogloss:1.34728
[50]	validation_0-mlogloss:1.06166	validation_1-mlogloss:1.09633
[99]	validation_0-mlogloss:1.03216	validation_1-mlogloss:1.09274


  "because it will generate extra copies and increase " +


Seed-24 | Fold-1 | OOF Score: 1.0927583239033067
[0]	validation_0-mlogloss:1.34660	validation_1-mlogloss:1.34743
[50]	validation_0-mlogloss:1.06105	validation_1-mlogloss:1.10084
[99]	validation_0-mlogloss:1.03107	validation_1-mlogloss:1.09833


  "because it will generate extra copies and increase " +


Seed-24 | Fold-2 | OOF Score: 1.0983453118954423
[0]	validation_0-mlogloss:1.34655	validation_1-mlogloss:1.34730
[50]	validation_0-mlogloss:1.06220	validation_1-mlogloss:1.09630
[99]	validation_0-mlogloss:1.03295	validation_1-mlogloss:1.09203


  "because it will generate extra copies and increase " +


Seed-24 | Fold-3 | OOF Score: 1.0921150409072948
[0]	validation_0-mlogloss:1.34648	validation_1-mlogloss:1.34732
[50]	validation_0-mlogloss:1.06137	validation_1-mlogloss:1.09888
[99]	validation_0-mlogloss:1.03201	validation_1-mlogloss:1.09505


  "because it will generate extra copies and increase " +


Seed-24 | Fold-4 | OOF Score: 1.0950735873603124
[0]	validation_0-mlogloss:1.34653	validation_1-mlogloss:1.34735
[50]	validation_0-mlogloss:1.06118	validation_1-mlogloss:1.10040
[99]	validation_0-mlogloss:1.03078	validation_1-mlogloss:1.09716


  "because it will generate extra copies and increase " +


Seed-24 | Fold-5 | OOF Score: 1.0970794520601093
[0]	validation_0-mlogloss:1.34658	validation_1-mlogloss:1.34733
[50]	validation_0-mlogloss:1.06203	validation_1-mlogloss:1.09807
[99]	validation_0-mlogloss:1.03192	validation_1-mlogloss:1.09427


  "because it will generate extra copies and increase " +


Seed-24 | Fold-6 | OOF Score: 1.0942858952366776
[0]	validation_0-mlogloss:1.34642	validation_1-mlogloss:1.34728
[50]	validation_0-mlogloss:1.06137	validation_1-mlogloss:1.09814
[99]	validation_0-mlogloss:1.03076	validation_1-mlogloss:1.09420


  "because it will generate extra copies and increase " +


Seed-24 | Fold-7 | OOF Score: 1.094189705692524
[0]	validation_0-mlogloss:1.34662	validation_1-mlogloss:1.34736
[50]	validation_0-mlogloss:1.06170	validation_1-mlogloss:1.09641
[99]	validation_0-mlogloss:1.03237	validation_1-mlogloss:1.09217


  "because it will generate extra copies and increase " +


Seed-24 | Fold-8 | OOF Score: 1.0922105808973241
[0]	validation_0-mlogloss:1.34656	validation_1-mlogloss:1.34741
[50]	validation_0-mlogloss:1.06121	validation_1-mlogloss:1.09848
[99]	validation_0-mlogloss:1.03143	validation_1-mlogloss:1.09597


  "because it will generate extra copies and increase " +


Seed-24 | Fold-9 | OOF Score: 1.0959038913351389

Seed: 24 | Aggregate OOF Score: 1.0946628915983414


[0]	validation_0-mlogloss:1.34648	validation_1-mlogloss:1.34736
[50]	validation_0-mlogloss:1.06151	validation_1-mlogloss:1.09781
[99]	validation_0-mlogloss:1.03126	validation_1-mlogloss:1.09420


  "because it will generate extra copies and increase " +


Seed-3 | Fold-0 | OOF Score: 1.094245157721307
[0]	validation_0-mlogloss:1.34647	validation_1-mlogloss:1.34715
[50]	validation_0-mlogloss:1.06225	validation_1-mlogloss:1.09778
[99]	validation_0-mlogloss:1.03287	validation_1-mlogloss:1.09397


  "because it will generate extra copies and increase " +


Seed-3 | Fold-1 | OOF Score: 1.0939936606814449
[0]	validation_0-mlogloss:1.34657	validation_1-mlogloss:1.34754
[50]	validation_0-mlogloss:1.06193	validation_1-mlogloss:1.09978
[99]	validation_0-mlogloss:1.03216	validation_1-mlogloss:1.09737


  "because it will generate extra copies and increase " +


Seed-3 | Fold-2 | OOF Score: 1.0972660987334455
[0]	validation_0-mlogloss:1.34653	validation_1-mlogloss:1.34766
[50]	validation_0-mlogloss:1.06144	validation_1-mlogloss:1.09825
[99]	validation_0-mlogloss:1.03102	validation_1-mlogloss:1.09515


  "because it will generate extra copies and increase " +


Seed-3 | Fold-3 | OOF Score: 1.0950911553804759
[0]	validation_0-mlogloss:1.34657	validation_1-mlogloss:1.34729
[50]	validation_0-mlogloss:1.06162	validation_1-mlogloss:1.09724
[99]	validation_0-mlogloss:1.03192	validation_1-mlogloss:1.09435


  "because it will generate extra copies and increase " +


Seed-3 | Fold-4 | OOF Score: 1.094436731477312
[0]	validation_0-mlogloss:1.34662	validation_1-mlogloss:1.34741
[50]	validation_0-mlogloss:1.06136	validation_1-mlogloss:1.09943
[99]	validation_0-mlogloss:1.03201	validation_1-mlogloss:1.09706


  "because it will generate extra copies and increase " +


Seed-3 | Fold-5 | OOF Score: 1.0971175464357825
[0]	validation_0-mlogloss:1.34649	validation_1-mlogloss:1.34727
[50]	validation_0-mlogloss:1.06194	validation_1-mlogloss:1.09778
[99]	validation_0-mlogloss:1.03179	validation_1-mlogloss:1.09405


  "because it will generate extra copies and increase " +


Seed-3 | Fold-6 | OOF Score: 1.0941216080569571
[0]	validation_0-mlogloss:1.34650	validation_1-mlogloss:1.34727
[50]	validation_0-mlogloss:1.06167	validation_1-mlogloss:1.09873
[99]	validation_0-mlogloss:1.03275	validation_1-mlogloss:1.09470


  "because it will generate extra copies and increase " +


Seed-3 | Fold-7 | OOF Score: 1.0947534932693044
[0]	validation_0-mlogloss:1.34651	validation_1-mlogloss:1.34757
[50]	validation_0-mlogloss:1.06196	validation_1-mlogloss:1.09780
[99]	validation_0-mlogloss:1.03266	validation_1-mlogloss:1.09390


  "because it will generate extra copies and increase " +


Seed-3 | Fold-8 | OOF Score: 1.093947493668171
[0]	validation_0-mlogloss:1.34643	validation_1-mlogloss:1.34765
[50]	validation_0-mlogloss:1.06072	validation_1-mlogloss:1.09992
[99]	validation_0-mlogloss:1.03099	validation_1-mlogloss:1.09659


  "because it will generate extra copies and increase " +


Seed-3 | Fold-9 | OOF Score: 1.0965810772574063

Seed: 3 | Aggregate OOF Score: 1.0951554022681607


[0]	validation_0-mlogloss:1.34642	validation_1-mlogloss:1.34746
[50]	validation_0-mlogloss:1.06137	validation_1-mlogloss:1.09869
[99]	validation_0-mlogloss:1.03105	validation_1-mlogloss:1.09497


  "because it will generate extra copies and increase " +


Seed-56 | Fold-0 | OOF Score: 1.0950029260986704
[0]	validation_0-mlogloss:1.34652	validation_1-mlogloss:1.34735
[50]	validation_0-mlogloss:1.06128	validation_1-mlogloss:1.09870
[99]	validation_0-mlogloss:1.03150	validation_1-mlogloss:1.09493


  "because it will generate extra copies and increase " +


Seed-56 | Fold-1 | OOF Score: 1.0949288271442115
[0]	validation_0-mlogloss:1.34652	validation_1-mlogloss:1.34732
[50]	validation_0-mlogloss:1.06137	validation_1-mlogloss:1.09756
[99]	validation_0-mlogloss:1.03080	validation_1-mlogloss:1.09452


  "because it will generate extra copies and increase " +


Seed-56 | Fold-2 | OOF Score: 1.0945976614456974
[0]	validation_0-mlogloss:1.34654	validation_1-mlogloss:1.34726
[50]	validation_0-mlogloss:1.06207	validation_1-mlogloss:1.09911
[99]	validation_0-mlogloss:1.03249	validation_1-mlogloss:1.09654


  "because it will generate extra copies and increase " +


Seed-56 | Fold-3 | OOF Score: 1.0965818936665885
[0]	validation_0-mlogloss:1.34653	validation_1-mlogloss:1.34722
[50]	validation_0-mlogloss:1.06212	validation_1-mlogloss:1.09503
[99]	validation_0-mlogloss:1.03215	validation_1-mlogloss:1.09075


  "because it will generate extra copies and increase " +


Seed-56 | Fold-4 | OOF Score: 1.0907617350883656
[0]	validation_0-mlogloss:1.34662	validation_1-mlogloss:1.34735
[50]	validation_0-mlogloss:1.06257	validation_1-mlogloss:1.09534
[99]	validation_0-mlogloss:1.03263	validation_1-mlogloss:1.09094


  "because it will generate extra copies and increase " +


Seed-56 | Fold-5 | OOF Score: 1.0911435635881224
[0]	validation_0-mlogloss:1.34658	validation_1-mlogloss:1.34746
[50]	validation_0-mlogloss:1.06111	validation_1-mlogloss:1.09922
[99]	validation_0-mlogloss:1.03194	validation_1-mlogloss:1.09598


  "because it will generate extra copies and increase " +


Seed-56 | Fold-6 | OOF Score: 1.0960517600745416
[0]	validation_0-mlogloss:1.34655	validation_1-mlogloss:1.34739
[50]	validation_0-mlogloss:1.06165	validation_1-mlogloss:1.09902
[99]	validation_0-mlogloss:1.03218	validation_1-mlogloss:1.09512


  "because it will generate extra copies and increase " +


Seed-56 | Fold-7 | OOF Score: 1.095217237882823
[0]	validation_0-mlogloss:1.34647	validation_1-mlogloss:1.34722
[50]	validation_0-mlogloss:1.06158	validation_1-mlogloss:1.09797
[99]	validation_0-mlogloss:1.03144	validation_1-mlogloss:1.09534


  "because it will generate extra copies and increase " +


Seed-56 | Fold-8 | OOF Score: 1.0953474197912927
[0]	validation_0-mlogloss:1.34654	validation_1-mlogloss:1.34742
[50]	validation_0-mlogloss:1.06107	validation_1-mlogloss:1.09926
[99]	validation_0-mlogloss:1.03168	validation_1-mlogloss:1.09574


  "because it will generate extra copies and increase " +


Seed-56 | Fold-9 | OOF Score: 1.0957699390118

Seed: 56 | Aggregate OOF Score: 1.0945402963792115


Aggregate OOF Score: 1.0947861967485712


In [7]:
del model
gc.collect()

769

In [8]:
np.savez_compressed('./XGB_Meta_Features.npz',
                    y_pred_meta_xgb=y_pred_meta_xgb, 
                    oof_score=oof_score,
                    y_pred_final_xgb=y_pred_final_xgb)

## Create submission file

In [9]:
y_pred_final_xgb = np.clip(y_pred_final_xgb, p_min, p_max)

test_df = pd.read_csv("../input/tabular-playground-series-may-2021/test.csv")
submit_df = pd.DataFrame()
submit_df['id'] = test_df['id']
submit_df['Class_1'] = y_pred_final_xgb[:,0]
submit_df['Class_2'] = y_pred_final_xgb[:,1]
submit_df['Class_3'] = y_pred_final_xgb[:,2]
submit_df['Class_4'] = y_pred_final_xgb[:,3]
submit_df.head()

Unnamed: 0,id,Class_1,Class_2,Class_3,Class_4
0,100000,0.091022,0.57743,0.203782,0.127766
1,100001,0.084916,0.670191,0.155296,0.089597
2,100002,0.086437,0.609066,0.207291,0.097207
3,100003,0.08075,0.529945,0.270914,0.118391
4,100004,0.074147,0.638333,0.183757,0.103762


In [10]:
submit_df.to_csv("./XGB_submission.csv", index=False)