## Import libraries

In [1]:
import gc
import pickle
import numpy as np
import pandas as pd
from tqdm import tqdm
from xgboost import XGBClassifier
from sklearn.metrics import log_loss
from sklearn.model_selection import StratifiedKFold

## Prepare data for model training

In [2]:
with open("../input/tps-may-data-preprocess-v3/TPS_May_Dataset_w_Org.txt", 'rb') as handle: 
    data = handle.read()

processed_data = pickle.loads(data)
train_df = processed_data['train_df']
test_df = processed_data['test_df']

del processed_data
gc.collect()

20

In [3]:
cat_cols = []
for col in tqdm(train_df.iloc[:,0:50].columns):
    train_ = train_df[col].unique().tolist()
    test_ = test_df[col].unique().tolist()
    
    train_.sort()
    test_.sort()
    
    if train_ == test_:
        cat_cols.append(col)

print(len(cat_cols))
cat_cols

100%|██████████| 50/50 [00:00<00:00, 1164.26it/s]

37





['feature_0',
 'feature_2',
 'feature_3',
 'feature_4',
 'feature_5',
 'feature_6',
 'feature_7',
 'feature_9',
 'feature_10',
 'feature_11',
 'feature_12',
 'feature_13',
 'feature_15',
 'feature_16',
 'feature_17',
 'feature_18',
 'feature_20',
 'feature_21',
 'feature_22',
 'feature_23',
 'feature_25',
 'feature_26',
 'feature_27',
 'feature_28',
 'feature_29',
 'feature_32',
 'feature_33',
 'feature_34',
 'feature_35',
 'feature_36',
 'feature_38',
 'feature_39',
 'feature_42',
 'feature_43',
 'feature_44',
 'feature_46',
 'feature_49']

In [4]:
train_df[cat_cols] = train_df[cat_cols].astype(int)
test_df[cat_cols] = test_df[cat_cols].astype(int)

for col in tqdm(cat_cols):
    dummy_val = pd.get_dummies(train_df[col], prefix='col')
    train_df = pd.concat([train_df, dummy_val], axis=1)
    train_df.drop([col], inplace=True, axis=1)

print("train_df: {}".format(train_df.shape))

for col in tqdm(cat_cols):
    dummy_val = pd.get_dummies(test_df[col], prefix='col')
    test_df = pd.concat([test_df, dummy_val], axis=1)
    test_df.drop([col], inplace=True, axis=1)

print("test_df: {}".format(test_df.shape))

100%|██████████| 37/37 [00:13<00:00,  2.77it/s]
  3%|▎         | 1/37 [00:00<00:05,  6.33it/s]

train_df: (99918, 1886)


100%|██████████| 37/37 [00:06<00:00,  5.52it/s]

test_df: (50000, 1885)





In [5]:
Xtrain = train_df.loc[:, train_df.columns != 'target'].values
Ytrain = train_df['target'].values
Ytrain_oh = pd.get_dummies(train_df['target']).values
Xtest = test_df.values

print("Xtrain: {} \nYtrain: {} \nYtrain_oh: {} \nXtest: {}".format(Xtrain.shape, Ytrain.shape, 
                                                                   Ytrain_oh.shape, Xtest.shape))

del train_df
del test_df
gc.collect()

Xtrain: (99918, 1885) 
Ytrain: (99918,) 
Ytrain_oh: (99918, 4) 
Xtest: (50000, 1885)


100

## Build and validate the model

In [6]:
FOLD = 10
NUM_SEED = 3

# Prediction Clipping Thresholds
p_min = 0.025
p_max = 1 - p_min

np.random.seed(2021)
seeds = np.random.randint(0, 100, size=NUM_SEED)

oof_score = 0
y_pred_meta_xgb = np.zeros((Ytrain.shape[0], 4))
y_pred_final_xgb = np.zeros((Xtest.shape[0], 4))
counter = 0


for sidx, seed in enumerate(seeds):
    seed_score = 0
    
    kfold = StratifiedKFold(n_splits=FOLD, shuffle=True, random_state=seed)

    for idx, (train, val) in enumerate(kfold.split(Xtrain, Ytrain)):
        counter += 1

        train_x, train_y, train_y_oh = Xtrain[train], Ytrain[train], Ytrain_oh[train]
        val_x, val_y, val_y_oh = Xtrain[val], Ytrain[val], Ytrain_oh[val]
        
        model = XGBClassifier(
            objective='multi:softmax',
            eval_metric='mlogloss',
            booster='gbtree',
            sample_type='weighted',
            tree_method='gpu_hist',
            grow_policy='lossguide',
            use_label_encoder=False,
            num_round=5000,
            num_class=4,
            #max_depth=10, 
            max_leaves=53,
            learning_rate=0.0982,
            subsample=0.76,
            colsample_bytree=0.52,
            min_child_weight=10,
            reg_lambda=0.001515,
            verbosity=0
        )
        
        model.fit(train_x, train_y, eval_set=[(train_x, train_y), (val_x, val_y)], 
                  early_stopping_rounds=200, verbose=50)

        y_pred = model.predict_proba(val_x, iteration_range=(0, model.best_iteration))
        y_pred = np.clip(y_pred, p_min, p_max)
        y_pred_meta_xgb[val] += y_pred
        y_pred_final_xgb += model.predict_proba(Xtest, iteration_range=(0, model.best_iteration))
        
        score = log_loss(val_y_oh, y_pred)
        oof_score += score
        seed_score += score
        print("Seed-{} | Fold-{} | OOF Score: {}".format(seed, idx, score))
    
    print("\nSeed: {} | Aggregate OOF Score: {}\n\n".format(seed, (seed_score / FOLD)))

y_pred_meta_xgb = y_pred_meta_xgb / float(NUM_SEED)
y_pred_final_xgb = y_pred_final_xgb / float(counter)
oof_score /= float(counter)
print("Aggregate OOF Score: {}".format(oof_score))

[0]	validation_0-mlogloss:1.34610	validation_1-mlogloss:1.34713
[50]	validation_0-mlogloss:1.05478	validation_1-mlogloss:1.09714
[99]	validation_0-mlogloss:1.02020	validation_1-mlogloss:1.09469


  "because it will generate extra copies and increase " +


Seed-85 | Fold-0 | OOF Score: 1.0945829360442365
[0]	validation_0-mlogloss:1.34604	validation_1-mlogloss:1.34690
[50]	validation_0-mlogloss:1.05521	validation_1-mlogloss:1.09602
[99]	validation_0-mlogloss:1.02177	validation_1-mlogloss:1.09367


  "because it will generate extra copies and increase " +


Seed-85 | Fold-1 | OOF Score: 1.0935679915531862
[0]	validation_0-mlogloss:1.34591	validation_1-mlogloss:1.34699
[50]	validation_0-mlogloss:1.05454	validation_1-mlogloss:1.09676
[99]	validation_0-mlogloss:1.02059	validation_1-mlogloss:1.09423


  "because it will generate extra copies and increase " +


Seed-85 | Fold-2 | OOF Score: 1.0941906668487384
[0]	validation_0-mlogloss:1.34621	validation_1-mlogloss:1.34685
[50]	validation_0-mlogloss:1.05502	validation_1-mlogloss:1.09598
[99]	validation_0-mlogloss:1.02036	validation_1-mlogloss:1.09360


  "because it will generate extra copies and increase " +


Seed-85 | Fold-3 | OOF Score: 1.0935670621215916
[0]	validation_0-mlogloss:1.34614	validation_1-mlogloss:1.34710
[50]	validation_0-mlogloss:1.05567	validation_1-mlogloss:1.09779
[99]	validation_0-mlogloss:1.02056	validation_1-mlogloss:1.09519


  "because it will generate extra copies and increase " +


Seed-85 | Fold-4 | OOF Score: 1.0952761650124223
[0]	validation_0-mlogloss:1.34592	validation_1-mlogloss:1.34707
[50]	validation_0-mlogloss:1.05372	validation_1-mlogloss:1.09815
[99]	validation_0-mlogloss:1.01851	validation_1-mlogloss:1.09649


  "because it will generate extra copies and increase " +


Seed-85 | Fold-5 | OOF Score: 1.096515994384032
[0]	validation_0-mlogloss:1.34594	validation_1-mlogloss:1.34705
[50]	validation_0-mlogloss:1.05544	validation_1-mlogloss:1.09617
[99]	validation_0-mlogloss:1.02023	validation_1-mlogloss:1.09284


  "because it will generate extra copies and increase " +


Seed-85 | Fold-6 | OOF Score: 1.0928971964606435
[0]	validation_0-mlogloss:1.34609	validation_1-mlogloss:1.34706
[50]	validation_0-mlogloss:1.05532	validation_1-mlogloss:1.09872
[99]	validation_0-mlogloss:1.02114	validation_1-mlogloss:1.09699


  "because it will generate extra copies and increase " +


Seed-85 | Fold-7 | OOF Score: 1.0968727077871228
[0]	validation_0-mlogloss:1.34604	validation_1-mlogloss:1.34726
[50]	validation_0-mlogloss:1.05487	validation_1-mlogloss:1.09869
[99]	validation_0-mlogloss:1.02022	validation_1-mlogloss:1.09671


  "because it will generate extra copies and increase " +


Seed-85 | Fold-8 | OOF Score: 1.0967134827384957
[0]	validation_0-mlogloss:1.34604	validation_1-mlogloss:1.34694
[50]	validation_0-mlogloss:1.05478	validation_1-mlogloss:1.09757
[99]	validation_0-mlogloss:1.02065	validation_1-mlogloss:1.09564


  "because it will generate extra copies and increase " +


Seed-85 | Fold-9 | OOF Score: 1.095483194424779

Seed: 85 | Aggregate OOF Score: 1.0949667397375247


[0]	validation_0-mlogloss:1.34603	validation_1-mlogloss:1.34695
[50]	validation_0-mlogloss:1.05457	validation_1-mlogloss:1.09627
[99]	validation_0-mlogloss:1.01915	validation_1-mlogloss:1.09408


  "because it will generate extra copies and increase " +


Seed-57 | Fold-0 | OOF Score: 1.0938765167611375
[0]	validation_0-mlogloss:1.34608	validation_1-mlogloss:1.34690
[50]	validation_0-mlogloss:1.05499	validation_1-mlogloss:1.09582
[99]	validation_0-mlogloss:1.02008	validation_1-mlogloss:1.09327


  "because it will generate extra copies and increase " +


Seed-57 | Fold-1 | OOF Score: 1.093342297745524
[0]	validation_0-mlogloss:1.34606	validation_1-mlogloss:1.34680
[50]	validation_0-mlogloss:1.05554	validation_1-mlogloss:1.09648
[99]	validation_0-mlogloss:1.02173	validation_1-mlogloss:1.09450


  "because it will generate extra copies and increase " +


Seed-57 | Fold-2 | OOF Score: 1.0945320294616674
[0]	validation_0-mlogloss:1.34602	validation_1-mlogloss:1.34686
[50]	validation_0-mlogloss:1.05522	validation_1-mlogloss:1.09643
[99]	validation_0-mlogloss:1.02069	validation_1-mlogloss:1.09477


  "because it will generate extra copies and increase " +


Seed-57 | Fold-3 | OOF Score: 1.0946821560374826
[0]	validation_0-mlogloss:1.34627	validation_1-mlogloss:1.34718
[50]	validation_0-mlogloss:1.05442	validation_1-mlogloss:1.09940
[99]	validation_0-mlogloss:1.02012	validation_1-mlogloss:1.09720


  "because it will generate extra copies and increase " +


Seed-57 | Fold-4 | OOF Score: 1.0972951247145812
[0]	validation_0-mlogloss:1.34612	validation_1-mlogloss:1.34720
[50]	validation_0-mlogloss:1.05399	validation_1-mlogloss:1.09924
[99]	validation_0-mlogloss:1.01886	validation_1-mlogloss:1.09673


  "because it will generate extra copies and increase " +


Seed-57 | Fold-5 | OOF Score: 1.0968257629806943
[0]	validation_0-mlogloss:1.34596	validation_1-mlogloss:1.34682
[50]	validation_0-mlogloss:1.05493	validation_1-mlogloss:1.09580
[99]	validation_0-mlogloss:1.02072	validation_1-mlogloss:1.09358


  "because it will generate extra copies and increase " +


Seed-57 | Fold-6 | OOF Score: 1.093567426772035
[0]	validation_0-mlogloss:1.34614	validation_1-mlogloss:1.34717
[50]	validation_0-mlogloss:1.05468	validation_1-mlogloss:1.09797
[99]	validation_0-mlogloss:1.01994	validation_1-mlogloss:1.09554


  "because it will generate extra copies and increase " +


Seed-57 | Fold-7 | OOF Score: 1.0955063883137641
[0]	validation_0-mlogloss:1.34598	validation_1-mlogloss:1.34716
[50]	validation_0-mlogloss:1.05505	validation_1-mlogloss:1.09925
[99]	validation_0-mlogloss:1.02011	validation_1-mlogloss:1.09759


  "because it will generate extra copies and increase " +


Seed-57 | Fold-8 | OOF Score: 1.0974270086748537
[0]	validation_0-mlogloss:1.34613	validation_1-mlogloss:1.34708
[50]	validation_0-mlogloss:1.05447	validation_1-mlogloss:1.09826
[99]	validation_0-mlogloss:1.02053	validation_1-mlogloss:1.09699


  "because it will generate extra copies and increase " +


Seed-57 | Fold-9 | OOF Score: 1.096851567539674

Seed: 57 | Aggregate OOF Score: 1.0953906279001415


[0]	validation_0-mlogloss:1.34610	validation_1-mlogloss:1.34714
[50]	validation_0-mlogloss:1.05380	validation_1-mlogloss:1.10015
[99]	validation_0-mlogloss:1.01981	validation_1-mlogloss:1.09740


  "because it will generate extra copies and increase " +


Seed-0 | Fold-0 | OOF Score: 1.0974549234295197
[0]	validation_0-mlogloss:1.34618	validation_1-mlogloss:1.34726
[50]	validation_0-mlogloss:1.05531	validation_1-mlogloss:1.09613
[99]	validation_0-mlogloss:1.02006	validation_1-mlogloss:1.09342


  "because it will generate extra copies and increase " +


Seed-0 | Fold-1 | OOF Score: 1.0932624386586218
[0]	validation_0-mlogloss:1.34593	validation_1-mlogloss:1.34711
[50]	validation_0-mlogloss:1.05421	validation_1-mlogloss:1.09825
[99]	validation_0-mlogloss:1.01971	validation_1-mlogloss:1.09621


  "because it will generate extra copies and increase " +


Seed-0 | Fold-2 | OOF Score: 1.096077731049197
[0]	validation_0-mlogloss:1.34589	validation_1-mlogloss:1.34674
[50]	validation_0-mlogloss:1.05430	validation_1-mlogloss:1.09761
[99]	validation_0-mlogloss:1.01940	validation_1-mlogloss:1.09534


  "because it will generate extra copies and increase " +


Seed-0 | Fold-3 | OOF Score: 1.0953638343836924
[0]	validation_0-mlogloss:1.34608	validation_1-mlogloss:1.34663
[50]	validation_0-mlogloss:1.05503	validation_1-mlogloss:1.09341
[99]	validation_0-mlogloss:1.02129	validation_1-mlogloss:1.09041


  "because it will generate extra copies and increase " +


Seed-0 | Fold-4 | OOF Score: 1.0903701833244248
[0]	validation_0-mlogloss:1.34588	validation_1-mlogloss:1.34701
[50]	validation_0-mlogloss:1.05519	validation_1-mlogloss:1.09708
[99]	validation_0-mlogloss:1.02095	validation_1-mlogloss:1.09443


  "because it will generate extra copies and increase " +


Seed-0 | Fold-5 | OOF Score: 1.0943651121852396
[0]	validation_0-mlogloss:1.34602	validation_1-mlogloss:1.34702
[50]	validation_0-mlogloss:1.05483	validation_1-mlogloss:1.09778
[99]	validation_0-mlogloss:1.02040	validation_1-mlogloss:1.09536


  "because it will generate extra copies and increase " +


Seed-0 | Fold-6 | OOF Score: 1.0953845849635007
[0]	validation_0-mlogloss:1.34611	validation_1-mlogloss:1.34693
[50]	validation_0-mlogloss:1.05492	validation_1-mlogloss:1.09743
[99]	validation_0-mlogloss:1.02085	validation_1-mlogloss:1.09539


  "because it will generate extra copies and increase " +


Seed-0 | Fold-7 | OOF Score: 1.095336206188743
[0]	validation_0-mlogloss:1.34600	validation_1-mlogloss:1.34691
[50]	validation_0-mlogloss:1.05409	validation_1-mlogloss:1.09563
[99]	validation_0-mlogloss:1.01965	validation_1-mlogloss:1.09355


  "because it will generate extra copies and increase " +


Seed-0 | Fold-8 | OOF Score: 1.0936177039501986
[0]	validation_0-mlogloss:1.34596	validation_1-mlogloss:1.34702
[50]	validation_0-mlogloss:1.05472	validation_1-mlogloss:1.09943
[99]	validation_0-mlogloss:1.01979	validation_1-mlogloss:1.09675


  "because it will generate extra copies and increase " +


Seed-0 | Fold-9 | OOF Score: 1.0967506439570562

Seed: 0 | Aggregate OOF Score: 1.0947983362090192


Aggregate OOF Score: 1.0950519012822286


In [7]:
del model
gc.collect()

769

In [8]:
np.savez_compressed('./XGB_Meta_Features.npz',
                    y_pred_meta_xgb=y_pred_meta_xgb, 
                    oof_score=oof_score,
                    y_pred_final_xgb=y_pred_final_xgb)

## Create submission file

In [9]:
y_pred_final_xgb = np.clip(y_pred_final_xgb, p_min, p_max)

test_df = pd.read_csv("../input/tabular-playground-series-may-2021/test.csv")
submit_df = pd.DataFrame()
submit_df['id'] = test_df['id']
submit_df['Class_1'] = y_pred_final_xgb[:,0]
submit_df['Class_2'] = y_pred_final_xgb[:,1]
submit_df['Class_3'] = y_pred_final_xgb[:,2]
submit_df['Class_4'] = y_pred_final_xgb[:,3]
submit_df.head()

Unnamed: 0,id,Class_1,Class_2,Class_3,Class_4
0,100000,0.084123,0.603899,0.188721,0.123256
1,100001,0.07417,0.661333,0.167542,0.096955
2,100002,0.085404,0.639757,0.179414,0.095425
3,100003,0.078688,0.47713,0.341408,0.102774
4,100004,0.074873,0.626725,0.203206,0.095197


In [10]:
submit_df.to_csv("./XGB_submission.csv", index=False)