## Import libraries

In [1]:
import gc
import pickle
import numpy as np
import pandas as pd
from sklearn.metrics import log_loss
from catboost import CatBoostClassifier
from sklearn.model_selection import StratifiedKFold

## Prepare data for model training

In [2]:
# Load the pickled dictionary that holds the train and test frames.
# NOTE(security): unpickling can execute arbitrary code; only load files from a trusted source.
with open("../input/tps-may-data-preprocess-v3/TPS_May_Dataset_w_Org.txt", 'rb') as handle:
    # Unpickle straight from the file handle. Reading the file into a bytes object first and
    # calling pickle.loads() would leave that raw buffer bound to a global for the rest of the
    # session, defeating the explicit memory clean-up below.
    processed_data = pickle.load(handle)

train_df = processed_data['train_df']
test_df = processed_data['test_df']

# Drop the container dict; the two frames stay alive through their own names.
del processed_data
gc.collect()

0

In [3]:
# The first 50 columns are handled as categorical features: cast them to int in both frames.
cat_cols = train_df.columns[:50]
train_df[cat_cols], test_df[cat_cols] = (
    train_df[cat_cols].astype(int),
    test_df[cat_cols].astype(int),
)

# Positional indices of those columns, later handed to CatBoost as `cat_features`.
cat_cols_indices = list(map(train_df.columns.get_loc, cat_cols))
print(cat_cols_indices)

train_df[cat_cols].head()

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49]


Unnamed: 0_level_0,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,...,feature_40,feature_41,feature_42,feature_43,feature_44,feature_45,feature_46,feature_47,feature_48,feature_49
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,0,1,0,1,0,0,0,0,0,...,3,0,0,21,0,0,0,0,0,0
1,0,0,0,0,2,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,2,...,0,0,1,0,0,0,0,13,2,0
3,0,0,0,0,0,0,0,3,0,0,...,0,0,0,0,0,0,0,0,1,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [4]:
# Every column except 'target' is a feature; the label is kept both raw and one-hot encoded.
is_feature = train_df.columns != 'target'
Xtrain = train_df.loc[:, is_feature].copy()
Ytrain = train_df['target'].copy()
Ytrain_oh = pd.get_dummies(train_df['target']).copy()
Xtest = test_df.copy()

shapes = (Xtrain.shape, Ytrain.shape, Ytrain_oh.shape, Xtest.shape)
print("Xtrain: {} \nYtrain: {} \nYtrain_oh: {} \nXtest: {}".format(*shapes))

# The copies above are independent, so the source frames can be released.
del train_df, test_df
gc.collect()

Xtrain: (99918, 1074) 
Ytrain: (99918,) 
Ytrain_oh: (99918, 4) 
Xtest: (50000, 1074)


20

## Build and validate the model

In [5]:
# Cross-validation layout: NUM_SEED repetitions of a FOLD-way stratified split.
FOLD = 10
NUM_SEED = 3

# Early-stopping patience and logging period, defined once and passed to fit().
# fit(early_stopping_rounds=...) switches the overfitting detector to type 'Iter' with this
# wait, and fit(verbose=...) sets the logging period, so no od_type / od_wait / verbose is
# given to the constructor: values set there would be overridden and only mislead the reader.
EARLY_STOPPING_ROUNDS = 200
LOG_EVERY = 200

# Prediction Clipping Thresholds
p_min = 0.025
p_max = 1 - p_min

# Draw the per-repetition seeds reproducibly.
np.random.seed(2021)
seeds = np.random.randint(0, 100, size=NUM_SEED)

# Accumulators: summed per-fold log-loss, out-of-fold predictions for the training rows and
# summed test predictions (4 probability columns, one per class).
oof_score = 0
y_pred_meta_cb = np.zeros((Ytrain.shape[0], 4))
y_pred_final_cb = np.zeros((Xtest.shape[0], 4))
counter = 0  # number of models trained so far (ends at NUM_SEED * FOLD)


for seed in seeds:
    seed_score = 0

    kfold = StratifiedKFold(n_splits=FOLD, shuffle=True, random_state=seed)

    # split() only uses the number of rows of X and the labels for stratification, so the frame
    # and Series are passed as-is; going through `.values` may materialise a dense copy of the
    # whole feature matrix for every seed without changing the resulting folds.
    for idx, (train, val) in enumerate(kfold.split(Xtrain, Ytrain)):
        counter += 1

        train_x, train_y = Xtrain.iloc[train], Ytrain.iloc[train]
        val_x, val_y, val_y_oh = Xtrain.iloc[val], Ytrain.iloc[val], Ytrain_oh.iloc[val]

        # NOTE(review): `seed` only drives the fold split; the model's own random_seed is left
        # at its default. Confirm whether the model should be seeded per repetition as well.
        model = CatBoostClassifier(
            objective='MultiClass',
            eval_metric='MultiClass',
            num_boost_round=5000,
            max_ctr_complexity=15,
            use_best_model=True,
            bootstrap_type='Poisson',
            learning_rate=0.01465,
            reg_lambda=0.01864,
            subsample=0.68135,
            max_depth=6,
            min_data_in_leaf=1,
            task_type='GPU'
        )

        model.fit(train_x, train_y, eval_set=[(val_x, val_y)],
                  cat_features=cat_cols_indices,
                  early_stopping_rounds=EARLY_STOPPING_ROUNDS, verbose=LOG_EVERY)

        # Validation predictions are clipped before being stored and scored;
        # test predictions are accumulated unclipped.
        y_pred = model.predict_proba(val_x)
        y_pred = np.clip(y_pred, p_min, p_max)
        y_pred_meta_cb[val] += y_pred
        y_pred_final_cb += model.predict_proba(Xtest)

        score = log_loss(val_y_oh, y_pred)
        oof_score += score
        seed_score += score
        print("Seed-{} | Fold-{} | OOF Score: {}".format(seed, idx, score))

    print("\nSeed: {} | Aggregate OOF Score: {}\n\n".format(seed, (seed_score / FOLD)))


# Each training row is in the validation fold exactly once per seed, so the out-of-fold sums
# are averaged over seeds; test predictions and the score are averaged over all models.
y_pred_meta_cb = y_pred_meta_cb / float(NUM_SEED)
y_pred_final_cb = y_pred_final_cb / float(counter)
oof_score /= float(counter)
print("Aggregate OOF Score: {}".format(oof_score))

0:	learn: 1.3775415	test: 1.3775481	best: 1.3775481 (0)	total: 22.7ms	remaining: 1m 53s
200:	learn: 1.0988614	test: 1.1048047	best: 1.1048047 (200)	total: 3.23s	remaining: 1m 17s
400:	learn: 1.0864491	test: 1.0985647	best: 1.0985647 (400)	total: 7.57s	remaining: 1m 26s
600:	learn: 1.0781897	test: 1.0963146	best: 1.0963146 (600)	total: 10.7s	remaining: 1m 18s
800:	learn: 1.0712271	test: 1.0948577	best: 1.0948388 (797)	total: 13.8s	remaining: 1m 12s
1000:	learn: 1.0647348	test: 1.0941046	best: 1.0941046 (1000)	total: 17.2s	remaining: 1m 8s
1200:	learn: 1.0583842	test: 1.0936840	best: 1.0936713 (1186)	total: 20.3s	remaining: 1m 4s
1400:	learn: 1.0524262	test: 1.0934592	best: 1.0934443 (1387)	total: 23.4s	remaining: 1m
1600:	learn: 1.0465730	test: 1.0932272	best: 1.0932199 (1597)	total: 26.8s	remaining: 56.9s
1800:	learn: 1.0409114	test: 1.0931833	best: 1.0931252 (1733)	total: 29.8s	remaining: 53s
2000:	learn: 1.0352126	test: 1.0932146	best: 1.0931071 (1876)	total: 32.9s	remaining: 49.3s
b

In [6]:
# Persist the out-of-fold predictions, the aggregate score and the averaged test predictions
# in a single compressed archive.
meta_payload = {
    'y_pred_meta_cb': y_pred_meta_cb,
    'oof_score': oof_score,
    'y_pred_final_cb': y_pred_final_cb,
}
np.savez_compressed('./CB_Meta_Features.npz', **meta_payload)

## Create submission file

In [7]:
# Clip the averaged test probabilities into [p_min, p_max].
y_pred_final_cb = np.clip(y_pred_final_cb, p_min, p_max)

# The competition test file supplies the row ids for the submission.
test_df = pd.read_csv("../input/tabular-playground-series-may-2021/test.csv")

# Start from the id column, then attach one probability column per class.
submit_df = pd.DataFrame({'id': test_df['id']})
for col_idx, class_col in enumerate(['Class_1', 'Class_2', 'Class_3', 'Class_4']):
    submit_df[class_col] = y_pred_final_cb[:, col_idx]
submit_df.head()

Unnamed: 0,id,Class_1,Class_2,Class_3,Class_4
0,100000,0.08762,0.628253,0.170778,0.113349
1,100001,0.082701,0.664,0.154464,0.098836
2,100002,0.079805,0.644553,0.171222,0.104419
3,100003,0.079738,0.497243,0.327634,0.095385
4,100004,0.076621,0.617211,0.196948,0.10922


In [8]:
# index=False keeps the DataFrame's row index out of the written CSV.
submit_df.to_csv(path_or_buf="./CB_submission.csv", index=False)