## Import libraries

In [1]:
import gc
import pickle
import numpy as np
import pandas as pd
from sklearn.metrics import log_loss
from sklearn.model_selection import StratifiedKFold

from sklearn.ensemble import VotingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

## Prepare data

In [2]:
# Load the preprocessed competition data and keep only the training frame.
# NOTE(review): unpickling can execute arbitrary code; only read this file if
# the dataset that produced it is trusted.
with open("../input/tps-may-data-preprocess/TPS_May_Dataset_w_Quantile.txt", 'rb') as handle:
    # Unpickle straight from the file object so the raw byte string is never
    # bound to a notebook-level name that would outlive this cell.
    processed_data = pickle.load(handle)

train_df = processed_data['train_df']

# Only train_df is kept; release the rest of the unpickled payload.
del processed_data
gc.collect()

0

In [3]:
def _load_cnn_meta_features(path):
    """Read the two prediction arrays stored in one CNN blend archive.

    Parameters
    ----------
    path : str
        Location of a ``CNN_Meta_Features.npz`` file.

    Returns
    -------
    tuple
        ``(y_pred_meta_dnn, y_pred_final_dnn)`` exactly as stored in the
        archive.
    """
    # NOTE(review): allow_pickle=True lets NumPy unpickle object arrays, which
    # can execute arbitrary code -- keep it only for trusted archives.
    # The context manager closes the underlying zip file; both arrays are
    # read inside the block, before the handle goes away.
    with np.load(path, allow_pickle=True) as archive:
        return archive['y_pred_meta_dnn'], archive['y_pred_final_dnn']


# One archive per CNN blend version; the unsuffixed names belong to the first.
y_pred_meta_dnn, y_pred_final_dnn = _load_cnn_meta_features(
    '../input/tps-may-cnn-blend/CNN_Meta_Features.npz')
y_pred_meta_dnn2, y_pred_final_dnn2 = _load_cnn_meta_features(
    '../input/tps-may-cnn-blend-v2/CNN_Meta_Features.npz')
y_pred_meta_dnn3, y_pred_final_dnn3 = _load_cnn_meta_features(
    '../input/tps-may-cnn-blend-v3/CNN_Meta_Features.npz')
y_pred_meta_dnn4, y_pred_final_dnn4 = _load_cnn_meta_features(
    '../input/tps-may-cnn-blend-v4/CNN_Meta_Features.npz')
y_pred_meta_dnn5, y_pred_final_dnn5 = _load_cnn_meta_features(
    '../input/tps-may-cnn-blend-v5/CNN_Meta_Features.npz')

In [4]:
# Place the predictions of the five CNN blends side by side (column-wise) to
# form the level-2 feature matrices for train and test.
train_blocks = (
    y_pred_meta_dnn,
    y_pred_meta_dnn2,
    y_pred_meta_dnn3,
    y_pred_meta_dnn4,
    y_pred_meta_dnn5,
)
test_blocks = (
    y_pred_final_dnn,
    y_pred_final_dnn2,
    y_pred_final_dnn3,
    y_pred_final_dnn4,
    y_pred_final_dnn5,
)
Xtrain_meta = np.concatenate(train_blocks, axis=1)
Xtest_meta = np.concatenate(test_blocks, axis=1)

# Targets in two encodings: the raw label column and its one-hot expansion.
target_column = train_df['target']
Ytrain_meta = target_column.values
Ytrain_oh = pd.get_dummies(target_column).values

# Report the resulting shapes as a quick sanity check.
print("Xtrain_meta shape: {}".format(Xtrain_meta.shape))
print("Ytrain_meta shape: {}".format(Ytrain_meta.shape))
print("Xtest_meta shape: {}".format(Xtest_meta.shape))

Xtrain_meta shape: (99918, 20)
Ytrain_meta shape: (99918,)
Xtest_meta shape: (50000, 20)


## Logistic Regression

In [5]:
# Repeated stratified K-fold stacking of a LogisticRegression meta-model on
# the CNN meta-features.  Produces:
#   y_pred_meta_lr  - out-of-fold predictions, averaged over the seeds
#   y_pred_final_lr - test predictions, averaged over every fitted model
FOLD = 10
NUM_SEED = 2

# Prediction Clipping Thresholds
p_min = 0.025
p_max = 1 - p_min

# Draw the split seeds from a private RandomState instead of reseeding
# NumPy's global generator; RandomState(3) follows the same legacy stream as
# np.random.seed(3), so the drawn seeds are unchanged.
seeds = np.random.RandomState(3).randint(0, 100, size=NUM_SEED)

oof_score = 0
y_pred_meta_lr = np.zeros((Ytrain_meta.shape[0], 4))
y_pred_final_lr = np.zeros((Xtest_meta.shape[0], 4))
counter = 0  # number of models fitted so far (seeds x folds)


for seed in seeds:
    seed_score = 0

    kfold = StratifiedKFold(n_splits=FOLD, shuffle=True, random_state=seed)

    for idx, (train, val) in enumerate(kfold.split(Xtrain_meta, Ytrain_meta)):
        counter += 1

        train_x, train_y = Xtrain_meta[train], Ytrain_meta[train]
        val_x, val_y_oh = Xtrain_meta[val], Ytrain_oh[val]

        # random_state is fixed, so the seeds above only vary the fold split.
        model = LogisticRegression(
            max_iter=2000,
            random_state=0
        )

        model.fit(train_x, train_y)

        # Out-of-fold predictions are clipped before being scored and stored;
        # test predictions are accumulated unclipped.
        y_pred = np.clip(model.predict_proba(val_x), p_min, p_max)
        y_pred_meta_lr[val] += y_pred
        y_pred_final_lr += model.predict_proba(Xtest_meta)

        score = log_loss(val_y_oh, y_pred)
        oof_score += score
        seed_score += score
        print("Seed-{} | Fold-{} | OOF Score: {}".format(seed, idx, score))

    print("\nSeed: {} | Aggregate OOF Score: {}\n\n".format(seed, (seed_score / FOLD)))


# Each training row lands in a validation fold once per seed, while the test
# set is predicted by every fitted model.
y_pred_meta_lr = y_pred_meta_lr / float(NUM_SEED)
y_pred_final_lr = y_pred_final_lr / float(counter)
oof_score /= float(counter)
print("Aggregate OOF Score: {}".format(oof_score))

Seed-24 | Fold-0 | OOF Score: 1.0862744123405081
Seed-24 | Fold-1 | OOF Score: 1.0864917311210855
Seed-24 | Fold-2 | OOF Score: 1.0904779832380633
Seed-24 | Fold-3 | OOF Score: 1.0842194211234273
Seed-24 | Fold-4 | OOF Score: 1.0877771857870926
Seed-24 | Fold-5 | OOF Score: 1.0871921943196983
Seed-24 | Fold-6 | OOF Score: 1.0861886043056292
Seed-24 | Fold-7 | OOF Score: 1.087618792160985
Seed-24 | Fold-8 | OOF Score: 1.0839881135794707
Seed-24 | Fold-9 | OOF Score: 1.0895576033806877

Seed: 24 | Aggregate OOF Score: 1.0869786041356648


Seed-3 | Fold-0 | OOF Score: 1.0851020991355564
Seed-3 | Fold-1 | OOF Score: 1.0859961793598556
Seed-3 | Fold-2 | OOF Score: 1.0875900300583863
Seed-3 | Fold-3 | OOF Score: 1.0894810803156505
Seed-3 | Fold-4 | OOF Score: 1.0862404520077424
Seed-3 | Fold-5 | OOF Score: 1.0891773307704418
Seed-3 | Fold-6 | OOF Score: 1.085036773263987
Seed-3 | Fold-7 | OOF Score: 1.0868809156101602
Seed-3 | Fold-8 | OOF Score: 1.0849781907883431
Seed-3 | Fold-9 | OOF Scor

In [6]:
# Clip the LogisticRegression test probabilities to [p_min, p_max] and write
# them out as a submission keyed by the ids of the competition test file.
y_pred_final_lr = np.clip(y_pred_final_lr, p_min, p_max)

test_df = pd.read_csv("../input/tabular-playground-series-may-2021/test.csv")

# Start from the id column, then append one probability column per class.
submit_df = test_df[['id']].copy()
lr_class_columns = ('Class_1', 'Class_2', 'Class_3', 'Class_4')
for col_pos, col_name in enumerate(lr_class_columns):
    submit_df[col_name] = y_pred_final_lr[:, col_pos]

submit_df.to_csv("./LR_submission.csv", index=False)
submit_df.head()

Unnamed: 0,id,Class_1,Class_2,Class_3,Class_4
0,100000,0.086097,0.640418,0.168208,0.105277
1,100001,0.08517,0.686622,0.140141,0.088067
2,100002,0.086082,0.670839,0.151071,0.092008
3,100003,0.091525,0.528176,0.27788,0.10242
4,100004,0.085982,0.652313,0.160165,0.10154


## RandomForest Classifier

In [7]:
# Repeated stratified K-fold stacking of a RandomForestClassifier meta-model
# on the CNN meta-features.  Produces:
#   y_pred_meta_rfc  - out-of-fold predictions, averaged over the seeds
#   y_pred_final_rfc - test predictions, averaged over every fitted model
FOLD = 10
NUM_SEED = 2

# Prediction Clipping Thresholds
p_min = 0.025
p_max = 1 - p_min

# Draw the split seeds from a private RandomState instead of reseeding
# NumPy's global generator; RandomState(3) follows the same legacy stream as
# np.random.seed(3), so the drawn seeds are unchanged.
seeds = np.random.RandomState(3).randint(0, 100, size=NUM_SEED)

oof_score = 0
y_pred_meta_rfc = np.zeros((Ytrain_meta.shape[0], 4))
y_pred_final_rfc = np.zeros((Xtest_meta.shape[0], 4))
counter = 0  # number of models fitted so far (seeds x folds)


for seed in seeds:
    seed_score = 0

    kfold = StratifiedKFold(n_splits=FOLD, shuffle=True, random_state=seed)

    for idx, (train, val) in enumerate(kfold.split(Xtrain_meta, Ytrain_meta)):
        counter += 1

        train_x, train_y = Xtrain_meta[train], Ytrain_meta[train]
        val_x, val_y_oh = Xtrain_meta[val], Ytrain_oh[val]

        # random_state is fixed, so the seeds above only vary the fold split.
        model = RandomForestClassifier(
            max_depth=6,
            random_state=0
        )

        model.fit(train_x, train_y)

        # Out-of-fold predictions are clipped before being scored and stored;
        # test predictions are accumulated unclipped.
        y_pred = np.clip(model.predict_proba(val_x), p_min, p_max)
        y_pred_meta_rfc[val] += y_pred
        y_pred_final_rfc += model.predict_proba(Xtest_meta)

        score = log_loss(val_y_oh, y_pred)
        oof_score += score
        seed_score += score
        print("Seed-{} | Fold-{} | OOF Score: {}".format(seed, idx, score))

    print("\nSeed: {} | Aggregate OOF Score: {}\n\n".format(seed, (seed_score / FOLD)))


# Each training row lands in a validation fold once per seed, while the test
# set is predicted by every fitted model.
y_pred_meta_rfc = y_pred_meta_rfc / float(NUM_SEED)
y_pred_final_rfc = y_pred_final_rfc / float(counter)
oof_score /= float(counter)
print("Aggregate OOF Score: {}".format(oof_score))

Seed-24 | Fold-0 | OOF Score: 1.0855765772952302
Seed-24 | Fold-1 | OOF Score: 1.0864002659992118
Seed-24 | Fold-2 | OOF Score: 1.0905753300870722
Seed-24 | Fold-3 | OOF Score: 1.0846747216511285
Seed-24 | Fold-4 | OOF Score: 1.0876702140794334
Seed-24 | Fold-5 | OOF Score: 1.0875061453365542
Seed-24 | Fold-6 | OOF Score: 1.086451383955301
Seed-24 | Fold-7 | OOF Score: 1.0874716669206075
Seed-24 | Fold-8 | OOF Score: 1.0844100217540482
Seed-24 | Fold-9 | OOF Score: 1.089702890761316

Seed: 24 | Aggregate OOF Score: 1.0870439217839902


Seed-3 | Fold-0 | OOF Score: 1.086814219793093
Seed-3 | Fold-1 | OOF Score: 1.0858753786207949
Seed-3 | Fold-2 | OOF Score: 1.0870390295952859
Seed-3 | Fold-3 | OOF Score: 1.0903116028935158
Seed-3 | Fold-4 | OOF Score: 1.0860026803522245
Seed-3 | Fold-5 | OOF Score: 1.0891149338062116
Seed-3 | Fold-6 | OOF Score: 1.0852614444756743
Seed-3 | Fold-7 | OOF Score: 1.0868133623913716
Seed-3 | Fold-8 | OOF Score: 1.0852327799103205
Seed-3 | Fold-9 | OOF Score

In [8]:
# Clip the RandomForest test probabilities to [p_min, p_max], overwrite the
# class columns of the existing submission frame and save it.
y_pred_final_rfc = np.clip(y_pred_final_rfc, p_min, p_max)

rfc_class_columns = ('Class_1', 'Class_2', 'Class_3', 'Class_4')
for col_pos, col_name in enumerate(rfc_class_columns):
    submit_df[col_name] = y_pred_final_rfc[:, col_pos]

submit_df.to_csv("./RFC_submission.csv", index=False)
submit_df.head()

Unnamed: 0,id,Class_1,Class_2,Class_3,Class_4
0,100000,0.085915,0.639332,0.171721,0.103032
1,100001,0.084012,0.680013,0.145266,0.090709
2,100002,0.085609,0.668547,0.15595,0.089894
3,100003,0.088023,0.546981,0.270568,0.094427
4,100004,0.085335,0.655085,0.161535,0.098045


## MLP Classifier

In [9]:
# Repeated stratified K-fold stacking of an MLPClassifier meta-model on the
# CNN meta-features.  Produces:
#   y_pred_meta_mlp  - out-of-fold predictions, averaged over the seeds
#   y_pred_final_mlp - test predictions, averaged over every fitted model
FOLD = 10
NUM_SEED = 2

# Prediction Clipping Thresholds
p_min = 0.025
p_max = 1 - p_min

# Draw the split seeds from a private RandomState instead of reseeding
# NumPy's global generator; RandomState(3) follows the same legacy stream as
# np.random.seed(3), so the drawn seeds are unchanged.
seeds = np.random.RandomState(3).randint(0, 100, size=NUM_SEED)

oof_score = 0
y_pred_meta_mlp = np.zeros((Ytrain_meta.shape[0], 4))
y_pred_final_mlp = np.zeros((Xtest_meta.shape[0], 4))
counter = 0  # number of models fitted so far (seeds x folds)


for seed in seeds:
    seed_score = 0

    kfold = StratifiedKFold(n_splits=FOLD, shuffle=True, random_state=seed)

    for idx, (train, val) in enumerate(kfold.split(Xtrain_meta, Ytrain_meta)):
        counter += 1

        train_x, train_y = Xtrain_meta[train], Ytrain_meta[train]
        val_x, val_y_oh = Xtrain_meta[val], Ytrain_oh[val]

        # Default hidden-layer layout; random_state is fixed, so the seeds
        # above only vary the fold split.
        model = MLPClassifier(
            learning_rate='adaptive',
            early_stopping=True,
            n_iter_no_change=30,
            random_state=0,
            max_iter=1000,
            verbose=False
        )

        model.fit(train_x, train_y)

        # Out-of-fold predictions are clipped before being scored and stored;
        # test predictions are accumulated unclipped.
        y_pred = np.clip(model.predict_proba(val_x), p_min, p_max)
        y_pred_meta_mlp[val] += y_pred
        y_pred_final_mlp += model.predict_proba(Xtest_meta)

        score = log_loss(val_y_oh, y_pred)
        oof_score += score
        seed_score += score
        print("Seed-{} | Fold-{} | OOF Score: {}".format(seed, idx, score))

    print("\nSeed: {} | Aggregate OOF Score: {}\n\n".format(seed, (seed_score / FOLD)))


# Each training row lands in a validation fold once per seed, while the test
# set is predicted by every fitted model.
y_pred_meta_mlp = y_pred_meta_mlp / float(NUM_SEED)
y_pred_final_mlp = y_pred_final_mlp / float(counter)
oof_score /= float(counter)
print("Aggregate OOF Score: {}".format(oof_score))

Seed-24 | Fold-0 | OOF Score: 1.0888281133983708
Seed-24 | Fold-1 | OOF Score: 1.0864290930497793
Seed-24 | Fold-2 | OOF Score: 1.0902727279213065
Seed-24 | Fold-3 | OOF Score: 1.0838228271015413
Seed-24 | Fold-4 | OOF Score: 1.0877151510786638
Seed-24 | Fold-5 | OOF Score: 1.0876121708146684
Seed-24 | Fold-6 | OOF Score: 1.0860095299674226
Seed-24 | Fold-7 | OOF Score: 1.088939007147367
Seed-24 | Fold-8 | OOF Score: 1.0839721229918253
Seed-24 | Fold-9 | OOF Score: 1.0903805272679723

Seed: 24 | Aggregate OOF Score: 1.0873981270738917


Seed-3 | Fold-0 | OOF Score: 1.0861976030905405
Seed-3 | Fold-1 | OOF Score: 1.0871011917845599
Seed-3 | Fold-2 | OOF Score: 1.087031502949526
Seed-3 | Fold-3 | OOF Score: 1.0899600812275303
Seed-3 | Fold-4 | OOF Score: 1.0858633965637783
Seed-3 | Fold-5 | OOF Score: 1.0902197546122352
Seed-3 | Fold-6 | OOF Score: 1.0852611160962637
Seed-3 | Fold-7 | OOF Score: 1.0876170412006922
Seed-3 | Fold-8 | OOF Score: 1.085711673898688
Seed-3 | Fold-9 | OOF Score

In [10]:
# Clip the MLP test probabilities to [p_min, p_max], overwrite the class
# columns of the existing submission frame and save it.
y_pred_final_mlp = np.clip(y_pred_final_mlp, p_min, p_max)

mlp_class_columns = ('Class_1', 'Class_2', 'Class_3', 'Class_4')
for col_pos, col_name in enumerate(mlp_class_columns):
    submit_df[col_name] = y_pred_final_mlp[:, col_pos]

submit_df.to_csv("./MLP_submission.csv", index=False)
submit_df.head()

Unnamed: 0,id,Class_1,Class_2,Class_3,Class_4
0,100000,0.086837,0.639096,0.165598,0.108469
1,100001,0.086039,0.68772,0.137049,0.089191
2,100002,0.086721,0.671878,0.148147,0.093254
3,100003,0.090722,0.52825,0.280104,0.100924
4,100004,0.086607,0.65206,0.157136,0.104197


## Voting Classifier

In [11]:
# Repeated stratified K-fold stacking of a soft-voting ensemble
# (LogisticRegression + RandomForest + MLP) on the CNN meta-features.
# Produces:
#   y_pred_meta_vc  - out-of-fold predictions, averaged over the seeds
#   y_pred_final_vc - test predictions, averaged over every fitted model
FOLD = 10
NUM_SEED = 2

# Prediction Clipping Thresholds
p_min = 0.025
p_max = 1 - p_min

# Draw the split seeds from a private RandomState instead of reseeding
# NumPy's global generator; RandomState(3) follows the same legacy stream as
# np.random.seed(3), so the drawn seeds are unchanged.
seeds = np.random.RandomState(3).randint(0, 100, size=NUM_SEED)

oof_score = 0
y_pred_meta_vc = np.zeros((Ytrain_meta.shape[0], 4))
y_pred_final_vc = np.zeros((Xtest_meta.shape[0], 4))
counter = 0  # number of models fitted so far (seeds x folds)


for seed in seeds:
    seed_score = 0

    kfold = StratifiedKFold(n_splits=FOLD, shuffle=True, random_state=seed)

    for idx, (train, val) in enumerate(kfold.split(Xtrain_meta, Ytrain_meta)):
        counter += 1

        train_x, train_y = Xtrain_meta[train], Ytrain_meta[train]
        val_x, val_y_oh = Xtrain_meta[val], Ytrain_oh[val]

        # Base estimators use the same settings as the stand-alone LR, RFC
        # and MLP cells; their random_state is fixed, so the seeds above only
        # vary the fold split.
        m1 = LogisticRegression(
            max_iter=2000,
            random_state=0
        )
        m2 = RandomForestClassifier(
            max_depth=6,
            random_state=0
        )
        m3 = MLPClassifier(
            learning_rate='adaptive',
            early_stopping=True,
            n_iter_no_change=30,
            random_state=0,
            max_iter=1000,
            verbose=False
        )

        # Soft voting with weights 3:2:1 for LR, RFC and MLP respectively.
        model = VotingClassifier([('lr', m1), ('rfc', m2), ('mlp', m3)],
                                 voting='soft', weights=[3,2,1])

        model.fit(train_x, train_y)

        # Out-of-fold predictions are clipped before being scored and stored;
        # test predictions are accumulated unclipped.
        y_pred = np.clip(model.predict_proba(val_x), p_min, p_max)
        y_pred_meta_vc[val] += y_pred
        y_pred_final_vc += model.predict_proba(Xtest_meta)

        score = log_loss(val_y_oh, y_pred)
        oof_score += score
        seed_score += score
        print("Seed-{} | Fold-{} | OOF Score: {}".format(seed, idx, score))

    print("\nSeed: {} | Aggregate OOF Score: {}\n\n".format(seed, (seed_score / FOLD)))


# Each training row lands in a validation fold once per seed, while the test
# set is predicted by every fitted model.
y_pred_meta_vc = y_pred_meta_vc / float(NUM_SEED)
y_pred_final_vc = y_pred_final_vc / float(counter)
oof_score /= float(counter)
print("Aggregate OOF Score: {}".format(oof_score))

Seed-24 | Fold-0 | OOF Score: 1.085847041274633
Seed-24 | Fold-1 | OOF Score: 1.0861658149927687
Seed-24 | Fold-2 | OOF Score: 1.0901739218168964
Seed-24 | Fold-3 | OOF Score: 1.0839847611858657
Seed-24 | Fold-4 | OOF Score: 1.0873938169736634
Seed-24 | Fold-5 | OOF Score: 1.087066119744375
Seed-24 | Fold-6 | OOF Score: 1.086032259082209
Seed-24 | Fold-7 | OOF Score: 1.0871929249706784
Seed-24 | Fold-8 | OOF Score: 1.0838796543720068
Seed-24 | Fold-9 | OOF Score: 1.0893397799598257

Seed: 24 | Aggregate OOF Score: 1.086707609437292


Seed-3 | Fold-0 | OOF Score: 1.0854560412239531
Seed-3 | Fold-1 | OOF Score: 1.0857177785509422
Seed-3 | Fold-2 | OOF Score: 1.087063794437447
Seed-3 | Fold-3 | OOF Score: 1.089526118082982
Seed-3 | Fold-4 | OOF Score: 1.0858362059547353
Seed-3 | Fold-5 | OOF Score: 1.0889290073191047
Seed-3 | Fold-6 | OOF Score: 1.0848530792598698
Seed-3 | Fold-7 | OOF Score: 1.0865901666359894
Seed-3 | Fold-8 | OOF Score: 1.084824186128898
Seed-3 | Fold-9 | OOF Score: 1.

In [12]:
# Clip the VotingClassifier test probabilities to [p_min, p_max], overwrite
# the class columns of the existing submission frame and save it.
y_pred_final_vc = np.clip(y_pred_final_vc, p_min, p_max)

vc_class_columns = ('Class_1', 'Class_2', 'Class_3', 'Class_4')
for col_pos, col_name in enumerate(vc_class_columns):
    submit_df[col_name] = y_pred_final_vc[:, col_pos]

submit_df.to_csv("./VC_submission.csv", index=False)
submit_df.head()

Unnamed: 0,id,Class_1,Class_2,Class_3,Class_4
0,100000,0.086159,0.639836,0.168944,0.105061
1,100001,0.084929,0.684602,0.141334,0.089135
2,100002,0.086031,0.670248,0.15221,0.091511
3,100003,0.090224,0.534457,0.275813,0.099506
4,100004,0.08587,0.653195,0.160117,0.100818


In [13]:
# Weighted blend of the four meta-model test predictions
# (LR 0.35, RFC 0.1, MLP 0.1, VC 0.45).  The terms are added left to right in
# that order.
weighted_parts = [
    y_pred_final_lr * 0.35,
    y_pred_final_rfc * 0.1,
    y_pred_final_mlp * 0.1,
    y_pred_final_vc * 0.45,
]
y_pred_final = weighted_parts[0]
for part in weighted_parts[1:]:
    y_pred_final = y_pred_final + part

# Overwrite the class columns of the existing submission frame and save it.
blend_class_columns = ('Class_1', 'Class_2', 'Class_3', 'Class_4')
for col_pos, col_name in enumerate(blend_class_columns):
    submit_df[col_name] = y_pred_final[:, col_pos]

submit_df.to_csv("./BLEND_submission2.csv", index=False)
submit_df.head()

Unnamed: 0,id,Class_1,Class_2,Class_3,Class_4
0,100000,0.086181,0.639915,0.16863,0.105274
1,100001,0.085033,0.685162,0.140881,0.088924
2,100002,0.086075,0.670448,0.151779,0.091698
3,100003,0.090509,0.53289,0.276441,0.10016
4,100004,0.085929,0.652962,0.159977,0.101132
