## Import libraries

In [1]:
import gc
import pickle
import numpy as np
import pandas as pd
from sklearn.metrics import log_loss
from sklearn.model_selection import StratifiedKFold

from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier

## Prepare data

In [2]:
# Load the pickled dictionary and pull out the training labels.
# NOTE(review): unpickling can execute arbitrary code -- only load this file
# from a source you trust.
with open("../input/mh-new-dawn/MH_New_Dawn_Set2.txt", 'rb') as handle:
    # Deserialise straight from the handle so no separate copy of the raw
    # bytes stays bound in the notebook namespace after the clean-up below.
    processed_data = pickle.load(handle)

Ytrain_meta = processed_data['Ytrain']            # class labels (used for stratification and fitting)
Ytrain_oh = processed_data['Ytrain_oh']           # label matrix passed to log_loss when scoring folds
class_weight = processed_data['class_weight']     # only referenced by a commented-out model option below

# Drop the container; the three extracted objects keep their own references.
del processed_data
gc.collect()

20

In [3]:
# Each inference part ships an .npz archive holding two arrays:
#   'y_pred_meta_dnn'  -> predictions on the training rows (meta features)
#   'y_pred_final_dnn' -> predictions on the test rows
# The archive is opened as a context manager so its file handle is closed as
# soon as both arrays have been read into memory.
# NOTE(review): allow_pickle=True permits unpickling object arrays, which can
# execute arbitrary code -- keep it only if these archives are trusted.
with np.load('../input/mh-new-dawn-dnn-inference-p1/DNN_Meta_Features.npz', allow_pickle=True) as ds:
    y_pred_meta_dnn1, y_pred_final_dnn1 = ds['y_pred_meta_dnn'], ds['y_pred_final_dnn']

with np.load('../input/mh-new-dawn-dnn-inference-p2/DNN_Meta_Features.npz', allow_pickle=True) as ds:
    y_pred_meta_dnn2, y_pred_final_dnn2 = ds['y_pred_meta_dnn'], ds['y_pred_final_dnn']

with np.load('../input/mh-new-dawn-dnn-inference-p3/DNN_Meta_Features.npz', allow_pickle=True) as ds:
    y_pred_meta_dnn3, y_pred_final_dnn3 = ds['y_pred_meta_dnn'], ds['y_pred_final_dnn']

with np.load('../input/mh-new-dawn-dnn-inference-p4/DNN_Meta_Features.npz', allow_pickle=True) as ds:
    y_pred_meta_dnn4, y_pred_final_dnn4 = ds['y_pred_meta_dnn'], ds['y_pred_final_dnn']

# Part 5 is currently disabled and is not part of the stacked feature matrix.
#ds = np.load('../input/mh-new-dawn-dnn-inference-p5/DNN_Meta_Features.npz', allow_pickle=True)
#y_pred_meta_dnn5, y_pred_final_dnn5 = ds['y_pred_meta_dnn'], ds['y_pred_final_dnn']

In [4]:
# Place the four DNN prediction blocks side by side (column-wise) to form the
# level-2 feature matrices: one for the training rows, one for the test rows.
Xtrain_meta = np.concatenate(
    [y_pred_meta_dnn1, y_pred_meta_dnn2, y_pred_meta_dnn3, y_pred_meta_dnn4],
    axis=1,
)
Xtest_meta = np.concatenate(
    [y_pred_final_dnn1, y_pred_final_dnn2, y_pred_final_dnn3, y_pred_final_dnn4],
    axis=1,
)

# Sanity check: row counts of Xtrain_meta and Ytrain_meta should agree.
print("Xtrain_meta shape: {}".format(Xtrain_meta.shape))
print("Ytrain_meta shape: {}".format(Ytrain_meta.shape))
print("Xtest_meta shape: {}".format(Xtest_meta.shape))

Xtrain_meta shape: (44095, 12)
Ytrain_meta shape: (44095,)
Xtest_meta shape: (18900, 12)


## Logistic Regression

In [5]:
# Level-2 model: LogisticRegression fitted on the stacked DNN predictions.
# For every seed a fresh stratified K-fold split is drawn; each fold model is
# scored on its held-out part and its test predictions are accumulated, then
# averaged over all (seed, fold) models.
FOLD = 5
SEEDS = [2020, 2022]

# predict_proba returns one column per class, so size the accumulator from
# the labels instead of hard-coding the class count.
n_classes = len(np.unique(Ytrain_meta))

counter = 0      # number of fold models fitted so far
oof_score = 0    # running sum of per-fold validation log-losses
y_pred_final_lr = np.zeros((Xtest_meta.shape[0], n_classes))


for seed in SEEDS:
    seed_score = 0

    # The seed only changes how rows are assigned to folds; the estimator
    # itself always uses random_state=0.
    kfold = StratifiedKFold(n_splits=FOLD, shuffle=True, random_state=seed)

    for idx, (train, val) in enumerate(kfold.split(Xtrain_meta, Ytrain_meta)):
        counter += 1

        train_x, train_y = Xtrain_meta[train], Ytrain_meta[train]
        val_x, val_y_oh = Xtrain_meta[val], Ytrain_oh[val]

        model = LogisticRegression(
            max_iter=1000,
            #class_weight=class_weight, 
            random_state=0
        )

        model.fit(train_x, train_y)

        y_pred = model.predict_proba(val_x)
        y_pred_final_lr += model.predict_proba(Xtest_meta)

        # Scored against the one-hot labels; assumes their column order
        # matches model.classes_ -- TODO confirm.
        score = log_loss(val_y_oh, y_pred)
        oof_score += score
        seed_score += score
        print("Seed-{} | Fold-{} | OOF Score: {}".format(seed, idx, score))

    print("\nSeed: {} | Aggregate OOF Score: {}\n\n".format(seed, (seed_score / FOLD)))


# Turn the accumulated sums into means over all fitted models.
y_pred_final_lr = y_pred_final_lr / float(counter)
oof_score /= float(counter)
print("Aggregate OOF Score: {}".format(oof_score))

Seed-2020 | Fold-0 | OOF Score: 0.6441798083165232
Seed-2020 | Fold-1 | OOF Score: 0.6281100047284521
Seed-2020 | Fold-2 | OOF Score: 0.6338160273072584
Seed-2020 | Fold-3 | OOF Score: 0.639678971366512
Seed-2020 | Fold-4 | OOF Score: 0.6345086538635228

Seed: 2020 | Aggregate OOF Score: 0.6360586931164537


Seed-2022 | Fold-0 | OOF Score: 0.6442198262337235
Seed-2022 | Fold-1 | OOF Score: 0.6411237333912296
Seed-2022 | Fold-2 | OOF Score: 0.6309549129103201
Seed-2022 | Fold-3 | OOF Score: 0.6244342020873435
Seed-2022 | Fold-4 | OOF Score: 0.6393172680539669

Seed: 2022 | Aggregate OOF Score: 0.6360099885353167


Aggregate OOF Score: 0.6360343408258852


## KNeighborsClassifier

In [6]:
# Level-2 model: KNeighborsClassifier fitted on the stacked DNN predictions.
# Same protocol as the other meta-learners: repeated stratified K-fold,
# per-fold log-loss, and test predictions averaged over all fold models.
FOLD = 5
SEEDS = [2020, 2022]

# predict_proba returns one column per class, so size the accumulator from
# the labels instead of hard-coding the class count.
n_classes = len(np.unique(Ytrain_meta))

counter = 0      # number of fold models fitted so far
oof_score = 0    # running sum of per-fold validation log-losses
y_pred_final_knc = np.zeros((Xtest_meta.shape[0], n_classes))


for seed in SEEDS:
    seed_score = 0

    # A different seed only reshuffles the fold assignment.
    kfold = StratifiedKFold(n_splits=FOLD, shuffle=True, random_state=seed)

    for idx, (train, val) in enumerate(kfold.split(Xtrain_meta, Ytrain_meta)):
        counter += 1

        train_x, train_y = Xtrain_meta[train], Ytrain_meta[train]
        val_x, val_y_oh = Xtrain_meta[val], Ytrain_oh[val]

        # A large neighbourhood: each probability is the class share among
        # the 500 nearest training rows.
        model = KNeighborsClassifier(
            n_neighbors=500
        )

        model.fit(train_x, train_y)

        y_pred = model.predict_proba(val_x)
        y_pred_final_knc += model.predict_proba(Xtest_meta)

        # Scored against the one-hot labels; assumes their column order
        # matches model.classes_ -- TODO confirm.
        score = log_loss(val_y_oh, y_pred)
        oof_score += score
        seed_score += score
        print("Seed-{} | Fold-{} | OOF Score: {}".format(seed, idx, score))

    print("\nSeed: {} | Aggregate OOF Score: {}\n\n".format(seed, (seed_score / FOLD)))


# Turn the accumulated sums into means over all fitted models.
y_pred_final_knc = y_pred_final_knc / float(counter)
oof_score /= float(counter)
print("Aggregate OOF Score: {}".format(oof_score))

Seed-2020 | Fold-0 | OOF Score: 0.6265768096652884
Seed-2020 | Fold-1 | OOF Score: 0.6146835624473967
Seed-2020 | Fold-2 | OOF Score: 0.6273971880583975
Seed-2020 | Fold-3 | OOF Score: 0.6223701311168834
Seed-2020 | Fold-4 | OOF Score: 0.6210182813585036

Seed: 2020 | Aggregate OOF Score: 0.622409194529294


Seed-2022 | Fold-0 | OOF Score: 0.6275227913150689
Seed-2022 | Fold-1 | OOF Score: 0.6269875713549833
Seed-2022 | Fold-2 | OOF Score: 0.6146775766770536
Seed-2022 | Fold-3 | OOF Score: 0.6088486306424743
Seed-2022 | Fold-4 | OOF Score: 0.6346214105880617

Seed: 2022 | Aggregate OOF Score: 0.6225315961155283


Aggregate OOF Score: 0.6224703953224111


## HistGradientBoostingClassifier

In [7]:
# Level-2 model: HistGradientBoostingClassifier fitted on the stacked DNN
# predictions. Same protocol as the other meta-learners: repeated stratified
# K-fold, per-fold log-loss, and test predictions averaged over all models.
FOLD = 5
SEEDS = [2020, 2022]

# predict_proba returns one column per class, so size the accumulator from
# the labels instead of hard-coding the class count.
n_classes = len(np.unique(Ytrain_meta))

counter = 0      # number of fold models fitted so far
oof_score = 0    # running sum of per-fold validation log-losses
y_pred_final_gbc = np.zeros((Xtest_meta.shape[0], n_classes))


for seed in SEEDS:
    seed_score = 0

    # A different seed only reshuffles the fold assignment.
    kfold = StratifiedKFold(n_splits=FOLD, shuffle=True, random_state=seed)

    for idx, (train, val) in enumerate(kfold.split(Xtrain_meta, Ytrain_meta)):
        counter += 1

        train_x, train_y = Xtrain_meta[train], Ytrain_meta[train]
        val_x, val_y_oh = Xtrain_meta[val], Ytrain_oh[val]

        # `loss` is deliberately left at its default. The previous explicit
        # 'categorical_crossentropy' is what the default resolves to for a
        # multiclass target on older scikit-learn, while newer releases
        # reject that spelling (it was renamed to 'log_loss'). Omitting it
        # keeps the same objective and works on both.
        model = HistGradientBoostingClassifier(
            learning_rate=0.01,
            max_iter=1000,
            max_leaf_nodes=52, 
            max_depth=6,
            min_samples_leaf=10,
            early_stopping=True,
            n_iter_no_change=30,
            random_state=0
        )

        model.fit(train_x, train_y)

        y_pred = model.predict_proba(val_x)
        y_pred_final_gbc += model.predict_proba(Xtest_meta)

        # Scored against the one-hot labels; assumes their column order
        # matches model.classes_ -- TODO confirm.
        score = log_loss(val_y_oh, y_pred)
        oof_score += score
        seed_score += score
        print("Seed-{} | Fold-{} | OOF Score: {}".format(seed, idx, score))

    print("\nSeed: {} | Aggregate OOF Score: {}\n\n".format(seed, (seed_score / FOLD)))


# Turn the accumulated sums into means over all fitted models.
y_pred_final_gbc = y_pred_final_gbc / float(counter)
oof_score /= float(counter)
print("Aggregate OOF Score: {}".format(oof_score))

Seed-2020 | Fold-0 | OOF Score: 0.6233800040607784
Seed-2020 | Fold-1 | OOF Score: 0.6139659203730298
Seed-2020 | Fold-2 | OOF Score: 0.6193112686001623
Seed-2020 | Fold-3 | OOF Score: 0.6243130455241109
Seed-2020 | Fold-4 | OOF Score: 0.6208503866000334

Seed: 2020 | Aggregate OOF Score: 0.620364125031623


Seed-2022 | Fold-0 | OOF Score: 0.6272020099425383
Seed-2022 | Fold-1 | OOF Score: 0.6240428258866545
Seed-2022 | Fold-2 | OOF Score: 0.6135581305765453
Seed-2022 | Fold-3 | OOF Score: 0.6107217535607747
Seed-2022 | Fold-4 | OOF Score: 0.6266473674150629

Seed: 2022 | Aggregate OOF Score: 0.6204344174763152


Aggregate OOF Score: 0.6203992712539691


## MLP Classifier

In [8]:
# Level-2 model: MLPClassifier fitted on the stacked DNN predictions.
# Same protocol as the other meta-learners: repeated stratified K-fold,
# per-fold log-loss, and test predictions averaged over all fold models.
FOLD = 5
SEEDS = [2020, 2022]

# predict_proba returns one column per class, so size the accumulator from
# the labels instead of hard-coding the class count.
n_classes = len(np.unique(Ytrain_meta))

counter = 0      # number of fold models fitted so far
oof_score = 0    # running sum of per-fold validation log-losses
y_pred_final_mlp = np.zeros((Xtest_meta.shape[0], n_classes))


for seed in SEEDS:
    seed_score = 0

    # A different seed only reshuffles the fold assignment.
    kfold = StratifiedKFold(n_splits=FOLD, shuffle=True, random_state=seed)

    for idx, (train, val) in enumerate(kfold.split(Xtrain_meta, Ytrain_meta)):
        counter += 1

        train_x, train_y = Xtrain_meta[train], Ytrain_meta[train]
        val_x, val_y_oh = Xtrain_meta[val], Ytrain_oh[val]

        # learning_rate='adaptive' was dropped: scikit-learn only honours
        # that schedule with solver='sgd', and this model uses the default
        # solver, so the option had no effect on training.
        model = MLPClassifier(
            #hidden_layer_sizes=(32, 8),
            learning_rate_init=0.001,
            max_iter=1000,
            early_stopping=True,
            n_iter_no_change=30,
            random_state=0
        )

        model.fit(train_x, train_y)

        y_pred = model.predict_proba(val_x)
        y_pred_final_mlp += model.predict_proba(Xtest_meta)

        # Scored against the one-hot labels; assumes their column order
        # matches model.classes_ -- TODO confirm.
        score = log_loss(val_y_oh, y_pred)
        oof_score += score
        seed_score += score
        print("Seed-{} | Fold-{} | OOF Score: {}".format(seed, idx, score))

    print("\nSeed: {} | Aggregate OOF Score: {}\n\n".format(seed, (seed_score / FOLD)))


# Turn the accumulated sums into means over all fitted models.
y_pred_final_mlp = y_pred_final_mlp / float(counter)
oof_score /= float(counter)
print("Aggregate OOF Score: {}".format(oof_score))

Seed-2020 | Fold-0 | OOF Score: 0.6296952620346137
Seed-2020 | Fold-1 | OOF Score: 0.6106338403154398
Seed-2020 | Fold-2 | OOF Score: 0.6194802996758536
Seed-2020 | Fold-3 | OOF Score: 0.6211769568422062
Seed-2020 | Fold-4 | OOF Score: 0.6286114002649251

Seed: 2020 | Aggregate OOF Score: 0.6219195518266076


Seed-2022 | Fold-0 | OOF Score: 0.6248016568657335
Seed-2022 | Fold-1 | OOF Score: 0.622897941377972
Seed-2022 | Fold-2 | OOF Score: 0.6241284832076678
Seed-2022 | Fold-3 | OOF Score: 0.6059229364618478
Seed-2022 | Fold-4 | OOF Score: 0.6282057646948623

Seed: 2022 | Aggregate OOF Score: 0.6211913565216167


Aggregate OOF Score: 0.6215554541741122


## Random Forest Classifier

In [9]:
# Level-2 model: RandomForestClassifier fitted on the stacked DNN predictions.
# Same protocol as the other meta-learners: repeated stratified K-fold,
# per-fold log-loss, and test predictions averaged over all fold models.
FOLD = 5
SEEDS = [2020, 2022]

# predict_proba returns one column per class, so size the accumulator from
# the labels instead of hard-coding the class count.
n_classes = len(np.unique(Ytrain_meta))

counter = 0      # number of fold models fitted so far
oof_score = 0    # running sum of per-fold validation log-losses
y_pred_final_rfc = np.zeros((Xtest_meta.shape[0], n_classes))


for seed in SEEDS:
    seed_score = 0

    # A different seed only reshuffles the fold assignment; the forest itself
    # always uses random_state=0.
    kfold = StratifiedKFold(n_splits=FOLD, shuffle=True, random_state=seed)

    for idx, (train, val) in enumerate(kfold.split(Xtrain_meta, Ytrain_meta)):
        counter += 1

        train_x, train_y = Xtrain_meta[train], Ytrain_meta[train]
        val_x, val_y_oh = Xtrain_meta[val], Ytrain_oh[val]

        model = RandomForestClassifier(
            n_estimators=100,
            max_depth=6,
            max_leaf_nodes=52,
            random_state=0
        )

        model.fit(train_x, train_y)

        y_pred = model.predict_proba(val_x)
        y_pred_final_rfc += model.predict_proba(Xtest_meta)

        # Scored against the one-hot labels; assumes their column order
        # matches model.classes_ -- TODO confirm.
        score = log_loss(val_y_oh, y_pred)
        oof_score += score
        seed_score += score
        print("Seed-{} | Fold-{} | OOF Score: {}".format(seed, idx, score))

    print("\nSeed: {} | Aggregate OOF Score: {}\n\n".format(seed, (seed_score / FOLD)))


# Turn the accumulated sums into means over all fitted models.
y_pred_final_rfc = y_pred_final_rfc / float(counter)
oof_score /= float(counter)
print("Aggregate OOF Score: {}".format(oof_score))

Seed-2020 | Fold-0 | OOF Score: 0.6267924304764533
Seed-2020 | Fold-1 | OOF Score: 0.613613694493062
Seed-2020 | Fold-2 | OOF Score: 0.6219331029986911
Seed-2020 | Fold-3 | OOF Score: 0.6244716728115406
Seed-2020 | Fold-4 | OOF Score: 0.6223444432923584

Seed: 2020 | Aggregate OOF Score: 0.6218310688144211


Seed-2022 | Fold-0 | OOF Score: 0.6293069414626773
Seed-2022 | Fold-1 | OOF Score: 0.6257095280353574
Seed-2022 | Fold-2 | OOF Score: 0.6161508573929447
Seed-2022 | Fold-3 | OOF Score: 0.6113205890747835
Seed-2022 | Fold-4 | OOF Score: 0.6262739449398024

Seed: 2022 | Aggregate OOF Score: 0.621752372181113


Aggregate OOF Score: 0.621791720497767


## Create submission file

In [10]:
# Write the averaged gradient-boosting test probabilities as a standalone
# submission, one column per class, then preview the first rows.
submit_df = pd.DataFrame(
    data=y_pred_final_gbc,
    columns=['Negative_0', 'Neutral_1', 'Positive_2'],
)
submit_df.to_csv("./GBC_submission.csv", index=False)
submit_df.head()

Unnamed: 0,Negative_0,Neutral_1,Positive_2
0,0.1798,0.088559,0.731641
1,0.778827,0.134287,0.086886
2,0.976396,0.001509,0.022095
3,0.798406,0.07466,0.126934
4,0.42153,0.446779,0.131691


In [11]:
# Weighted blend of the five meta-learners' test probabilities.
# The weights (0.05 + 0.05 + 0.5 + 0.2 + 0.2) add up to 1.
y_pred_final = (
    (y_pred_final_lr * 0.05)
    + (y_pred_final_knc * 0.05)
    + (y_pred_final_gbc * 0.5)
    + (y_pred_final_mlp * 0.2)
    + (y_pred_final_rfc * 0.2)
)

# Save the blend with one column per class and preview the first rows.
submit_df = pd.DataFrame(
    data=y_pred_final,
    columns=['Negative_0', 'Neutral_1', 'Positive_2'],
)
submit_df.to_csv("./Blend_Submission.csv", index=False)
submit_df.head()

Unnamed: 0,Negative_0,Neutral_1,Positive_2
0,0.173722,0.086261,0.740016
1,0.782094,0.126459,0.091447
2,0.972462,0.004109,0.023429
3,0.810142,0.07387,0.115988
4,0.411221,0.444504,0.144276
