# *Importing Libraries*

In [2]:
#!pip install -U lightautoml
# Standard python libraries
import os
import time

# Essential DS libraries
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
import torch

# LightAutoML presets, task and report generation
from lightautoml.automl.presets.tabular_presets import TabularAutoML, TabularUtilizedAutoML
from lightautoml.tasks import Task
from lightautoml.report.report_deco import ReportDeco

# *Setting up constants*

In [66]:
N_THREADS = 4
N_FOLDS = 10
RANDOM_STATE = 42
TEST_SIZE = 0.2
TIMEOUT = 5*3600
TARGET_NAME = 'song_popularity'

In [42]:
np.random.seed(RANDOM_STATE)
torch.set_num_threads(N_THREADS)

# *Load data*

In [43]:
%%time

train_data = pd.read_csv('D:\\COMPI-TOP\\Song-popularity\\train.csv')
train_data.head()

Wall time: 70.5 ms


Unnamed: 0,id,song_duration_ms,acousticness,danceability,energy,instrumentalness,key,liveness,loudness,audio_mode,speechiness,tempo,time_signature,audio_valence,song_popularity
0,0,212990.0,0.642286,0.85652,0.707073,0.002001,10.0,,-5.619088,0,0.08257,158.386236,4,0.734642,0
1,1,,0.054866,0.733289,0.835545,0.000996,8.0,0.436428,-5.236965,1,0.127358,102.752988,3,0.711531,1
2,2,193213.0,,0.188387,0.783524,-0.002694,5.0,0.170499,-4.951759,0,0.052282,178.685791,3,0.425536,0
3,3,249893.0,0.48866,0.585234,0.552685,0.000608,0.0,0.094805,-7.893694,0,0.035618,128.71563,3,0.453597,0
4,4,165969.0,0.493017,,0.740982,0.002033,10.0,0.094891,-2.684095,0,0.050746,121.928157,4,0.741311,0


In [44]:
test_data = pd.read_csv('D:\\COMPI-TOP\\Song-popularity\\test.csv')
test_data.head()

Unnamed: 0,id,song_duration_ms,acousticness,danceability,energy,instrumentalness,key,liveness,loudness,audio_mode,speechiness,tempo,time_signature,audio_valence
0,0,308523.0,0.019845,,0.908939,0.001438,,0.112832,-8.890172,0,0.082714,126.129304,4,0.39962
1,1,200011.0,0.070119,0.731256,0.444655,0.00202,10.0,0.13904,-6.301214,0,0.061685,86.448149,3,0.499424
2,2,279758.0,0.810637,0.568858,0.125466,0.898841,0.0,0.226614,-11.542478,0,0.041868,99.544351,3,0.564951
3,3,249197.0,,0.871789,0.557342,0.000715,4.0,0.325391,-7.905546,1,0.046815,123.063854,4,0.906485
4,4,,0.765568,0.624687,0.710794,0.000346,8.0,0.308284,,0,0.129284,88.703121,3,0.935571


In [45]:
samp_sub = pd.read_csv('D:\\COMPI-TOP\\Song-popularity\\sample_submission.csv')
samp_sub.head()

Unnamed: 0,id,song_popularity
0,0,0
1,1,0
2,2,0
3,3,0
4,4,0


# *Splitting Data*

In [46]:
%%time
tr_data, te_data = train_test_split(train_data, 
                                    test_size=TEST_SIZE, 
                                    stratify=train_data[TARGET_NAME], 
                                    random_state=RANDOM_STATE)
print('Data splitted. Parts sizes: tr_data = {}, te_data = {}'.format(tr_data.shape, te_data.shape))

Data splitted. Parts sizes: tr_data = (32000, 15), te_data = (8000, 15)
Wall time: 17 ms


In [47]:
tr_data.head()

Unnamed: 0,id,song_duration_ms,acousticness,danceability,energy,instrumentalness,key,liveness,loudness,audio_mode,speechiness,tempo,time_signature,audio_valence,song_popularity
8445,8445,249165.0,0.018023,0.691292,,0.00207,7.0,0.253386,,0,0.133197,119.674313,4,0.291141,0
29445,29445,262839.0,0.029204,0.29605,0.747173,0.000863,0.0,0.146838,-10.175356,1,0.331771,97.095876,4,0.476756,1
27510,27510,249180.0,,0.240227,0.981282,0.003295,6.0,0.111339,,0,0.095754,121.685407,3,0.226383,0
21464,21464,150192.0,0.896027,0.290141,0.260367,0.00438,10.0,0.210436,-12.902434,0,0.039707,122.609402,3,0.436087,1
1915,1915,169956.0,0.606447,0.784803,,0.000923,4.0,0.353536,-7.492491,0,0.043732,79.920547,3,0.343077,0


# *Adding oofs and test-predictions from AutoWoe Models*

In [48]:
from autowoe import AutoWoE
from sklearn.model_selection import StratifiedKFold

def get_oof_and_test_pred(tr, te, real_te):
    skf = StratifiedKFold(n_splits=N_FOLDS)

    oof_preds_woe = np.zeros(len(tr))
    test_preds_woe = np.zeros(len(te))
    real_test_preds_woe = np.zeros(len(real_te))

    y = tr['song_popularity'].values

    for fold, (train_idx, val_idx) in enumerate(skf.split(y, y)):

        X_tr, X_val = tr.iloc[train_idx, :], tr.iloc[val_idx, :]

        auto_woe = AutoWoE(monotonic=False,
                         vif_th=20.,
                         imp_th=0,
                         th_const=32,
                         force_single_split=True,
                         min_bin_size = 0.005,
                         oof_woe=True,
                         n_folds=5,
                         n_jobs=N_THREADS,
                         regularized_refit=True,
                         verbose=0)

        auto_woe.fit(X_tr.sample(20000, random_state = 13).drop('id', axis = 1), 
                     target_name="song_popularity")

        val_pred = auto_woe.predict_proba(X_val)
        print("FOLD {}, AUC_SCORE = {:.5f}".format(fold, roc_auc_score(X_val['song_popularity'], val_pred)))

        oof_preds_woe[val_idx] = val_pred
        test_preds_woe += auto_woe.predict_proba(te) / N_FOLDS
        real_test_preds_woe += auto_woe.predict_proba(real_te) / N_FOLDS

    print("AUC_SCORE TRAIN = {:.5f}".format(roc_auc_score(tr_data['song_popularity'], oof_preds_woe)))
    print("AUC_SCORE TEST = {:.5f}".format(roc_auc_score(te_data['song_popularity'], test_preds_woe)))
    
    return oof_preds_woe, test_preds_woe, real_test_preds_woe

In [49]:
oof_preds_woe, test_preds_woe, real_test_preds_woe = get_oof_and_test_pred(tr_data, te_data, test_data)

# This idea was in my mind but as it was already announced in @hiro5299834 
tr_data['missed_cnt'] = tr_data.isna().sum(axis=1)
te_data['missed_cnt'] = te_data.isna().sum(axis=1)
test_data['missed_cnt'] = test_data.isna().sum(axis=1)
oof_preds_woe2, test_preds_woe2, real_test_preds_woe2 = get_oof_and_test_pred(tr_data, te_data, test_data)

FOLD 0, AUC_SCORE = 0.53472
FOLD 1, AUC_SCORE = 0.57293
FOLD 2, AUC_SCORE = 0.56182
FOLD 3, AUC_SCORE = 0.55723
FOLD 4, AUC_SCORE = 0.57838
FOLD 5, AUC_SCORE = 0.54732
FOLD 6, AUC_SCORE = 0.56403
FOLD 7, AUC_SCORE = 0.55482
FOLD 8, AUC_SCORE = 0.56121
FOLD 9, AUC_SCORE = 0.55910
AUC_SCORE TRAIN = 0.55908
AUC_SCORE TEST = 0.56719
FOLD 0, AUC_SCORE = 0.53472
FOLD 1, AUC_SCORE = 0.57293
FOLD 2, AUC_SCORE = 0.56182
FOLD 3, AUC_SCORE = 0.55723
FOLD 4, AUC_SCORE = 0.57838
FOLD 5, AUC_SCORE = 0.54732
FOLD 6, AUC_SCORE = 0.56403
FOLD 7, AUC_SCORE = 0.55413
FOLD 8, AUC_SCORE = 0.56121
FOLD 9, AUC_SCORE = 0.55910
AUC_SCORE TRAIN = 0.55901
AUC_SCORE TEST = 0.56715


In [50]:
print("AUC_SCORE TEST = {:.5f}".format(roc_auc_score(te_data['song_popularity'], test_preds_woe)))
print("AUC_SCORE TEST = {:.5f}".format(roc_auc_score(te_data['song_popularity'], test_preds_woe2)))
print("AUC_SCORE TEST = {:.5f}".format(roc_auc_score(te_data['song_popularity'], 0.5 * test_preds_woe +
                                                                         0.5 * test_preds_woe2)))

AUC_SCORE TEST = 0.56719
AUC_SCORE TEST = 0.56715
AUC_SCORE TEST = 0.56717


In [51]:
from scipy.stats import rankdata
print("AUC_SCORE TEST = {:.5f}".format(roc_auc_score(te_data['song_popularity'], 0.5 * rankdata(test_preds_woe) +
                                                                         0.5 * rankdata(test_preds_woe2))))

AUC_SCORE TEST = 0.56717


In [52]:
tr_data['oof_woe_1'] = oof_preds_woe
te_data['oof_woe_1'] = test_preds_woe
test_data['oof_woe_1'] = real_test_preds_woe

tr_data['oof_woe_2'] = oof_preds_woe2
te_data['oof_woe_2'] = test_preds_woe2
test_data['oof_woe_2'] = real_test_preds_woe2

tr_data['oof_woe_12'] = 0.5 * oof_preds_woe + 0.5 * oof_preds_woe2
te_data['oof_woe_12'] = 0.5 * test_preds_woe + 0.5 * test_preds_woe2
test_data['oof_woe_12'] = 0.5 * real_test_preds_woe + 0.5 * real_test_preds_woe2

tr_data['rank_oof_woe_1'] = rankdata(oof_preds_woe)
te_data['rank_oof_woe_1'] = rankdata(test_preds_woe)
test_data['rank_oof_woe_1'] = rankdata(real_test_preds_woe)

tr_data['rank_oof_woe_2'] = rankdata(oof_preds_woe2)
te_data['rank_oof_woe_2'] = rankdata(test_preds_woe2)
test_data['rank_oof_woe_2'] = rankdata(real_test_preds_woe2)

tr_data['rank_oof_woe_12'] = 0.5 * rankdata(oof_preds_woe) + 0.5 * rankdata(oof_preds_woe2)
te_data['rank_oof_woe_12'] = 0.5 * rankdata(test_preds_woe) + 0.5 * rankdata(test_preds_woe2)
test_data['rank_oof_woe_12'] = 0.5 * rankdata(real_test_preds_woe) + 0.5 * rankdata(real_test_preds_woe2)

# *Model-building*

## *Setting up task*

In [53]:
%%time

task = Task('binary', )

Wall time: 1.97 ms


In [54]:
%%time

roles = {'target': TARGET_NAME,
         'drop': ['id']
         }

Wall time: 0 ns


## *Model Creation*

In [55]:
%%time 

automl = TabularAutoML(task = task, 
                        timeout = TIMEOUT,
                        cpu_limit = N_THREADS,
                        reader_params = {'n_jobs': N_THREADS, 'cv': N_FOLDS, 'random_state': RANDOM_STATE},
                        general_params = {'use_algos': [['lgb', 'lgb_tuned', 'linear_l2', 'cb', 'cb_tuned'], ['lgb', 'linear_l2']]},
                        selection_params = {'mode': 0}
                       )

RD = ReportDeco(output_path = 'tabularAutoML_model_report')
automl_rd = RD(automl)

oof_pred = automl_rd.fit_predict(tr_data, roles = roles)

Wall time: 8min 47s


In [56]:
%%time

te_pred = automl_rd.predict(te_data)
print('Prediction for te_data:\n{}\nShape = {}'.format(te_pred, te_pred.shape))

Prediction for te_data:
array([[0.3885191 ],
       [0.37146556],
       [0.38216656],
       ...,
       [0.34906876],
       [0.38018227],
       [0.3144596 ]], dtype=float32)
Shape = (8000, 1)
Wall time: 2.21 s


In [57]:
print('Check scores...')
print('OOF score: {}'.format(roc_auc_score(tr_data[TARGET_NAME].values, oof_pred.data[:, 0])))
print('HOLDOUT score: {}'.format(roc_auc_score(te_data[TARGET_NAME].values, te_pred.data[:, 0])))

Check scores...
OOF score: 0.568856404651221
HOLDOUT score: 0.5781695397791575


# *Training on whole data*

In [58]:
train_data = pd.concat([tr_data, te_data]).reset_index(drop = True)
print(train_data.shape)
train_data.head()

(40000, 22)


Unnamed: 0,id,song_duration_ms,acousticness,danceability,energy,instrumentalness,key,liveness,loudness,audio_mode,...,time_signature,audio_valence,song_popularity,missed_cnt,oof_woe_1,oof_woe_2,oof_woe_12,rank_oof_woe_1,rank_oof_woe_2,rank_oof_woe_12
0,8445,249165.0,0.018023,0.691292,,0.00207,7.0,0.253386,,0,...,4,0.291141,0,2,0.289544,0.289544,0.289544,3356.0,3343.0,3349.5
1,29445,262839.0,0.029204,0.29605,0.747173,0.000863,0.0,0.146838,-10.175356,1,...,4,0.476756,1,0,0.384845,0.384845,0.384845,20833.5,20841.5,20837.5
2,27510,249180.0,,0.240227,0.981282,0.003295,6.0,0.111339,,0,...,3,0.226383,0,2,0.341728,0.341728,0.341728,11693.0,11693.0,11693.0
3,21464,150192.0,0.896027,0.290141,0.260367,0.00438,10.0,0.210436,-12.902434,0,...,3,0.436087,1,0,0.436695,0.436695,0.436695,28470.0,28484.0,28477.0
4,1915,169956.0,0.606447,0.784803,,0.000923,4.0,0.353536,-7.492491,0,...,3,0.343077,0,1,0.378587,0.378587,0.378587,19505.0,19501.0,19503.0


In [59]:
%%time 

automl = TabularAutoML(task = task, 
                       timeout = TIMEOUT,
                       cpu_limit = N_THREADS,
                       reader_params = {'n_jobs': N_THREADS, 'cv': N_FOLDS, 'random_state': RANDOM_STATE},
                       tuning_params = {'max_tuning_time': 900}, # more time for params tuning
                       general_params = {'use_algos': [['lgb', 'lgb_tuned', 'linear_l2', 'cb', 'cb_tuned'], ['lgb', 'linear_l2']]},
                       selection_params = {'mode': 0} # no feature selection - everything is necessary :)
                      )

oof_pred = automl.fit_predict(train_data, roles = roles)

Wall time: 8min 2s


# *Using Utilized Model*

In [68]:
TIMEOUT = 2700

In [69]:
%%time 
automl = TabularUtilizedAutoML(task = task, 
                       timeout = TIMEOUT,
                       cpu_limit = N_THREADS,
                       reader_params = {'n_jobs': N_THREADS, 'cv': N_FOLDS, 'random_state': RANDOM_STATE},
                       tuning_params = {'max_tuning_time': 2700}, # more time for params tuning
                       general_params = {'use_algos': [['lgb', 'lgb_tuned', 'linear_l2', 'cb', 'cb_tuned'], ['lgb', 'linear_l2']]},
                       selection_params = {'mode': 0} # no feature selection - everything is necessary :)
                      )

oof_pred = automl.fit_predict(train_data, roles = roles)

Wall time: 39min 50s


In [70]:
print('Check scores...')
print('OOF score: {}'.format(roc_auc_score(train_data[TARGET_NAME].values, oof_pred.data[:, 0])))

Check scores...
OOF score: 0.5753449744258344


In [None]:
# 0.5711841736055511
# 0.5714597819445433

In [71]:
test_pred = automl.predict(test_data)
print('Prediction for test_data:\n{}\nShape = {}'.format(test_pred, test_pred.shape))

Prediction for test_data:
array([[0.4076663 ],
       [0.44756764],
       [0.31369266],
       ...,
       [0.35283282],
       [0.41616076],
       [0.30357975]], dtype=float32)
Shape = (10000, 1)


# *Creating submission file*

In [72]:
samp_sub[TARGET_NAME] = test_pred.data[:, 0]
samp_sub.to_csv('D:\\COMPI-TOP\\Song-popularity\\LightAUTOML_2.csv', index = False)

In [73]:
samp_sub.head()

Unnamed: 0,id,song_popularity
0,0,0.407666
1,1,0.447568
2,2,0.313693
3,3,0.32812
4,4,0.363966
