### Import Libraries

In [1]:
%%capture
!pip install -U lightautoml

In [2]:
# Standard python libraries
import os
import time

# Essential DS libraries
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
import torch

# LightAutoML presets, task and report generation
from lightautoml.automl.presets.tabular_presets import TabularAutoML, TabularUtilizedAutoML
from lightautoml.tasks import Task
from lightautoml.report.report_deco import ReportDeco

# Imputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.pipeline import Pipeline
from scipy.stats import rankdata
import gc


from autowoe import AutoWoE

### Constants

In [3]:
N_THREADS = 10
N_FOLDS = 10
RANDOM_STATE = 42
TEST_SIZE = 0.3
TIMEOUT = 8*3600
TARGET_NAME = 'song_popularity'

TRAIN_PATH = '../input/song-popularity-prediction/train.csv'
TEST_PATH = '../input/song-popularity-prediction/testcsv'
SAMPLE_PATH = '../input/song-popularity-prediction/sample_submission.csv'

np.random.seed(RANDOM_STATE)
torch.set_num_threads(N_THREADS)

### Load Data

In [4]:
%%time

train_data = pd.read_csv('../input/song-popularity-prediction/train.csv')
test_data = pd.read_csv('../input/song-popularity-prediction/test.csv')
samp_sub = pd.read_csv('../input/song-popularity-prediction/sample_submission.csv')

CPU times: user 120 ms, sys: 25.1 ms, total: 145 ms
Wall time: 293 ms


### Scaling and Imputation

In [5]:
%%time

iti = IterativeImputer()
Robust_scaler = RobustScaler()

target = ['song_popularity']
not_features = ['id', 'song_popularity']

cols = list(train_data.columns)
features = [feat for feat in cols if feat not in not_features]

train_knnimp = iti.fit_transform(train_data[features])
test_knnimp = iti.transform(test_data[features])

train_knnimp = Robust_scaler.fit_transform(train_knnimp)
test_knnimp = Robust_scaler.transform(test_knnimp)

train = pd.DataFrame(columns=train_data[features].columns, data=train_knnimp)
test = pd.DataFrame(columns=test_data[features].columns, data=test_knnimp)

train['song_popularity'] = train_data.song_popularity
train.insert(0, 'id', train_data.id)
test.insert(0, 'id', test_data.id)

CPU times: user 2.51 s, sys: 1.79 s, total: 4.3 s
Wall time: 2.19 s


In [6]:
train_data = train.copy()
test_data = test.copy()
del train
del test
del train_knnimp
del test_knnimp
gc.collect()

81

### Train Test Split

In [7]:
%%time
tr_data, te_data = train_test_split(train_data, 
                                    test_size=TEST_SIZE, 
                                    stratify=train_data[TARGET_NAME], 
                                    random_state=RANDOM_STATE)
print('Data splitted. Parts sizes: tr_data = {}, te_data = {}'.format(tr_data.shape, te_data.shape))

Data splitted. Parts sizes: tr_data = (28000, 15), te_data = (12000, 15)
CPU times: user 19.6 ms, sys: 0 ns, total: 19.6 ms
Wall time: 21.3 ms


### Adding oofs and test-predictions from AutoWoe Models

In [8]:
def get_oof_and_test_pred(tr, te, real_te):
    skf = StratifiedKFold(n_splits=N_FOLDS)

    oof_preds_woe = np.zeros(len(tr))
    test_preds_woe = np.zeros(len(te))
    real_test_preds_woe = np.zeros(len(real_te))

    y = tr['song_popularity'].values

    for fold, (train_idx, val_idx) in enumerate(skf.split(y, y)):

        X_tr, X_val = tr.iloc[train_idx, :], tr.iloc[val_idx, :]

        auto_woe = AutoWoE(monotonic=False,
                         vif_th=20.,
                         imp_th=0,
                         th_const=32,
                         force_single_split=True,
                         min_bin_size = 0.005,
                         oof_woe=True,
                         n_folds=5,
                         n_jobs=N_THREADS,
                         regularized_refit=True,
                         verbose=0)

        auto_woe.fit(X_tr.sample(20000, random_state = 42).drop('id', axis = 1), 
                     target_name="song_popularity")

        val_pred = auto_woe.predict_proba(X_val)
        print("FOLD {}, AUC_SCORE = {:.5f}".format(fold, roc_auc_score(X_val['song_popularity'], val_pred)))

        oof_preds_woe[val_idx] = val_pred
        test_preds_woe += auto_woe.predict_proba(te) / N_FOLDS
        real_test_preds_woe += auto_woe.predict_proba(real_te) / N_FOLDS

    print("AUC_SCORE TRAIN = {:.5f}".format(roc_auc_score(tr_data['song_popularity'], oof_preds_woe)))
    print("AUC_SCORE TEST = {:.5f}".format(roc_auc_score(te_data['song_popularity'], test_preds_woe)))
    
    return oof_preds_woe, test_preds_woe, real_test_preds_woe

In [9]:
oof_preds_woe, test_preds_woe, real_test_preds_woe = get_oof_and_test_pred(tr_data, te_data, test_data)

tr_data['missed_cnt'] = tr_data.isna().sum(axis=1)
te_data['missed_cnt'] = te_data.isna().sum(axis=1)
test_data['missed_cnt'] = test_data.isna().sum(axis=1)
oof_preds_woe2, test_preds_woe2, real_test_preds_woe2 = get_oof_and_test_pred(tr_data, te_data, test_data)

print("AUC_SCORE TEST = {:.5f}".format(roc_auc_score(te_data['song_popularity'], test_preds_woe)))
print("AUC_SCORE TEST = {:.5f}".format(roc_auc_score(te_data['song_popularity'], test_preds_woe2)))
print("AUC_SCORE TEST = {:.5f}".format(roc_auc_score(te_data['song_popularity'], 0.5 * test_preds_woe + 0.5 * test_preds_woe2)))
print("AUC_SCORE TEST = {:.5f}".format(roc_auc_score(te_data['song_popularity'], 0.5 * rankdata(test_preds_woe) + 0.5 * rankdata(test_preds_woe2))))

tr_data['oof_woe_1'] = oof_preds_woe
te_data['oof_woe_1'] = test_preds_woe
test_data['oof_woe_1'] = real_test_preds_woe

tr_data['oof_woe_2'] = oof_preds_woe2
te_data['oof_woe_2'] = test_preds_woe2
test_data['oof_woe_2'] = real_test_preds_woe2

tr_data['oof_woe_12'] = 0.5 * oof_preds_woe + 0.5 * oof_preds_woe2
te_data['oof_woe_12'] = 0.5 * test_preds_woe + 0.5 * test_preds_woe2
test_data['oof_woe_12'] = 0.5 * real_test_preds_woe + 0.5 * real_test_preds_woe2

tr_data['rank_oof_woe_1'] = rankdata(oof_preds_woe)
te_data['rank_oof_woe_1'] = rankdata(test_preds_woe)
test_data['rank_oof_woe_1'] = rankdata(real_test_preds_woe)

tr_data['rank_oof_woe_2'] = rankdata(oof_preds_woe2)
te_data['rank_oof_woe_2'] = rankdata(test_preds_woe2)
test_data['rank_oof_woe_2'] = rankdata(real_test_preds_woe2)

tr_data['rank_oof_woe_12'] = 0.5 * rankdata(oof_preds_woe) + 0.5 * rankdata(oof_preds_woe2)
te_data['rank_oof_woe_12'] = 0.5 * rankdata(test_preds_woe) + 0.5 * rankdata(test_preds_woe2)
test_data['rank_oof_woe_12'] = 0.5 * rankdata(real_test_preds_woe) + 0.5 * rankdata(real_test_preds_woe2)

 features [] contain too many nans or identical values
This may cause significantly different results comparing to the previous versions of LightGBM.
Try to set boost_from_average=false, if your old models produce bad results
[LightGBM] [Info] Number of positive: 5845, number of negative: 10155
[LightGBM] [Info] Total Bins 2805
[LightGBM] [Info] Number of data: 16000, number of used features: 13
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.365312 -> initscore=-0.552380
[LightGBM] [Info] Start training from score -0.552380
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[20]	val_set's auc: 0.560498
 features [] have low importance
Feature audio_mode removed due to single WOE value
Feature time_signature removed due to low AUC value 0.5042955491763604
C parameter range in [0.0003718854592785422:3.718854592785422], 20 values
Result(score=0.5757247731809313, reg_alpha=0.20288563708324708, is_neg=True, min_weights=energy             -0.65

### Model Building

In [10]:
%%time

task = Task('binary', )
roles = {'target': TARGET_NAME,
         'drop': ['id']}

CPU times: user 5.62 ms, sys: 63 µs, total: 5.68 ms
Wall time: 4.6 ms


In [11]:
%%time 

automl = TabularAutoML(task = task, 
                        timeout = TIMEOUT,
                        cpu_limit = N_THREADS,
                        reader_params = {'n_jobs': N_THREADS, 'cv': N_FOLDS, 'random_state': RANDOM_STATE},
                        general_params = {'use_algos': [['lgb', 'lgb_tuned', 'linear_l2', 'cb', 'cb_tuned', 'lgb_tuned', 'lgb_tuned', 'lgb_tuned', 'lgb', 'lgb_tuned', 'linear_l2'], ['linear_l2']]},
                        selection_params = {'mode': 0})

RD = ReportDeco(output_path = 'tabularAutoML_model_report')
automl_rd = RD(automl)
oof_pred = automl_rd.fit_predict(tr_data, roles = roles)
te_pred = automl_rd.predict(te_data)
print('Prediction for te_data:\n{}\nShape = {}'.format(te_pred, te_pred.shape))
print('OOF score: {}'.format(roc_auc_score(tr_data[TARGET_NAME].values, oof_pred.data[:, 0])))
print('HOLDOUT score: {}'.format(roc_auc_score(te_data[TARGET_NAME].values, te_pred.data[:, 0])))

Prediction for te_data:
array([[0.34075361],
       [0.35829973],
       [0.3035342 ],
       ...,
       [0.3489387 ],
       [0.38342965],
       [0.32817546]], dtype=float32)
Shape = (12000, 1)
OOF score: 0.5684230836610502
HOLDOUT score: 0.5776435707738623
CPU times: user 18min 17s, sys: 3min 42s, total: 21min 59s
Wall time: 13min 11s


### Training on Whole Data

In [12]:
train_data = pd.concat([tr_data, te_data]).reset_index(drop = True)
print(train_data.shape)
train_data.head()

(40000, 22)


Unnamed: 0,id,song_duration_ms,acousticness,danceability,energy,instrumentalness,key,liveness,loudness,audio_mode,...,time_signature,audio_valence,song_popularity,missed_cnt,oof_woe_1,oof_woe_2,oof_woe_12,rank_oof_woe_1,rank_oof_woe_2,rank_oof_woe_12
0,13898,-0.947208,0.32433,0.524932,-0.336972,0.367463,0.790971,-0.643172,-0.165804,0.0,...,1.0,0.656491,0,0,0.32507,0.32507,0.32507,6964.0,6964.0,6964.0
1,37144,-1.305897,-0.323108,0.602268,0.918888,-0.82372,0.624304,1.273297,0.301763,0.0,...,0.0,0.491159,0,0,0.31657,0.31657,0.31657,5704.0,5704.0,5704.0
2,980,0.368399,0.414091,-0.152927,0.174795,0.666655,0.790971,-0.184816,-1.136889,0.0,...,0.0,-0.880087,1,0,0.354439,0.354439,0.354439,12151.0,12151.0,12151.0
3,1816,-0.28767,0.598472,0.292502,0.811662,-0.792982,0.457637,-0.3431,0.471242,0.0,...,0.0,0.645036,1,0,0.414546,0.414546,0.414546,22683.0,22683.0,22683.0
4,27531,-0.428607,-0.340319,-1.28873,0.355879,0.41687,-0.709029,0.514374,0.126536,0.0,...,1.0,-0.274986,0,0,0.31092,0.31092,0.31092,4914.0,4914.0,4914.0


In [13]:
%%time 
automl = TabularUtilizedAutoML(task = task, 
                       timeout = TIMEOUT,
                       cpu_limit = N_THREADS,
                       reader_params = {'n_jobs': N_THREADS, 'cv': N_FOLDS, 'random_state': RANDOM_STATE},
                       tuning_params = {'max_tuning_time': 3600}, # more time for params tuning
                       general_params = {'use_algos': [['lgb', 'lgb_tuned', 'linear_l2', 'cb', 'cb_tuned', 'lgb_tuned', 'lgb_tuned', 'lgb_tuned', 'lgb', 'lgb_tuned', 'linear_l2'], ['linear_l2']]},
                       selection_params = {'mode': 0})

oof_pred = automl.fit_predict(train_data, roles = roles)

CPU times: user 14h 7min 59s, sys: 26min 49s, total: 14h 34min 48s
Wall time: 7h 45min 30s


In [14]:
print('OOF score: {}'.format(roc_auc_score(train_data[TARGET_NAME].values, oof_pred.data[:, 0])))

OOF score: 0.5786278843633059


### Create Submission

In [15]:
test_pred = automl.predict(test_data)
samp_sub[TARGET_NAME] = test_pred.data[:, 0]
samp_sub.to_csv('./LightAUTOMLsubmission_TUNED.csv', index = False)
print('Prediction for test_data:\n{}\nShape = {}'.format(test_pred, test_pred.shape))

Prediction for test_data:
array([[0.39818868],
       [0.50590307],
       [0.30382237],
       ...,
       [0.3438912 ],
       [0.436447  ],
       [0.2849599 ]], dtype=float32)
Shape = (10000, 1)
