## Import libraries

In [1]:
! pip install -U lightautoml

Collecting lightautoml
  Downloading LightAutoML-0.2.14-py3-none-any.whl (250 kB)
[K     |████████████████████████████████| 250 kB 4.5 MB/s 
Collecting importlib-metadata<2.0,>=1.0
  Downloading importlib_metadata-1.7.0-py2.py3-none-any.whl (31 kB)
Collecting poetry-core<2.0.0,>=1.0.0
  Downloading poetry_core-1.0.3-py2.py3-none-any.whl (424 kB)
[K     |████████████████████████████████| 424 kB 7.5 MB/s 
Collecting lightgbm<3.0,>=2.3
  Downloading lightgbm-2.3.1-py2.py3-none-manylinux1_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 8.3 MB/s 
Collecting json2html
  Downloading json2html-1.3.0.tar.gz (7.0 kB)
Collecting log-calls
  Downloading log_calls-0.3.2.tar.gz (232 kB)
[K     |████████████████████████████████| 232 kB 20.4 MB/s 
Collecting efficientnet-pytorch
  Downloading efficientnet_pytorch-0.7.1.tar.gz (21 kB)
Collecting autowoe>=1.2
  Downloading AutoWoE-1.2.5-py3-none-any.whl (204 kB)
[K     |████████████████████████████████| 204 

In [2]:
import gc
import pickle
import numpy as np
import pandas as pd
from sklearn.metrics import log_loss
from sklearn.model_selection import StratifiedKFold

from lightautoml.tasks import Task
from lightautoml.automl.presets.tabular_presets import TabularUtilizedAutoML

## Prepare data for model training

In [3]:
# NOTE(review): unpickling can execute arbitrary code -- confirm that this
# dataset file comes from a trusted source before running.
# Deserialize straight from the file handle so that no separate raw-bytes copy
# of the serialized dataset stays referenced in the kernel after loading.
with open("../input/tps-may-data-preprocess-v5/TPS_May_Dataset.txt", 'rb') as handle:
    processed_data = pickle.load(handle)

train_df = processed_data['train_df']
test_df = processed_data['test_df']

# One-hot encoded target; used later as y_true when scoring the OOF predictions.
Ytrain_oh = pd.get_dummies(train_df['target']).values

# Drop the container dict reference and trigger a garbage-collection pass.
del processed_data
gc.collect()

0

## Build and validate the model

In [4]:
# Resource budget for the AutoML run.
FOLD = 5                  # number of CV folds handed to the reader
N_THREADS = 4             # shared by the cpu limit and the reader's n_jobs
TIMEOUT = 3.5 * 60 * 60   # 3.5 hours, expressed in seconds

# Nested list of algorithm keys, one inner list per group.
# NOTE(review): presumably one inner list per stacking level, following
# LightAutoML's `use_algos` convention -- confirm against the library docs.
general_params = {
    'use_algos': [
        ['linear_l2', 'cb_tuned', 'lgb_tuned'],
        ['cb', 'lgb'],
    ],
}
reader_params = {'n_jobs': N_THREADS, 'cv': FOLD}

model = TabularUtilizedAutoML(
    task=Task('multiclass'),
    timeout=TIMEOUT,
    cpu_limit=N_THREADS,
    general_params=general_params,
    reader_params=reader_params,
)

# Fit on the training frame; the returned predictions for the training rows
# are scored against the one-hot labels in the next cell.
roles = {'target': 'target'}
y_pred_meta_lama = model.fit_predict(train_df, roles=roles)
print("\n\ny_pred_meta_lama: {}".format(y_pred_meta_lama.shape))

Current random state: {'reader_params': {'random_state': 42}, 'general_params': {'return_all_predictions': False}}
Found reader_params in kwargs, need to combine
Merged variant for reader_params = {'n_jobs': 4, 'cv': 5, 'random_state': 42}
Found general_params in kwargs, need to combine
Merged variant for general_params = {'use_algos': [['linear_l2', 'cb_tuned', 'lgb_tuned'], ['cb', 'lgb']], 'return_all_predictions': False}
Start automl preset with listed constraints:
- time: 12599.994782686234 seconds
- cpus: 4 cores
- memory: 16 gb

Train data shape: (99918, 101)
Feats was rejected during automatic roles guess: []


Layer 1 ...
Train process start. Time left 12555.3079931736 secs
Start fitting Lvl_0_Pipe_0_Mod_0_LinearL2 ...

===== Start working with fold 0 for Lvl_0_Pipe_0_Mod_0_LinearL2 =====

Linear model: C = 1e-05 score = -1.1109017351324602
Linear model: C = 5e-05 score = -1.1065233861990242
Linear model: C = 0.0001 score = -1.105628559517982
Linear model: C = 0.0005 score = -1

Copying TaskTimer may affect the parent PipelineTimer, so copy will create new unlimited TaskTimer


Start fitting Lvl_0_Pipe_1_Mod_0_LightGBM ...

===== Start working with fold 0 for Lvl_0_Pipe_1_Mod_0_LightGBM =====

Training until validation scores don't improve for 200 rounds
[100]	valid's multi_logloss: 1.09925
[200]	valid's multi_logloss: 1.09704
[300]	valid's multi_logloss: 1.0989
Early stopping, best iteration is:
[196]	valid's multi_logloss: 1.09702
Lvl_0_Pipe_1_Mod_0_LightGBM fitting and predicting completed
Start fitting Lvl_0_Pipe_1_Mod_0_LightGBM ...

===== Start working with fold 0 for Lvl_0_Pipe_1_Mod_0_LightGBM =====

Training until validation scores don't improve for 200 rounds
[100]	valid's multi_logloss: 1.09914
[200]	valid's multi_logloss: 1.0969
[300]	valid's multi_logloss: 1.09801
Early stopping, best iteration is:
[194]	valid's multi_logloss: 1.09682
Lvl_0_Pipe_1_Mod_0_LightGBM fitting and predicting completed
Start fitting Lvl_0_Pipe_1_Mod_0_LightGBM ...

===== Start working with fold 0 for Lvl_0_Pipe_1_Mod_0_LightGBM =====

Training until validation scores don

In [5]:
# Log-loss of the predictions returned by fit_predict against the one-hot
# encoded training target.
oof_probs = y_pred_meta_lama.data
oof_score = log_loss(Ytrain_oh, oof_probs)
print("Aggregate OOF Score: {}".format(oof_score))

Aggregate OOF Score: 1.0915339592727997


In [6]:
# Predict on the preprocessed test frame (loaded from the pickle) with the
# fitted AutoML model; the result's `.data` attribute is read by later cells.
y_pred_final_lama = model.predict(test_df)

In [7]:
# Persist the train-set predictions, their log-loss score and the test-set
# predictions together in one compressed .npz archive.
meta_features = {
    'y_pred_meta_lama': y_pred_meta_lama.data,
    'oof_score': oof_score,
    'y_pred_final_lama': y_pred_final_lama.data,
}
np.savez_compressed('./LAMA_Meta_Features.npz', **meta_features)

## Create submission file

In [8]:
# Read only the ids from the raw competition file, under a separate name, so
# the preprocessed `test_df` used by the prediction cell is not overwritten
# (keeps that cell re-runnable after this one has executed).
test_ids = pd.read_csv(
    "../input/tabular-playground-series-may-2021/test.csv", usecols=['id']
)['id']

test_probs = y_pred_final_lama.data

# Fail with a clear message if the predictions do not line up with the ids.
assert test_probs.shape[0] == len(test_ids), (
    "prediction rows ({}) do not match number of test ids ({})".format(
        test_probs.shape[0], len(test_ids)
    )
)

# NOTE(review): assumes prediction column k corresponds to Class_{k+1} --
# confirm against the model's class mapping.
submit_df = pd.DataFrame({'id': test_ids})
for class_idx in range(4):
    submit_df['Class_{}'.format(class_idx + 1)] = test_probs[:, class_idx]
submit_df.head()

Unnamed: 0,id,Class_1,Class_2,Class_3,Class_4
0,100000,0.093282,0.621752,0.163495,0.121472
1,100001,0.089292,0.667576,0.148231,0.094901
2,100002,0.085493,0.613204,0.19638,0.104923
3,100003,0.085608,0.517825,0.295891,0.100676
4,100004,0.077717,0.624484,0.179608,0.118191


In [9]:
# Write the submission frame to CSV without the DataFrame index column.
submit_df.to_csv("./LAMA_submission.csv", index=False)