In [None]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

# Number of bytes requested per read() call while streaming a download.
CHUNK_SIZE = 40960
# Comma-separated list of "<directory>:<URL-encoded download URL>" entries.
# Each entry is split on ':' and the URL part is percent-decoded before use.
# NOTE(review): the URL carries X-Goog-Date / X-Goog-Expires query parameters,
# so it presumably stops working once the signature expires - confirm and
# regenerate the mapping if the download fails.
DATA_SOURCE_MAPPING = 'dota2-first:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F4692887%2F7974583%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240329%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240329T145900Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D2b88bde848da9601749bbbe8864ba55938100a0382a65166d8dae20720c0c2a48e656d2acfd0f21a25e4fa1ebb63f7ff13f45b1c84639ef703a974df3ae3034937e92266b83f09aeacae0681eed34e42490b1ef4a465b4e622b7ab6ab07337fe0efe2a5c4ef0ea9475c9e777a8ba5cf74d833679b04bb9bec6d412d7234d15db6413fa026d5fadb5e50e58a34502981c1cfb93b204ead0608212c895feef1bcaff8122dba91ba7e77a1ef1adc88bf6fe62e13f479e96e63cf222828b3112c9729a3a150dc301a4d6d0cb7b314cf3bec91d1da5403e69604c96a3567d7e7c36fe9ff1bcdb68cec000bada398527995b52260e6e669a5c99ae94ad425f312fbbe3'

# Directories created by this cell; they are also the targets of the
# ../input and ../working symlinks made below.
KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
# Not referenced anywhere in the visible code.
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

# Download every data source listed in DATA_SOURCE_MAPPING and unpack it into
# KAGGLE_INPUT_PATH/<directory>. A source that fails to download is reported
# and skipped so the remaining sources are still attempted.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # Split on the first ':' only, so a stray colon in the URL part cannot
    # break the two-value unpacking.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            # Content-Length is optional; without it the progress bar stays
            # empty instead of raising on int(None) or dividing by zero.
            content_length = fileres.headers['content-length']
            total_length = int(content_length) if content_length else 0
            print(f'Downloading {directory}, {content_length} bytes compressed')
            dl = 0
            while True:
                data = fileres.read(CHUNK_SIZE)
                if not data:
                    break
                dl += len(data)
                tfile.write(data)
                done = min(50, int(50 * dl / total_length)) if total_length > 0 else 0
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
            # The tar branch reopens the temp file by name, so buffered bytes
            # must reach the disk before the archive is read.
            tfile.flush()
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Bind the archive to its own name; reusing `tarfile` here
                # would overwrite the imported module.
                with tarfile.open(tfile.name) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError:
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')


In [None]:
!pip install -U lightautoml

Collecting lightautoml
  Downloading lightautoml-0.3.8.1-py3-none-any.whl.metadata (16 kB)
Collecting autowoe>=1.2 (from lightautoml)
  Downloading AutoWoE-1.3.2-py3-none-any.whl.metadata (2.8 kB)
Collecting cmaes (from lightautoml)
  Downloading cmaes-0.10.0-py3-none-any.whl.metadata (19 kB)
Collecting joblib<1.3.0 (from lightautoml)
  Downloading joblib-1.2.0-py3-none-any.whl.metadata (5.3 kB)
Collecting json2html (from lightautoml)
  Downloading json2html-1.3.0.tar.gz (7.0 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting lightgbm<=3.2.1,>=2.3 (from lightautoml)
  Downloading lightgbm-3.2.1-py3-none-manylinux1_x86_64.whl.metadata (14 kB)
Collecting pandas<2.0.0 (from lightautoml)
  Downloading pandas-1.5.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Collecting poetry-core<2.0.0,>=1.0.0 (from lightautoml)
  Downloading poetry_core-1.9.0-py3-none-any.whl.metadata (3.5 kB)
Collecting statsmodels<=0.14.0 (from lightautoml)
  Downloading 

In [None]:
# Standard python libraries
import os
import requests

# Essential DS libraries
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import train_test_split
import torch

# LightAutoML presets, task and report generation
from lightautoml.automl.presets.tabular_presets import TabularAutoML, TabularUtilizedAutoML
from lightautoml.tasks import Task
from lightautoml.report.report_deco import ReportDeco

In [None]:
# Reproducibility and data-split settings: the seed is shared by NumPy, the
# train/test split and the AutoML reader; TEST_SIZE is the hold-out fraction;
# N_FOLDS is passed to the reader as its 'cv' value.
RANDOM_STATE = 42
TEST_SIZE = 0.2
N_FOLDS = 5

# Resource limits handed to torch and to the AutoML preset (timeout in seconds).
N_THREADS = 4
TIMEOUT = 5 * 60

# Name of the column used as the prediction target and for stratification.
TARGET_NAME = '1'

In [None]:
# Seed both NumPy's and PyTorch's global RNGs with the same value so that
# neither library is a source of run-to-run variation, then cap the number of
# threads torch may use.
np.random.seed(RANDOM_STATE)
torch.manual_seed(RANDOM_STATE)
torch.set_num_threads(N_THREADS)

In [None]:
# Load the match table; the file is presumably unpacked into this directory
# by the data-source import cell (it extracts into /kaggle/input/dota2-first).
df = pd.read_csv('/kaggle/input/dota2-first/matches.csv')

In [None]:
# Preview the first rows to inspect the column names and value types.
df.head()

Unnamed: 0.1,Unnamed: 0,0,1,2,3,4,5,6,7,8,...,71,72,73,74,75,76,77,78,79,80
0,0,408160.1,1,2589,7.32e,Infinity Esports,Lava Esports,Smoker,Lumpy,hermit,...,0.69,-1.11,0.11,1.01,-0.61,0.73,-1.43,0.33,1.36,-0.03
1,1,408160.0,0,2808,7.32e,Infinity Esports,Lava Esports,Kiri,Leostyle,Frank,...,0.69,-0.77,-1.18,1.56,-0.58,-0.62,2.66,-0.8,-3.4,-0.03
2,2,408161.1,0,2414,7.32e,Infinity Esports,noMERCY,Michael,Mr.Jeans,YadomiJN,...,-0.94,-0.49,0.71,-0.65,-1.25,2.85,-0.89,-0.47,-0.15,-1.36
3,3,408161.0,0,2130,7.32e,Infinity Esports,noMERCY,Mr.Jeans,elmisho,Tiburoncin,...,0.52,-0.37,0.28,-1.27,-0.13,1.14,1.03,0.95,-0.09,1.25
4,4,408162.1,1,3152,7.32e,Infinity Esports,Keyd Stars,fcr,4nalog,Costabile,...,1.07,-0.4,-0.97,0.74,1.01,-0.06,-0.1,0.23,-0.41,1.53


In [None]:
# Remove the 'Unnamed: 0' and '0' columns so they are not fed to the model.
# `columns=` already selects the axis, so the redundant `axis=1` is dropped.
# errors='ignore' keeps the cell re-runnable: `df` is overwritten in place, so
# a second execution would otherwise raise KeyError on the missing columns.
df = df.drop(columns=['Unnamed: 0', '0'], errors='ignore')

In [None]:
# Hold out a TEST_SIZE share of the rows for the final check. The split is
# stratified on the target column and seeded so it is repeatable.
train_data, test_data = train_test_split(
    df,
    stratify=df[TARGET_NAME],
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
)

In [None]:
# Column-role mapping passed to AutoML: only the target column is declared.
roles = {'target': TARGET_NAME}

In [None]:
# Use the 'binary' task type; the target column shows 0/1 values in the
# data preview.
task = Task('binary')

In [None]:
# Configure the tabular AutoML preset with the time and CPU budgets from the
# config cell; the reader gets the same thread count, fold count and seed.
automl = TabularAutoML(
    task=task,
    timeout=TIMEOUT,
    cpu_limit=N_THREADS,
    reader_params={
        'n_jobs': N_THREADS,
        'cv': N_FOLDS,
        'random_state': RANDOM_STATE,
    },
)

In [None]:
# Train on the training split; the returned predictions for those same rows
# are kept for the out-of-fold metrics computed later.
out_of_fold_predictions = automl.fit_predict(
    train_data,
    roles=roles,
    verbose=1,
)

[13:54:08] Stdout logging level is INFO.
[13:54:08] Copying TaskTimer may affect the parent PipelineTimer, so copy will create new unlimited TaskTimer
[13:54:08] Task: binary

[13:54:08] Start automl preset with listed constraints:
[13:54:08] - time: 300.00 seconds
[13:54:08] - CPU: 4 cores
[13:54:08] - memory: 16 GB

[13:54:08] [1mTrain data shape: (5737, 80)[0m



  pid = os.fork()


[13:54:16] Layer [1m1[0m train process start. Time left 291.91 secs
[13:54:17] Start fitting [1mLvl_0_Pipe_0_Mod_0_LinearL2[0m ...
[13:54:18] Fitting [1mLvl_0_Pipe_0_Mod_0_LinearL2[0m finished. score = [1m0.6557128299281794[0m
[13:54:18] [1mLvl_0_Pipe_0_Mod_0_LinearL2[0m fitting and predicting completed
[13:54:18] Time left 289.73 secs

[13:54:25] [1mSelector_LightGBM[0m fitting and predicting completed
[13:54:26] Start fitting [1mLvl_0_Pipe_1_Mod_0_LightGBM[0m ...
[13:54:46] Fitting [1mLvl_0_Pipe_1_Mod_0_LightGBM[0m finished. score = [1m0.668505380755899[0m
[13:54:46] [1mLvl_0_Pipe_1_Mod_0_LightGBM[0m fitting and predicting completed
[13:54:46] Start hyperparameters optimization for [1mLvl_0_Pipe_1_Mod_1_Tuned_LightGBM[0m ... Time budget is 1.00 secs
[13:54:56] Hyperparameters optimization for [1mLvl_0_Pipe_1_Mod_1_Tuned_LightGBM[0m completed
[13:54:56] Start fitting [1mLvl_0_Pipe_1_Mod_1_Tuned_LightGBM[0m ...
[13:55:20] Fitting [1mLvl_0_Pipe_1_Mod_1_Tuned_L

In [None]:
%%time

# Score the held-out rows with the fitted model, then print the prediction
# object together with its shape. The cell magic above reports the run time.
test_predictions = automl.predict(test_data)
print(f'Prediction for test_data:\n{test_predictions}\nShape = {test_predictions.shape}')

Prediction for test_data:
array([[0.4220273 ],
       [0.6604965 ],
       [0.53309584],
       ...,
       [0.57693666],
       [0.3599443 ],
       [0.43167877]], dtype=float32)
Shape = (1435, 1)
CPU times: user 1.54 s, sys: 3.84 ms, total: 1.54 s
Wall time: 835 ms


In [None]:
# Echo the prediction object as the cell's output value.
test_predictions

array([[0.4220273 ],
       [0.6604965 ],
       [0.53309584],
       ...,
       [0.57693666],
       [0.3599443 ],
       [0.43167877]], dtype=float32)

In [None]:
# Pull the raw score arrays out of the prediction objects.
predictions_oof_array = out_of_fold_predictions.data
predictions_test_array = test_predictions.data

# Build arrays of 0/1 labels: 1 where the score is strictly above 0.5, else 0.
targets_oof = (predictions_oof_array > 0.5).astype(int)
targets_test = (predictions_test_array > 0.5).astype(int)

In [None]:
# Collapse both label arrays to one dimension so they line up with the 1-D
# target vectors used by the metric functions.
targets_oof, targets_test = (
    labels.flatten() for labels in (targets_oof, targets_test)
)

In [None]:
# ROC AUC is computed from the continuous scores in the first prediction
# column, for the training rows and for the hold-out rows.
oof_scores = out_of_fold_predictions.data[:, 0]
holdout_scores = test_predictions.data[:, 0]
print(f'OOF score: {roc_auc_score(train_data[TARGET_NAME].values, oof_scores)}')
print(f'HOLDOUT score: {roc_auc_score(test_data[TARGET_NAME].values, holdout_scores)}')

OOF score: 0.6761563542106743
HOLDOUT score: 0.6322245055227331


In [None]:
# Accuracy of the 0.5-thresholded labels against the true targets. The labels
# name the metric so these lines cannot be mistaken for the ROC AUC figures
# printed by the previous cell.
print(f'OOF accuracy: {accuracy_score(train_data[TARGET_NAME].values, targets_oof)}')
print(f'HOLDOUT accuracy: {accuracy_score(test_data[TARGET_NAME].values, targets_test)}')

OOF score: 0.6301202719191215
HOLDOUT score: 0.5902439024390244
