In [1]:
# Standard python libraries
import os
import time

# Essential DS libraries
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error, roc_auc_score
from sklearn.model_selection import train_test_split
from scipy.stats import rankdata
import torch

# LightAutoML presets, task and report generation
from lightautoml.automl.base import AutoML
from lightautoml.automl.blend import WeightedBlender
from lightautoml.ml_algo.boost_lgbm import BoostLGBM
from lightautoml.ml_algo.linear_sklearn import LinearLBFGS
from lightautoml.ml_algo.tuning.optuna import OptunaTuner
from lightautoml.pipelines.features.lgb_pipeline import LGBSimpleFeatures, LGBAdvancedPipeline
from lightautoml.pipelines.features.linear_pipeline import LinearFeatures
from lightautoml.pipelines.ml.base import MLPipeline
from lightautoml.pipelines.selection.importance_based import ModelBasedImportanceEstimator, ImportanceCutoffSelector
from lightautoml.reader.base import PandasToPandasReader
from lightautoml.tasks import Task

# Everything for graphs
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

  from .autonotebook import tqdm as notebook_tqdm


# Step 0.2. Constants

Here we setup the constants to use in the kernel:
- `N_THREADS` - number of vCPUs for LightAutoML model creation
- `N_FOLDS` - number of folds in LightAutoML inner CV
- `RANDOM_STATE` - random seed for better reproducibility
- `TEST_SIZE` - houldout data part size 
- `TIMEOUT` - limit in seconds for model to train
- `TARGET_NAME` - target column name in dataset

In [2]:
N_THREADS = 1
N_FOLDS = 5
RANDOM_STATE = 42
TEST_SIZE = 0.1
TIMEOUT = 6*3600
TARGET_NAME = 'Рейтинг'

In [3]:
np.random.seed(RANDOM_STATE)
torch.set_num_threads(N_THREADS)

In [4]:
%%time

df = pd.read_csv('bo_rates.csv')
rates = pd.read_csv('na_ratings.csv')
df = df.apply(pd.to_numeric, errors='coerce').astype('Int64')
rates['ИНН'] = rates['ИНН'].astype('Int64')
df = pd.merge(df, rates[['ИНН', 'Рейтинг']], on='ИНН')

CPU times: total: 62.5 ms
Wall time: 166 ms


In [5]:
df['Рейтинг'] = df['Рейтинг'].str.split('[|]|.esg|.am').str[0].str.strip()
df['Рейтинг'] = df['Рейтинг'].str.replace('-', '–').str.replace('А', 'A').str.replace('В', 'B').str.replace('Независимое мнение НРA', '—').str.replace('1', '+').str.replace('2', '+')
df['Рейтинг'].nunique()

19

# regression

In [6]:
from lightautoml.automl.base import AutoML
from lightautoml.ml_algo.boost_lgbm import BoostLGBM
from lightautoml.ml_algo.tuning.optuna import OptunaTuner
from lightautoml.pipelines.features.lgb_pipeline import LGBSimpleFeatures
from lightautoml.pipelines.ml.base import MLPipeline
from lightautoml.reader.base import PandasToPandasReader
from lightautoml.tasks import Task

In [7]:
rating_to_number = {
    'AAA': 100,
    'AA+': 90,
    'AA': 85,
    'AA–': 80,
    'A+': 75,
    'A': 70,
    'A–': 65,
    'BBB+': 60,
    'BBB': 55,
    'BBB–': 50,
    'BB+': 45,
    'BB': 40,
    'BB–': 35,
    'B+': 30,
    'B': 25,
    'B–': 20,
    'CCC': 15,
    'C': 10,
    '—': 0  # Отсутствующий или не определённый рейтинг
}

df['Рейтинг'] = df['Рейтинг'].map(rating_to_number)

In [8]:
df['Рейтинг'].mean()

62.14456188203205

In [9]:
for task_params, target in zip(
    [
        {"name": "reg", "loss": "mse", "metric": "r2"},
        {"name": "reg", "loss": "rmsle", "metric": "rmsle"},
        {
            "name": "reg",
            "loss": "quantile",
            "loss_params": {"q": 0.9},
            "metric": "quantile",
            "metric_params": {"q": 0.9},
        },
    ],
    ["num_sold", "num_sold", "num_sold"],
):
    task = Task(**task_params)
    print("Task created")

    reader = PandasToPandasReader(task, cv=5, random_state=1)
    print("Reader created")

    # pipeline 1
    pipe = LGBSimpleFeatures()

    params_tuner1 = OptunaTuner(n_trials=20, timeout=30)
    model1 = BoostLGBM(
        default_params={'learning_rate': 0.05, 'num_leaves': 128,
                        'seed': 1, 'num_threads': N_THREADS}
    )
    model2 = BoostLGBM(
        default_params={'learning_rate': 0.04, 'num_leaves': 64,
                        'seed': 2, 'max_depth': 8, 'num_threads': N_THREADS}
    )
    pipeline_lvl1 = MLPipeline([
        (model1, params_tuner1),
        model2],
        pre_selection=None, 
        features_pipeline=pipe,
        post_selection=None
    )
    print("Pipeline1 created")
    #pipeline2
    pipe1 = LGBSimpleFeatures()
    model = BoostLGBM(
        default_params={'learning_rate': 0.05, 'num_leaves': 64,
                        'max_bin': 1024, 'seed': 3, 'max_depth': 8, 'num_threads': N_THREADS},
        freeze_defaults=True
    )
    pipeline_lvl2 = MLPipeline(
        [model], 
        pre_selection=None, 
        features_pipeline=pipe1,
        post_selection=None
    )
    print("Pipeline2 created")
\
    automl = AutoML(reader, [
        [pipeline_lvl1],
        [pipeline_lvl2],], 
        skip_conn=False)

Task created
Reader created
Pipeline1 created
Pipeline2 created
Task created
Reader created
Pipeline1 created
Pipeline2 created
Task created
Reader created
Pipeline1 created
Pipeline2 created


In [10]:
train_data, test_data = train_test_split(df, 
                                    test_size=TEST_SIZE, 
                                    stratify=df['Рейтинг'], 
                                    random_state=RANDOM_STATE)
print('Data splitted. Parts sizes: tr_data = {}, te_data = {}'.format(train_data.shape, test_data.shape))

Data splitted. Parts sizes: tr_data = (7919, 70), te_data = (880, 70)


In [11]:
start_time = time.time()
oof_pred = automl.fit_predict(train_data, roles={"target": 'Рейтинг'}, verbose=2)
print("AutoML pipeline fitted and predicted. Time = {:.3f} sec".format(time.time() - start_time))

[07:45:16] [1mTrain data shape: (7919, 70)[0m

[07:45:24] Layer [1m1[0m train process start. Time left 9999999991.47 secs
[07:45:24] Start hyperparameters optimization for [1mLvl_0_Pipe_0_Mod_0_LightGBM[0m ... Time budget is 30.00 secs
[07:45:24] Copying TaskTimer may affect the parent PipelineTimer, so copy will create new unlimited TaskTimer
[07:45:25] [1mLvl_0_Pipe_0_Mod_0_LightGBM[0m fitting and predicting completed
[07:45:25] [1mLvl_0_Pipe_0_Mod_0_LightGBM[0m fitting and predicting completed
[07:45:25] [1mLvl_0_Pipe_0_Mod_0_LightGBM[0m fitting and predicting completed
[07:45:25] [1mLvl_0_Pipe_0_Mod_0_LightGBM[0m fitting and predicting completed
[07:45:25] [1mLvl_0_Pipe_0_Mod_0_LightGBM[0m fitting and predicting completed
[07:45:25] [1mLvl_0_Pipe_0_Mod_0_LightGBM[0m fitting and predicting completed
[07:45:25] [1mLvl_0_Pipe_0_Mod_0_LightGBM[0m fitting and predicting completed
[07:45:25] [1mLvl_0_Pipe_0_Mod_0_LightGBM[0m fitting and predicting completed
[07:45:2

In [12]:
test_pred = automl.predict(test_data.drop(columns='Рейтинг'))
print("Check scores...")
print("OOF score: {}".format(task.metric_func(train_data[TARGET_NAME].values, oof_pred.data[:, 0])))

Check scores...
OOF score: 34.070779138780146


In [13]:
len(test_pred)

880

In [14]:
res = pd.DataFrame({
    'pred': test_pred.data[:, 0],
    'true': test_data[TARGET_NAME].tolist()
})
res.sample(10)

Unnamed: 0,pred,true
331,100.0,40
247,100.0,85
790,100.0,70
316,100.0,0
215,100.0,80
680,100.0,60
585,100.0,75
462,100.0,65
815,100.0,50
39,100.0,100


# classifier

In [6]:
%%time
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

label_encoder = LabelEncoder()
df = df.apply(LabelEncoder().fit_transform)
df['Рейтинг'] = df['Рейтинг'] + 1
train_data, test_data = train_test_split(df, 
                                    test_size=TEST_SIZE, 
                                    stratify=df[TARGET_NAME], 
                                    random_state=RANDOM_STATE)
print('Data splitted. Parts sizes: tr_data = {}, te_data = {}'.format(train_data.shape, test_data.shape))

Data splitted. Parts sizes: tr_data = (7919, 70), te_data = (880, 70)
CPU times: total: 0 ns
Wall time: 54.8 ms


In [7]:
feat_sel_0 = LGBSimpleFeatures()
mod_sel_0 = BoostLGBM()
imp_sel_0 = ModelBasedImportanceEstimator()
selector_0 = ImportanceCutoffSelector(feat_sel_0, mod_sel_0, imp_sel_0, cutoff=0, )

feats_gbm_0 = LGBAdvancedPipeline(top_intersections=4, 
                                  output_categories=True, 
                                  feats_imp=imp_sel_0)
gbm_0 = BoostLGBM()
gbm_1 = BoostLGBM()

tuner_0 = OptunaTuner(n_trials=20, timeout=30, fit_on_holdout=True)
gbm_lvl0 = MLPipeline([
        (gbm_0, tuner_0),
        gbm_1
    ],
    pre_selection=selector_0,
    features_pipeline=feats_gbm_0, 
    post_selection=None
)

Copying TaskTimer may affect the parent PipelineTimer, so copy will create new unlimited TaskTimer
CPU times: total: 0 ns
Wall time: 1.78 ms


In [8]:
feats_reg_0 = LinearFeatures(output_categories=True, 
                             sparse_ohe='auto')

reg_0 = LinearLBFGS()

reg_lvl0 = MLPipeline([
        reg_0
    ],
    pre_selection=None,
    features_pipeline=feats_reg_0, 
    post_selection=None
)

In [9]:
task = Task('multiclass', metric = 'crossentropy', ) 
reader = PandasToPandasReader(task = task, samples = None, max_nan_rate = 1, max_constant_rate = 1,
                              advanced_roles = True, drop_score_co = -1, n_jobs = 4)
blender = WeightedBlender()
automl = AutoML(reader=reader, levels=[
    [gbm_lvl0, reg_lvl0]
], blender=blender, skip_conn=False)

In [10]:
oof_pred = automl.fit_predict(train_data, roles={'target': TARGET_NAME}, verbose = 1)
print('oof_pred:\n{}\nShape = {}'.format(oof_pred[:10], oof_pred.shape))

[17:01:27] [1mTrain data shape: (7919, 70)[0m

[17:01:37] Feats was rejected during automatic roles guess: []
[17:01:37] Layer [1m1[0m train process start. Time left 9999999989.32 secs
[17:01:38] Training until validation scores don't improve for 100 rounds
[17:01:49] [1mLightGBM[0m fitting and predicting completed
[17:01:49] Start hyperparameters optimization for [1mLvl_0_Pipe_0_Mod_0_LightGBM[0m ... Time budget is 30.00 secs
[17:01:49] Training until validation scores don't improve for 100 rounds
[17:02:02] [1mLvl_0_Pipe_0_Mod_0_LightGBM[0m fitting and predicting completed
[17:02:02] [1mTrial 1[0m with hyperparameters {'feature_fraction': 0.6872700594236812, 'num_leaves': 244} scored -0.6029024442634559 in 0:00:12.309111
[17:02:02] Training until validation scores don't improve for 100 rounds
[17:02:14] [1mLvl_0_Pipe_0_Mod_0_LightGBM[0m fitting and predicting completed
[17:02:14] [1mTrial 2[0m with hyperparameters {'feature_fraction': 0.8659969709057025, 'num_leaves':

In [14]:
from sklearn.metrics import log_loss

In [15]:
%%time

test_pred = automl.predict(test_data)

print('Check scores...')
print('OOF score: {}'.format(log_loss(train_data[TARGET_NAME].values, oof_pred.data)))
print('TEST score: {}'.format(log_loss(test_data[TARGET_NAME].values, test_pred.data)))

Check scores...
OOF score: 8.76467546257399
TEST score: 8.79726533673589
CPU times: total: 13.8 s
Wall time: 8.39 s


In [29]:
import statsmodels.tsa.stattools as ts
import warnings
warnings.filterwarnings('ignore')

results = {}
for column in df.columns:
    if df[column].nunique() > 1:  # Пропустить константные столбцы
        try:
            results[column] = test_stationarity(df[column])
        except Exception as e:
            pass
        # print(f"Column {column} is constant and will be skipped.")

# Вывод результатов
for column, result in results.items():
    print(f'Результаты для столбца {column}:')
    print(f'ADF Statistic: {result["ADF Statistic"]}')
    print(f'ADF p-value: {result["ADF p-value"]}')
    print(f'ADF Critical Values: {result["ADF Critical Values"]}')
    print(f'KPSS Statistic: {result["KPSS Statistic"]}')
    print(f'KPSS p-value: {result["KPSS p-value"]}')
    print(f'KPSS Critical Values: {result["KPSS Critical Values"]}')
    print()

Результаты для столбца ИНН:
ADF Statistic: -30.981897108619435
ADF p-value: 0.0
ADF Critical Values: {'1%': -3.4378113191216397, '5%': -2.8648337072350074, '10%': -2.5685238062308366}
KPSS Statistic: 0.22807413602933882
KPSS p-value: 0.1
KPSS Critical Values: {'10%': 0.347, '5%': 0.463, '2.5%': 0.574, '1%': 0.739}

Результаты для столбца Выручка2023:
ADF Statistic: -29.58019581639882
ADF p-value: 0.0
ADF Critical Values: {'1%': -3.4378113191216397, '5%': -2.8648337072350074, '10%': -2.5685238062308366}
KPSS Statistic: 0.18487151855705636
KPSS p-value: 0.1
KPSS Critical Values: {'10%': 0.347, '5%': 0.463, '2.5%': 0.574, '1%': 0.739}

Результаты для столбца Выручка2022:
ADF Statistic: -29.550926125922786
ADF p-value: 0.0
ADF Critical Values: {'1%': -3.4378113191216397, '5%': -2.8648337072350074, '10%': -2.5685238062308366}
KPSS Statistic: 0.12255339090425471
KPSS p-value: 0.1
KPSS Critical Values: {'10%': 0.347, '5%': 0.463, '2.5%': 0.574, '1%': 0.739}

Результаты для столбца Себестоимос

In [23]:
print(automl.create_model_str_desc())

AttributeError: 'AutoML' object has no attribute 'create_model_str_desc'

In [22]:
fast_fi = automl.get_feature_scores('fast')
fast_fi.set_index('Feature')['Importance'].plot.bar(figsize = (30, 10), grid = True)

AttributeError: 'AutoML' object has no attribute 'get_feature_scores'

In [18]:
for dat, df, name in zip([oof_pred, test_pred], [train_data, test_data], ['train', 'test']):
    print('Check aucs {0}...'.format(name))
    for cl in range(1, 21):
        sc = roc_auc_score((df[TARGET_NAME].values == cl).astype(np.float32), dat.data[:, cl])
        print('Class {0} {1} auc score: {2}'.format(cl, name, sc))

Check aucs train...
Class 1 train auc score: 0.9757588386821228
Class 2 train auc score: 0.46633907871180197
Class 3 train auc score: 0.5097782467772134
Class 4 train auc score: 0.32090542570496805
Class 5 train auc score: 0.4156428008726215
Class 6 train auc score: 0.34357686251799546
Class 7 train auc score: 0.4649804163182678
Class 8 train auc score: 0.5229800772191633
Class 9 train auc score: 0.5410695767784214
Class 10 train auc score: 0.6197792124980073
Class 11 train auc score: 0.4833727352259598
Class 12 train auc score: 0.618421320572214
Class 13 train auc score: 0.9889034418642875
Class 14 train auc score: 0.5074059111511815
Class 15 train auc score: 0.5039439953569018
Class 16 train auc score: 0.5852892582063804
Class 17 train auc score: 0.4754505057029379
Class 18 train auc score: 0.6931322194429612
Class 19 train auc score: 0.6614709594694769
Class 20 train auc score: 0.49150559511698877
Check aucs test...
Class 1 test auc score: 0.9825379680544646
Class 2 test auc score: 

In [21]:
print(automl.create_model_str_desc())

AttributeError: 'AutoML' object has no attribute 'create_model_str_desc'

In [33]:
df['Рейтинг'].unique()

array([17,  8,  9,  4,  6, 15,  1,  7,  2, 18,  3, 12, 16, 21, 14, 10, 20,
        5, 13, 11, 19])