In [None]:
# Third-party libraries.
import pandas as pd
import numpy as np
import wandb
import lightgbm as lgb
from sklearn.metrics import classification_report
from sklearn.preprocessing import RobustScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

# Project-local helpers.
from features.extractor import FeatureExtractor
from configs import utils

# Log in to Weights & Biases through the project helper once all imports are done.
utils.login_wandb()

wandb: Appending key for api.wandb.ai to your netrc file: /Users/fedorturchenko/.netrc


In [None]:
from typing import Literal, Union
def parse_classification_report(report: dict, labels: Union[list, tuple] = ('0', '1', 'macro avg')) -> dict:
    '''
    Extract required metrics from `sklearn.metrics.classification_report`
    and transform them into a `wandb.Artifact` friendly format

    report - `sklearn.metrics.classification_report(..., output_dict=True)`; report as dictionary
    labels - keys of `report` (class labels and/or averages) to extract; every
        entry is converted with `str` before the lookup. Defaults to
        `'0'`, `'1'` and `'macro avg'`

    Returns a dictionary holding the overall `'accuracy'` value and, under each
    of `'precision'`, `'recall'` and `'f1-score'`, a list of single-item
    dictionaries keyed `'<label>_<metric>'`

    Raises `TypeError` when `report` is a string (textual report) and
    `KeyError` when `'accuracy'` or a requested label/metric is missing
    '''
    # A plain-text report cannot be indexed by key; fail early with an actionable message.
    if isinstance(report, str):
        raise TypeError(
            'report must be a dictionary; call classification_report(..., output_dict=True)'
        )

    metrics = ('precision', 'recall', 'f1-score')

    # Key order (accuracy first, then one list per metric) is part of the output format.
    new_dict = {'accuracy': report['accuracy']}
    new_dict.update({metric: [] for metric in metrics})

    # Labels are looked up as strings so that e.g. the integer 1 addresses the key '1'.
    for k in map(str, labels):
        for metric in metrics:
            new_dict[metric].append({f'{k}_{metric}': report[k][metric]})

    return new_dict

In [None]:
# Read the `smile_sales` sheet of the workbook (path is relative to the working directory).
sales = pd.read_excel(
    io='ucy_eko_data.xlsx',
    sheet_name='smile_sales',
)

In [None]:
# Build the project feature extractor on the raw sales frame.
# NOTE(review): the meaning of `target_month=3` is defined by FeatureExtractor — confirm there.
fe = FeatureExtractor(
    sales,
    target_month=3,
)

# `transform()` yields four objects, unpacked here as train/test features and targets.
(X_train, X_test, y_train, y_test) = fe.transform()

In [None]:
# Baseline pipeline: robust scaling followed by a LightGBM classifier that keeps
# its default hyper-parameters (only parallelism and the seed are set).
pipe = Pipeline(steps=[
    ('scaling', RobustScaler()),
    ('lightgbm', lgb.LGBMClassifier(n_jobs=-1, random_state=1)),
])

# Snapshot of every pipeline parameter; reused as the W&B run config and in artifact metadata.
config = pipe.get_params()

In [None]:
# Baseline training run: fit the pipeline, score it on the training split and
# attach the parsed classification report to a W&B artifact as metadata.
with utils.init_wandb_run(
    name='robust_scaling_initial_run',
    model=lgb.LGBMClassifier,
    config=config,
    group='default_parameters',
    job_type='train',
) as run:
    pipe.fit(X_train, y_train)
    train_preds = pipe.predict(X_train)

    # Dictionary form of the report is required by `parse_classification_report`.
    raw_train_report = classification_report(y_train, train_preds, output_dict=True)
    train_report = parse_classification_report(raw_train_report)

    metadata = dict(
        experiment=dict(name=run.name),
        classification_report=train_report,
        config=config,
    )

    # The artifact carries no files; the metrics travel in its metadata.
    artifact = wandb.Artifact(
        name='train_classification_report',
        type='performance_report',
        metadata=metadata,
    )
    run.log_artifact(artifact)
    run.finish()

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.01673183888196945, max=1.0)…

[LightGBM] [Info] Number of positive: 29528, number of negative: 25893
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2711
[LightGBM] [Info] Number of data points in the train set: 55421, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.532794 -> initscore=0.131366
[LightGBM] [Info] Start training from score 0.131366


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

In [None]:
# Baseline evaluation run: score the pipeline on the held-out split and log the
# parsed report as artifact metadata. Assumes `pipe` was fitted by an earlier cell.
with utils.init_wandb_run(
    name='robust_scaling_initial_run',
    model=lgb.LGBMClassifier,
    config=config,
    group='default_parameters',
    job_type='test',
) as run:
    test_preds = pipe.predict(X_test)

    # Dictionary form of the report is required by `parse_classification_report`.
    raw_test_report = classification_report(y_test, test_preds, output_dict=True)
    test_report = parse_classification_report(raw_test_report)

    metadata = dict(
        experiment=dict(name=run.name),
        classification_report=test_report,
        config=config,
    )

    # The artifact carries no files; the metrics travel in its metadata.
    artifact = wandb.Artifact(
        name='test_classification_report',
        type='performance_report',
        metadata=metadata,
    )
    run.log_artifact(artifact)
    run.finish()

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

In [None]:
# Search space; the `lightgbm__` prefix routes each entry to the 'lightgbm' pipeline step.
# NOTE(review): with num_leaves <= 10 a tree cannot be deeper than 9 levels, so the
# max_depth values below probably never bind while tripling the candidate count — confirm.
param_grid = dict(
    lightgbm__max_depth=[10, 50, 100],
    lightgbm__num_leaves=[5, 7, 10],
    lightgbm__n_estimators=[100, 1000, 10000],
    lightgbm__learning_rate=[0.0001, 0.001, 0.1],
)

# Fresh, unfitted copy of the baseline pipeline to be tuned.
pipe = Pipeline(steps=[
    ('scaling', RobustScaler()),
    ('lightgbm', lgb.LGBMClassifier(n_jobs=-1, random_state=1)),
])

# Parameter snapshot taken before any tuning happens.
config = pipe.get_params()

# Exhaustive 10-fold search over the grid, ranked by accuracy; verbose=10 prints every fit.
search = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    scoring='accuracy',
    cv=10,
    verbose=10,
)

In [None]:
# Run the full grid search on the training split (81 candidates x 10 folds per the log below).
search.fit(X=X_train, y=y_train)

Fitting 10 folds for each of 81 candidates, totalling 810 fits
[CV 1/10; 1/81] START lightgbm__learning_rate=0.0001, lightgbm__max_depth=10, lightgbm__n_estimators=100, lightgbm__num_leaves=5
[LightGBM] [Info] Number of positive: 26575, number of negative: 23303
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2710
[LightGBM] [Info] Number of data points in the train set: 49878, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.532800 -> initscore=0.131389
[LightGBM] [Info] Start training from score 0.131389
[CV 1/10; 1/81] END lightgbm__learning_rate=0.0001, lightgbm__max_depth=10, lightgbm__n_estimators=100, lightgbm__num_leaves=5;, score=0.533 total time=   0.2s
[CV 2/10; 1/81] START lightgbm__learning_rate=0.0001, lightgbm__max_depth=10, lightgbm__n_estimators=100, lightgbm__num_leaves=5
[LightGBM] [Info] Number of positive: 26575, number of negative: 23

In [None]:
# Tuned-model training run: score the fitted grid search on the training split
# and log the parsed report together with the winning hyper-parameters.
# NOTE(review): `config` is the snapshot taken before the search, so it describes the
# un-tuned pipeline; the tuned values are only recorded under 'best_params' — confirm intent.
with utils.init_wandb_run(
    name='robust_scaling_tuning_for_better_training_fit_run',
    model=lgb.LGBMClassifier,
    config=config,
    group='parameters_tuning',
    job_type='tuning_train',
) as run:
    train_preds = search.predict(X_train)

    # Dictionary form of the report is required by `parse_classification_report`.
    raw_train_report = classification_report(y_train, train_preds, output_dict=True)
    train_report = parse_classification_report(raw_train_report)

    metadata = dict(
        experiment=dict(name=run.name),
        classification_report=train_report,
        best_params=search.best_params_,
        config=config,
    )

    # The artifact carries no files; the metrics travel in its metadata.
    artifact = wandb.Artifact(
        name='train_classification_report',
        type='performance_report',
        metadata=metadata,
    )
    run.log_artifact(artifact)
    run.finish()

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016686934733297677, max=1.0…

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

In [None]:
# Tuned-model evaluation run: score the fitted grid search on the held-out split
# and log the parsed report together with the winning hyper-parameters.
# NOTE(review): `config` is the snapshot taken before the search, so it describes the
# un-tuned pipeline; the tuned values are only recorded under 'best_params' — confirm intent.
with utils.init_wandb_run(
    name='robust_scaling_tuning_for_better_training_fit_run',
    model=lgb.LGBMClassifier,
    config=config,
    group='parameters_tuning',
    job_type='tuning_test',
) as run:
    test_preds = search.predict(X_test)

    # Dictionary form of the report is required by `parse_classification_report`.
    raw_test_report = classification_report(y_test, test_preds, output_dict=True)
    test_report = parse_classification_report(raw_test_report)

    metadata = dict(
        experiment=dict(name=run.name),
        classification_report=test_report,
        best_params=search.best_params_,
        config=config,
    )

    # The artifact carries no files; the metrics travel in its metadata.
    artifact = wandb.Artifact(
        name='test_classification_report',
        type='performance_report',
        metadata=metadata,
    )
    run.log_artifact(artifact)
    run.finish()

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.0167519284839121, max=1.0))…

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

In [None]:
# Show the winning hyper-parameters with the pipeline step prefix stripped from each key.
{
    name.replace('lightgbm__', ''): value
    for name, value in search.best_params_.items()
}

{'learning_rate': 0.1, 'max_depth': 10, 'n_estimators': 100, 'num_leaves': 7}