In [1]:
import pandas as pd
import numpy as np
import wandb
import lightgbm as lgb

from features.extractor import FeatureExtractor
from features.final_processing import CustomColumnTransformer
from configs import utils
utils.login_wandb()
from sklearn.metrics import classification_report
from sklearn.preprocessing import RobustScaler, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.base import BaseEstimator, TransformerMixin
from scipy.stats.mstats import winsorize
from typing import List

  @numba.jit()
  @numba.jit()
  @numba.jit()
  @numba.jit()
wandb: Appending key for api.wandb.ai to your netrc file: /Users/fedorturchenko/.netrc


In [2]:
# Parse the workbook once and pull both sheets out of the returned dict,
# instead of opening and parsing the same .xlsx file twice.
sheets = pd.read_excel(
    'ucy_eko_data.xlsx',
    sheet_name=['smile_customers', 'smile_sales']
)
customers, sales = sheets['smile_customers'], sheets['smile_sales']

In [3]:
# Feature extractor for target month 3, created with `perform_split=False`;
# its `transform()` result is unpacked into one (X, y) pair rather than
# separate train/test parts.
fe = FeatureExtractor(
    sales=sales,
    customers=customers,
    target_month=3,
    perform_split=False,
)
X, y = fe.transform()

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /Users/fedorturchenko/.cache/huggingface/token
Login successful
Successfully logged out.


TypeError: Cannot setitem on a Categorical with a new category (0), set the categories first

In [5]:
import pickle
def extract_clustering_feature(df_customer_level: pd.DataFrame) -> pd.DataFrame:
    '''
    Assign every customer to a segment and store it in a `segments` column.

    The segment is predicted by a pre-trained clustering model from three
    customer-level variables: `monetary`, `recency` and
    `average_days_between_visits`. Before prediction `monetary` is capped at
    the maximum of a stored winsorization result, and all three variables are
    standardized.

    Parameters
    ----------
    df_customer_level : pd.DataFrame
        Customer-level frame containing at least the columns `monetary`,
        `recency` and `average_days_between_visits`.

    Returns
    -------
    pd.DataFrame
        The same object that was passed in, with a categorical `segments`
        column added (or overwritten). The frame is modified in place; the
        three input columns themselves are left untouched.

    Raises
    ------
    FileNotFoundError
        If either pickle file under `./features/` is missing.
    KeyError
        If one of the required columns is absent.
    '''
    # NOTE(security): `pickle.load` can execute arbitrary code; only load
    # these artefacts when they come from a trusted source.
    # Load clustering model
    with open('./features/clustering_model.pkl', 'rb') as f:
        model = pickle.load(f)
    # Load the stored winsorization output; its maximum is the cap for `monetary`
    with open('./features/winsorizing_object_for_threshold.pkl', 'rb') as f:
        winsor = pickle.load(f)
    monetary_threshold = winsor.max()

    # Work on an explicit copy so the capping below neither touches the
    # caller's columns nor triggers pandas' SettingWithCopyWarning.
    X_clust = df_customer_level[['monetary', 'recency', 'average_days_between_visits']].copy()
    # Perform winsorization (upper tail only)
    X_clust['monetary'] = X_clust['monetary'].clip(upper=monetary_threshold)

    # NOTE(review): the scaler is re-fitted on whatever frame is passed in, not
    # on the data the clustering model was trained with — confirm this is intended.
    scaler = StandardScaler()
    labels = pd.Categorical(model.predict(scaler.fit_transform(X_clust)))
    # Map numeric cluster ids to readable names; ids missing from the mapping keep their value.
    df_customer_level['segments'] = labels.rename_categories(
        {0: 'frequent_drivers', 1: 'passerbys', 2: 'regular_drivers'}
    )
    return df_customer_level

In [7]:
# One pivot table per customer-level feature spec; every spec is unpacked as
# keyword arguments for `fe.pivot_table`.
pivot_tables = [
    fe.pivot_table(fe.sales, **feature)
    for feature in fe.customer_level_features
]
# Join the pivots side by side and move the index back into regular column(s).
df_customer_level = pd.concat(pivot_tables, axis=1).reset_index()

In [10]:
# Attach the segment labels, then display how many customers fall into each segment.
customers_segmented = extract_clustering_feature(df_customer_level)
customers_segmented['segments'].value_counts()

regular_drivers     16894
passerbys           12311
frequent_drivers      525
Name: segments, dtype: int64

In [None]:
# Baseline: robust scaling followed by a LightGBM classifier with a fixed seed.
baseline_steps = [
    ('scaling', RobustScaler()),
    ('lightgbm', lgb.LGBMClassifier(n_jobs=-1, random_state=1)),
]
pipe = Pipeline(baseline_steps)
# Full parameter snapshot of the pipeline, passed to the W&B runs as `config`.
config = pipe.get_params()

In [None]:
# Fit the baseline pipeline and log its training classification report to W&B.
# NOTE(review): `X_train` / `y_train` are not created by any earlier cell shown
# in this notebook (the extractor above yields `X, y`); make sure a split has
# been produced before running this cell.
# NOTE(review): later cells also pass `target_month=fe.target_month` to
# `init_wandb_run`; confirm whether this call should too.
with utils.init_wandb_run(
    name='robust_scaling_initial_run',
    model=lgb.LGBMClassifier,
    config=config,
    group='default_parameters',
    job_type='train'
) as run:
    pipe.fit(X_train, y_train)
    train_preds = pipe.predict(X_train)

    # The parser lives in `configs.utils`; the bare name
    # `parse_classification_report` is never defined or imported in this
    # notebook and would raise NameError.
    train_report = utils.parse_classification_report(
        classification_report(y_train, train_preds, output_dict=True)
    )

    metadata = {
        'experiment': {
            'name': run.name,
        },
        'classification_report': train_report,
        'config': config
    }

    # The artifact carries no files; the report travels in its metadata.
    artifact = wandb.Artifact(
        name='train_classification_report',
        type='performance_report',
        metadata=metadata
    )
    run.log_artifact(artifact)
    run.finish()

In [None]:
# Evaluate the already fitted baseline pipeline on the test split and log the
# classification report to W&B.
# NOTE(review): `X_test` / `y_test` are not created by any earlier cell shown
# in this notebook; make sure a split has been produced before running this cell.
with utils.init_wandb_run(
    name='robust_scaling_initial_run',
    model=lgb.LGBMClassifier,
    config=config,
    group='default_parameters',
    job_type='test'
) as run:
    test_preds = pipe.predict(X_test)

    # The parser lives in `configs.utils`; the bare name
    # `parse_classification_report` is never defined or imported in this
    # notebook and would raise NameError.
    test_report = utils.parse_classification_report(
        classification_report(y_test, test_preds, output_dict=True)
    )

    metadata = {
        'experiment': {
            'name': run.name,
        },
        'classification_report': test_report,
        'config': config
    }

    # The artifact carries no files; the report travels in its metadata.
    artifact = wandb.Artifact(
        name='test_classification_report',
        type='performance_report',
        metadata=metadata
    )
    run.log_artifact(artifact)
    run.finish()

In [None]:
# Hyper-parameter grid for the `lightgbm` pipeline step: 3 values for each of
# 4 parameters, i.e. 81 combinations.
param_grid = dict(
    lightgbm__max_depth=[10, 50, 100],
    lightgbm__num_leaves=[5, 7, 10],
    lightgbm__n_estimators=[100, 1000, 10000],
    lightgbm__learning_rate=[0.0001, 0.001, 0.1],
)

# Same robust-scaling + LightGBM pipeline as the baseline, rebuilt unfitted.
robust_steps = [
    ('scaling', RobustScaler()),
    ('lightgbm', lgb.LGBMClassifier(n_jobs=-1, random_state=1)),
]
pipe = Pipeline(robust_steps)
config = pipe.get_params()

# Exhaustive search scored on accuracy with 10-fold cross-validation.
search = GridSearchCV(
    pipe,
    param_grid=param_grid,
    scoring='accuracy',
    cv=10,
    verbose=10,
)

In [None]:
# Run the grid search on the training split (every grid combination is
# cross-validated, so this cell is expensive).
search.fit(X=X_train, y=y_train)

In [None]:
# Score the tuned search object on the training split and log the
# classification report together with the best parameters to W&B.
with utils.init_wandb_run(
    name='robust_scaling_tuning_for_better_training_fit_run',
    model=lgb.LGBMClassifier,
    config=config,
    group='parameters_tuning',
    job_type='tuning_train'
) as run:
    train_preds = search.predict(X_train)

    # The parser lives in `configs.utils`; the bare name
    # `parse_classification_report` is never defined or imported in this
    # notebook and would raise NameError.
    train_report = utils.parse_classification_report(
        classification_report(y_train, train_preds, output_dict=True)
    )

    metadata = {
        'experiment': {
            'name': run.name,
        },
        'classification_report': train_report,
        'best_params': search.best_params_,
        'config': config
    }

    # The artifact carries no files; the report travels in its metadata.
    artifact = wandb.Artifact(
        name='train_classification_report',
        type='performance_report',
        metadata=metadata
    )
    run.log_artifact(artifact)
    run.finish()

In [None]:
# Score the tuned search object on the test split and log the classification
# report together with the best parameters to W&B.
with utils.init_wandb_run(
    name='robust_scaling_tuning_for_better_training_fit_run',
    model=lgb.LGBMClassifier,
    config=config,
    group='parameters_tuning',
    job_type='tuning_test'
) as run:
    test_preds = search.predict(X_test)

    # The parser lives in `configs.utils`; the bare name
    # `parse_classification_report` is never defined or imported in this
    # notebook and would raise NameError.
    test_report = utils.parse_classification_report(
        classification_report(y_test, test_preds, output_dict=True)
    )

    metadata = {
        'experiment': {
            'name': run.name,
        },
        'classification_report': test_report,
        'best_params': search.best_params_,
        'config': config
    }

    # The artifact carries no files; the report travels in its metadata.
    artifact = wandb.Artifact(
        name='test_classification_report',
        type='performance_report',
        metadata=metadata
    )
    run.log_artifact(artifact)
    run.finish()

In [None]:
# Best parameters with the 'lightgbm__' step prefix removed from the names.
{
    name.replace('lightgbm__', ''): value
    for name, value in search.best_params_.items()
}

In [None]:
class CustomColsTransformer(BaseEstimator, TransformerMixin):
    '''
    Pipeline step that winsorizes the upper 1% of selected DataFrame columns.

    Parameters
    ----------
    cols_to_wins : List[str]
        Names of the columns to winsorize. Any iterable of column labels
        (e.g. a pandas Index) is accepted. Columns not listed are passed
        through unchanged.
    '''

    def __init__(self, cols_to_wins: List[str]) -> None:
        # Stored unmodified, as scikit-learn's `get_params` / `clone` expect.
        self.cols_to_wins = cols_to_wins

    def fit(self, X, y=None):
        '''No-op: nothing is learned from the data. Returns `self`.'''
        return self

    def transform(self, X, y=None):
        '''
        Return a copy of `X` with the columns in `cols_to_wins` winsorized.

        For each selected column the highest 1% of values are replaced by the
        largest remaining value (`limits=(0.0, 0.01)`); the lower tail is left
        as is. The input frame is not modified and its index is preserved.
        '''
        # NOTE(review): limits are recomputed from whatever frame is passed in,
        # so train and test data are capped at different thresholds.
        X_out = X.copy()
        for col in list(self.cols_to_wins):
            # Assign a plain ndarray so values are written positionally and
            # cannot be misaligned by the frame's index.
            X_out[col] = np.asarray(
                winsorize(X_out[col].to_numpy(), limits=(0.0, 0.01))
            )
        return X_out

In [None]:
# Rebuild the extractor for target month 2; here `transform()` is unpacked
# into train/test features and targets.
fe = FeatureExtractor(
    sales,
    target_month=2,
)
X_train, X_test, y_train, y_test = fe.transform()

In [None]:
# Hyper-parameter grid for the `lightgbm` pipeline step: 3 values for each of
# 4 parameters, i.e. 81 combinations.
param_grid = dict(
    lightgbm__max_depth=[10, 50, 100],
    lightgbm__num_leaves=[5, 7, 10],
    lightgbm__n_estimators=[100, 1000, 10000],
    lightgbm__learning_rate=[0.0001, 0.001, 0.1],
)

# Winsorize -> standardize -> LightGBM.
winsorized_steps = [
    ('winsorization', CustomColsTransformer(X_train.columns)),
    ('scaling', StandardScaler()),
    ('lightgbm', lgb.LGBMClassifier(n_jobs=-1, random_state=1)),
]
pipe = Pipeline(winsorized_steps)
config = pipe.get_params()

# Exhaustive search scored on accuracy with 5-fold cross-validation.
search = GridSearchCV(
    pipe,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    verbose=10,
)

In [None]:
# Run the grid search on the training split (every grid combination is
# cross-validated, so this cell is expensive).
search.fit(X=X_train, y=y_train)

In [None]:
# Score the tuned winsorized pipeline on the training split and log the
# classification report plus best parameters to W&B.
with utils.init_wandb_run(
    name='winsorized_tuning_for_better_training_fit_run',
    model=lgb.LGBMClassifier,
    config=config,
    target_month=fe.target_month,
    group='parameters_tuning',
    job_type='tuning_train',
) as run:
    train_preds = search.predict(X_train)

    raw_report = classification_report(y_train, train_preds, output_dict=True)
    train_report = utils.parse_classification_report(raw_report)

    metadata = dict(
        experiment=dict(name=run.name),
        classification_report=train_report,
        best_params=search.best_params_,
        config=config,
    )

    # The artifact carries no files; the report travels in its metadata.
    artifact = wandb.Artifact(
        name='train_classification_report',
        type='performance_report',
        metadata=metadata,
    )
    run.log_artifact(artifact)
    run.finish()

In [None]:
# Score the tuned winsorized pipeline on the test split and log the
# classification report plus best parameters to W&B.
with utils.init_wandb_run(
    name='winsorized_tuning_for_better_training_fit_run',
    model=lgb.LGBMClassifier,
    config=config,
    target_month=fe.target_month,
    group='parameters_tuning',
    job_type='tuning_test',
) as run:
    test_preds = search.predict(X_test)

    raw_report = classification_report(y_test, test_preds, output_dict=True)
    test_report = utils.parse_classification_report(raw_report)

    metadata = dict(
        experiment=dict(name=run.name),
        classification_report=test_report,
        best_params=search.best_params_,
        config=config,
    )

    # The artifact carries no files; the report travels in its metadata.
    artifact = wandb.Artifact(
        name='test_classification_report',
        type='performance_report',
        metadata=metadata,
    )
    run.log_artifact(artifact)
    run.finish()

In [None]:
# Strip the pipeline step prefix so the tuned values can be handed straight
# to the LightGBM constructor.
best_lgbm_params = {
    name.replace('lightgbm__', ''): value
    for name, value in search.best_params_.items()
}
# Tuned classifier, additionally using half of the columns per tree.
final_classifier = lgb.LGBMClassifier(
    n_jobs=-1,
    random_state=1,
    colsample_bytree=0.5,
    **best_lgbm_params
)

pipe = Pipeline(
    [
        ('winsorization', CustomColsTransformer(X_train.columns)),
        ('scaling', StandardScaler()),
        ('lightgbm', final_classifier),
    ]
)

# Refit on the training split, then predict both splits for the reports below.
pipe.fit(X_train, y_train)
test_preds = pipe.predict(X_test)
train_preds = pipe.predict(X_train)

In [None]:
# Plain-text classification report for the training split.
train_report_text = classification_report(y_train, train_preds)
print(train_report_text)

In [None]:
# Plain-text classification report for the test split.
test_report_text = classification_report(y_test, test_preds)
print(test_report_text)