In [1]:
import pandas as pd, wandb, warnings, optuna, joblib
warnings.filterwarnings('ignore')
from features.extractor import FeatureExtractor
from features.final_processing import CustomColumnTransformer
from tuning.optuna_tuning import OptunaTuner
from configs import utils
utils.login_wandb()
from sklearn.metrics import accuracy_score, classification_report
from xgboost import XGBClassifier
from sklearn.preprocessing import RobustScaler
from sklearn.pipeline import Pipeline

  from .autonotebook import tqdm as notebook_tqdm
wandb: Appending key for api.wandb.ai to your netrc file: /Users/grigoryturchenko/.netrc


In [2]:
try:
    customers, sales = joblib.load('customers.joblib'), joblib.load('sales.joblib')
except:
    customers, sales = pd.read_excel('ucy_eko_data.xlsx', sheet_name='smile_customers'), pd.read_excel('ucy_eko_data.xlsx', sheet_name='smile_sales')
    joblib.dump(customers, 'customers.joblib')
    joblib.dump(sales, 'sales.joblib')

In [3]:
fe = FeatureExtractor(sales=sales, customers=customers, target_month=3, perform_split=True, generation_type='continuous', filtering_set='sales', period=60, subperiod=30)
X_train, X_test, y_train, y_test = fe.transform()

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /Users/grigoryturchenko/.cache/huggingface/token
Login successful
Successfully logged out.


In [4]:
qty_cols = [col for col in X_train.columns if 'qty' in col]
col_transform = CustomColumnTransformer(
    cols_for_scaling=qty_cols,
    scaling_algo=RobustScaler(),
    cols_for_ohe=None,
    cols_for_winsor=None,
    cols_to_skip=None
)

In [5]:
pipe = Pipeline(
    [
        ('column_transformer', col_transform),
        ('xgb', XGBClassifier(predictor='cpu_predictor', seed=571))
    ]
)

In [6]:
pipe.fit(X_train, y_train)
train_preds = pipe.predict(X_train)
test_preds = pipe.predict(X_test)

print('Train data')
print(classification_report(y_train, train_preds))
print('Test data')
print(classification_report(y_test, test_preds))

Train data
              precision    recall  f1-score   support

           0       0.78      0.80      0.79     25669
           1       0.82      0.80      0.81     29752

    accuracy                           0.80     55421
   macro avg       0.80      0.80      0.80     55421
weighted avg       0.80      0.80      0.80     55421

Test data
              precision    recall  f1-score   support

           0       0.77      0.69      0.73      8603
           1       0.75      0.82      0.78      9871

    accuracy                           0.76     18474
   macro avg       0.76      0.75      0.75     18474
weighted avg       0.76      0.76      0.76     18474



In [9]:
X_train, X_test = col_transform.fit_transform(X_train, y_train), col_transform.fit_transform(X_test, y_test)

In [11]:
xgb_op = OptunaTuner(
    XGBClassifier, accuracy_score, 'maximize', # class-specific arguments
    seed=571, predictor='cpu_predictor', verbosity=0, nthread=7, # model-specific technical parameters
    objective='binary:hinge', eval_metric='error' # model-specific fixed hyperparameters
)
xgb_op.fit(
    100, X_train, y_train, X_test, y_test,
    ('n_estimators', 'int', 100, 1000, {'step': 50}),
    ('eta', 'float', 1e-3, 1e-1, {'log': True}),
    ('max_depth', 'int', 3, 20),
    ('subsample', 'float', 0.5, 1.0, {'step': 0.05}),
    ('colsample_bynode', 'float', 0.1, 1.0, {'step': 0.05}),
    # ('lambda', 'float', 0.0, 10.0, {'step': 0.05}),
    # ('alpha', 'float', 0.0, 10.0, {'step': 0.05})
)

In [12]:
xgb_op.model.get_params()

{'objective': 'binary:logistic',
 'use_label_encoder': None,
 'base_score': 0.5,
 'booster': 'gbtree',
 'callbacks': None,
 'colsample_bylevel': 1,
 'colsample_bynode': 0.35,
 'colsample_bytree': 1,
 'early_stopping_rounds': None,
 'enable_categorical': False,
 'eval_metric': None,
 'feature_types': None,
 'gamma': 0,
 'gpu_id': -1,
 'grow_policy': 'depthwise',
 'importance_type': None,
 'interaction_constraints': '',
 'learning_rate': 0.0561350323,
 'max_bin': 256,
 'max_cat_threshold': 64,
 'max_cat_to_onehot': 4,
 'max_delta_step': 0,
 'max_depth': 4,
 'max_leaves': 0,
 'min_child_weight': 1,
 'missing': nan,
 'monotone_constraints': '()',
 'n_estimators': 450,
 'n_jobs': 0,
 'num_parallel_tree': 1,
 'predictor': 'auto',
 'random_state': 0,
 'reg_alpha': 0,
 'reg_lambda': 1,
 'sampling_method': 'uniform',
 'scale_pos_weight': 1,
 'subsample': 0.8500000000000001,
 'tree_method': 'exact',
 'validate_parameters': 1,
 'verbosity': None,
 'eta': 0.05613503325172536}

In [13]:
optuna.visualization.plot_optimization_history(xgb_op.study)

In [14]:
train_preds = xgb_op.model.predict(X_train)
test_preds = xgb_op.model.predict(X_test)

print('Train data')
print(classification_report(y_train, train_preds))
print('Test data')
print(classification_report(y_test, test_preds))

Train data
              precision    recall  f1-score   support

           0       0.76      0.76      0.76     25669
           1       0.79      0.80      0.79     29752

    accuracy                           0.78     55421
   macro avg       0.78      0.78      0.78     55421
weighted avg       0.78      0.78      0.78     55421

Test data
              precision    recall  f1-score   support

           0       0.77      0.73      0.75      8603
           1       0.77      0.81      0.79      9871

    accuracy                           0.77     18474
   macro avg       0.77      0.77      0.77     18474
weighted avg       0.77      0.77      0.77     18474



In [16]:
config = xgb_op.model.get_params()
xgb = XGBClassifier(**config)

# Create w&b run for the training set
with utils.init_wandb_run(
    name='continuous_features_optuna_best_score',
    model=XGBClassifier,
    config=config,
    target_month=fe.target_month,
    group='parameters_tuning',
    job_type='tuning_train'
) as run:

    xgb.fit(X_train, y_train)
    train_preds = xgb.predict(X_train)

    rep = utils.parse_classification_report(
        classification_report(y_train, train_preds, output_dict=True)
    )

    metadata = {
        'experiment': {
            'name': run.name,
        },
        'performance_report': rep,
        'config': config
    }

    artifact = wandb.Artifact(
        name=f'report_train',
        type='performance_metric',
        metadata=metadata
    )
    run.log_artifact(artifact)
    run.finish()

# Create w&b run for the test set
with utils.init_wandb_run(
    name='continuous_features_optuna_best_score',
    model=XGBClassifier,
    config=config,
    target_month=fe.target_month,
    group='parameters_tuning',
    job_type='tuning_test'
) as run:
    test_preds = xgb.predict(X_test)
    rep = utils.parse_classification_report(
        classification_report(y_test, test_preds, output_dict=True)
    )

    metadata = {
        'experiment': {
            'name': run.name,
        },
        'performance_report': rep,
        'config': config
    }

    artifact = wandb.Artifact(
        name=f'report_test',
        type='performance_metric',
        metadata=metadata
    )
    run.log_artifact(artifact)
    run.finish()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[34m[1mwandb[0m: Currently logged in as: [33mgaturchenko[0m ([33mkpmg-capstone[0m). Use [1m`wandb login --relogin`[0m to force relogin


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
