In [1]:
import pandas as pd, wandb, warnings, optuna, joblib
warnings.filterwarnings('ignore')
from features.extractor import FeatureExtractor
from features.final_processing import CustomColumnTransformer
from tuning.optuna_tuning import OptunaTuner
from configs import utils
utils.login_wandb()
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import RobustScaler
from sklearn.pipeline import Pipeline

  from .autonotebook import tqdm as notebook_tqdm
wandb: Appending key for api.wandb.ai to your netrc file: /Users/grigoryturchenko/.netrc


In [2]:
# Load the customers/sales frames from the joblib caches when they exist;
# on a cache miss, parse the Excel workbook and write the caches for next time.
try:
    customers, sales = joblib.load('customers.joblib'), joblib.load('sales.joblib')
except FileNotFoundError:
    # Only a missing cache file triggers the rebuild. Any other failure
    # (corrupted cache, interrupt, ...) is surfaced instead of being hidden.
    # Passing a list of sheet names parses the workbook once and returns a
    # dict of DataFrames keyed by sheet name.
    sheets = pd.read_excel('ucy_eko_data.xlsx', sheet_name=['smile_customers', 'smile_sales'])
    customers, sales = sheets['smile_customers'], sheets['smile_sales']
    joblib.dump(customers, 'customers.joblib')
    joblib.dump(sales, 'sales.joblib')

In [3]:
# Configure the project feature extractor and produce the train/test split.
# The meaning and units of `period` / `subperiod` are defined by
# FeatureExtractor and are not visible in this notebook.
fe = FeatureExtractor(
    sales=sales,
    customers=customers,
    target_month=3,
    perform_split=True,
    generation_type='continuous',
    filtering_set='sales',
    period=60,
    subperiod=15,
)
# With perform_split=True, transform() yields four objects that are unpacked
# as features/targets for the train and test splits.
X_train, X_test, y_train, y_test = fe.transform()

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /Users/grigoryturchenko/.cache/huggingface/token
Login successful
Successfully logged out.


In [4]:
# Every feature column whose name contains 'qty' is scaled with RobustScaler;
# one-hot encoding, winsorization and skip lists are left unset.
qty_cols = list(filter(lambda name: 'qty' in name, X_train.columns))
col_transform = CustomColumnTransformer(
    cols_for_scaling=qty_cols, scaling_algo=RobustScaler(),
    cols_for_ohe=None, cols_for_winsor=None, cols_to_skip=None,
)

In [5]:
# Baseline model: the column transformer followed by a seeded random forest
# that considers all features at each split (max_features=None).
rf_baseline = RandomForestClassifier(max_features=None, random_state=571, n_jobs=7)
pipe = Pipeline(steps=[('column_transformer', col_transform), ('rf', rf_baseline)])

In [6]:
# Fit the baseline pipeline on the training split, then predict both splits.
pipe.fit(X_train, y_train)
train_preds = pipe.predict(X_train)
test_preds = pipe.predict(X_test)

# Print one classification report per split, train first.
for title, y_true, y_pred in (
    ('Train data', y_train, train_preds),
    ('Test data', y_test, test_preds),
):
    print(title)
    print(classification_report(y_true, y_pred))

Train data
              precision    recall  f1-score   support

           0       0.97      0.99      0.98     25669
           1       1.00      0.98      0.99     29752

    accuracy                           0.98     55421
   macro avg       0.98      0.98      0.98     55421
weighted avg       0.98      0.98      0.98     55421

Test data
              precision    recall  f1-score   support

           0       0.74      0.75      0.74      8603
           1       0.78      0.77      0.77      9871

    accuracy                           0.76     18474
   macro avg       0.76      0.76      0.76     18474
weighted avg       0.76      0.76      0.76     18474



In [7]:
# Pre-transform both splits for the tuning cells below.
# The transformer is fitted on the training split ONLY and then applied to the
# test split. Calling fit_transform on the test data would re-fit the scaler on
# test statistics, so the two splits would be scaled differently from each
# other and from what the fitted pipeline does at predict time.
# NOTE: this cell overwrites X_train / X_test in place, so it must be run
# exactly once per kernel session (re-running would scale already-scaled data).
X_train = col_transform.fit_transform(X_train, y_train)
X_test = col_transform.transform(X_test)

In [14]:
# Hyperparameter search space for the random forest, one tuple per parameter:
# (name, kind, low, high[, extra sampling options]).
rf_search_space = [
    ('n_estimators', 'int', 10, 200),
    ('max_depth', 'int', 20, 40),
    ('max_features', 'float', 0.05, 1.0, {'step': 0.05}),
    ('max_samples', 'float', 0.05, 1.0, {'step': 0.05}),
    ('min_samples_leaf', 'float', 1e-4, 1e-2, {'log': True}),
    ('min_samples_split', 'float', 1e-4, 1e-2, {'log': True}),
]

# Tune a RandomForestClassifier, maximizing accuracy_score.
rf_op = OptunaTuner(
    RandomForestClassifier,
    accuracy_score,
    direction='maximize',
    random_state=571,
    n_jobs=7,
)
# NOTE(review): X_test / y_test are handed to the tuner, presumably as the
# split used to score each trial. If so, test metrics reported afterwards are
# optimistically biased and a separate validation split should be used.
# TODO confirm against OptunaTuner.fit. The leading 200 is presumably the
# number of trials.
rf_op.fit(200, X_train, y_train, X_test, y_test, *rf_search_space)

In [15]:
# Display the full parameter set of the model held by the tuner after the
# study (presumably the best-trial model; confirm in OptunaTuner).
rf_op.model.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': 21,
 'max_features': 0.1,
 'max_leaf_nodes': None,
 'max_samples': 0.4,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 0.000627318769493311,
 'min_samples_split': 0.00155668115409059,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 77,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [16]:
# Plot the objective value per trial for the tuner's Optuna study.
optuna.visualization.plot_optimization_history(rf_op.study)

In [None]:
# NOTE(review): unexecuted cell holding a bare dict literal. Its values differ
# from the get_params() output shown earlier (e.g. max_depth 36 vs 21), so it
# presumably comes from a different tuning run; verify before relying on it.
{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': 36,
 'max_features': 0.1,
 'max_leaf_nodes': None,
 'max_samples': 0.9000000000000001,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 0.003570132806941135,
 'min_samples_split': 0.004948383139887321,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 163,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [17]:
# Hyperparameters of the final random forest (presumably copied from a tuning
# run's get_params() output; verify against the study before reuse).
# random_state is pinned to the seed used for the baseline model and the tuner
# so the fitted forest, and therefore the logged metrics, are reproducible.
config = {
    'bootstrap': True,
    'ccp_alpha': 0.0,
    'class_weight': None,
    'criterion': 'gini',
    'max_depth': 36,
    'max_features': 0.1,
    'max_leaf_nodes': None,
    'max_samples': 0.9000000000000001,
    'min_impurity_decrease': 0.0,
    'min_samples_leaf': 0.003570132806941135,
    'min_samples_split': 0.004948383139887321,
    'min_weight_fraction_leaf': 0.0,
    'n_estimators': 163,
    'n_jobs': None,
    'oob_score': False,
    'random_state': 571,
    'verbose': 0,
    'warm_start': False
}
rf = RandomForestClassifier(**config)


def log_split_report(split, X, y, fit=False):
    """Log a classification report for one data split as a W&B artifact.

    Opens a W&B run with job type ``tuning_<split>``, optionally fits ``rf``
    on the given data inside that run, predicts on ``X``, and logs an
    artifact named ``report_<split>`` whose metadata holds the run name, the
    parsed classification report and the model config.

    Parameters
    ----------
    split : str
        Split label used in the job type and artifact name ('train' / 'test').
    X, y
        Features and true labels of the split.
    fit : bool, default False
        When True, fit ``rf`` on ``(X, y)`` before predicting.

    Returns
    -------
    The predictions of ``rf`` on ``X``.
    """
    with utils.init_wandb_run(
        name='continuous_features_optuna_subperiod_15',
        model=RandomForestClassifier,
        config=config,
        target_month=fe.target_month,
        group='parameters_tuning',
        job_type=f'tuning_{split}'
    ) as run:
        if fit:
            rf.fit(X, y)
        preds = rf.predict(X)

        rep = utils.parse_classification_report(
            classification_report(y, preds, output_dict=True)
        )

        metadata = {
            'experiment': {
                'name': run.name,
            },
            'performance_report': rep,
            'config': config
        }

        artifact = wandb.Artifact(
            name=f'report_{split}',
            type='performance_metric',
            metadata=metadata
        )
        run.log_artifact(artifact)
        # Kept from the original cell: explicitly close the run before the
        # context manager exits.
        run.finish()
    return preds


# Create w&b run for the training set (the model is fitted inside this run)
train_preds = log_split_report('train', X_train, y_train, fit=True)

# Create w&b run for the test set
test_preds = log_split_report('test', X_test, y_test)

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[34m[1mwandb[0m: Currently logged in as: [33mgaturchenko[0m ([33mkpmg-capstone[0m). Use [1m`wandb login --relogin`[0m to force relogin


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
