In [None]:
import pandas as pd
import joblib
import json
import os
from functools import partial

from services.app_api.features.extractor import FeatureExtractor
from services.app_api.features.final_processing import CustomColumnTransformer
from services.app_api.configs import utils, settings
from tuning.optuna_tuning import OptunaTuner
import optuna
import wandb

from sklearn.preprocessing import RobustScaler
from sklearn.metrics import classification_report, f1_score, confusion_matrix, ConfusionMatrixDisplay, make_scorer
# Base classifiers
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier, Booster, DMatrix
from sklearn.model_selection import cross_val_score, GridSearchCV
# Meta classifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import StackingClassifier, VotingClassifier

In [None]:
# Load the customer and sales tables from the local joblib cache; when the cache is
# missing, read them from the Excel workbook and write the cache for the next run.
# NOTE: joblib files are pickles - only load cache files that this notebook produced.
try:
    customers, sales = joblib.load('customers.joblib'), joblib.load('sales.joblib')
except FileNotFoundError:
    # Only a missing cache triggers the rebuild; a bare `except` would also swallow
    # KeyboardInterrupt and hide genuine loading errors.
    # Requesting both sheets in one call parses the workbook once instead of twice.
    workbook = pd.read_excel('ucy_eko_data.xlsx', sheet_name=['smile_customers', 'smile_sales'])
    customers, sales = workbook['smile_customers'], workbook['smile_sales']
    joblib.dump(customers, 'customers.joblib')
    joblib.dump(sales, 'sales.joblib')

In [None]:
# Configure the feature extractor once, then derive the train/test inputs and targets
# from the raw sales and customer tables.
fe = FeatureExtractor(
    target_month=3,
    n_purchases=2,
    perform_split=True,
    generation_type='continuous',
    filtering_set='customers',
    period=60,
    subperiod=15,
)
X_train, X_test, y_train, y_test = fe.transform(sales=sales, customers=customers)

In [None]:
# Relative frequency of each target value in the training split (normalize=True).
y_train.value_counts(normalize=True)

In [None]:
# f1_score with average='weighted' pre-bound; handed to OptunaTuner below as its second
# argument (presumably the metric being optimised - confirm against OptunaTuner).
f1_score_average_weighted = partial(f1_score, average='weighted')

In [None]:
def params_to_txt(optuna_object):
    """Write the tuned model's parameters to ``<ModelClassName>.txt`` as indented JSON.

    Args:
        optuna_object: Object exposing a ``model`` attribute whose value implements
            ``get_params()`` (in this notebook: an ``OptunaTuner`` after ``fit``).

    The file is created in the current working directory and overwritten if present.
    """
    model = optuna_object.model
    # Take the class name directly instead of slicing the repr up to '(':
    # str.find returns -1 when no '(' exists, which silently chops the last character.
    model_name = type(model).__name__
    best_params = model.get_params()
    with open(f'{model_name}.txt', 'w') as f:
        json.dump(best_params, f, indent=4)

In [None]:
def load_params(model):
    """Read the JSON parameters stored in ``<ModelClassName>.txt`` for ``model``.

    Args:
        model: An estimator class or an estimator instance; only its class name is
            used to locate the file in the current working directory.

    Returns:
        The decoded JSON content of the parameter file.

    Raises:
        FileNotFoundError: If no parameter file exists for the model.
    """
    # Resolve the class without instantiating it: constructing the estimator only to
    # read its name fails for classes whose constructor has required arguments.
    model_cls = model if isinstance(model, type) else type(model)
    with open(f'{model_cls.__name__}.txt', 'r') as f:
        return json.load(f)

## SVC base model

In [None]:
# svc_op = OptunaTuner(
#     SVC,
#     accuracy_score,
#     direction='maximize',
#     random_state=571,
#     verbose=False,
#     tol=1
# )
# svc_op.fit(
#     50, X_train_trans, y_train, X_test_trans, y_test,
#     ('C', 'float', 1e-3, 1.0),
#     ('kernel', 'categorical', ['linear', 'poly', 'rbf', 'sigmoid']),
#     ('degree', 'int', 1, 15)
# )

In [None]:
# optuna.visualization.plot_optimization_history(svc_op.study)

In [None]:
# svc_op.model.get_params()

In [None]:
# Untuned SVC baseline. probability=True makes predict_proba available, which the
# soft-voting ensemble further down requires.
svc_best = SVC(probability=True, tol=1, random_state=571, verbose=False)
svc_best.fit(X_train, y_train)
svc_train_preds, svc_test_preds = svc_best.predict(X_train), svc_best.predict(X_test)

In [None]:
# SVC classification reports for the train and test splits.
print(f'Train:\n{classification_report(y_train, svc_train_preds)} \n')
print(f'Test:\n{classification_report(y_test, svc_test_preds)}')

## KNN base model

In [None]:
# Tune KNeighborsClassifier over n_neighbors in [1, 50], maximising weighted F1.
# 100 is the first positional argument of fit (presumably the number of trials - confirm).
# NOTE(review): the test split is handed to the tuner; if it is used to score trials, the
# test metrics reported below are optimistically biased - confirm and consider a
# separate validation split.
knn_op = OptunaTuner(
    KNeighborsClassifier,
    f1_score_average_weighted,
    direction='maximize',
    n_jobs=7
)
knn_op.fit(
    100, X_train, y_train, X_test, y_test,
    ('n_neighbors', 'int', 1, 50)
)

In [None]:
# Plot the optimisation history of the KNN study.
optuna.visualization.plot_optimization_history(knn_op.study)

In [None]:
# Loading previously saved parameters is currently disabled (kept for reference):
# try:
#     knn_best_params = load_params(KNeighborsClassifier)
#     knn_best = KNeighborsClassifier(**knn_best_params)
# except FileNotFoundError:
# The estimator is always rebuilt from the tuner's model parameters and refitted.
knn_best = KNeighborsClassifier(**knn_op.model.get_params())
knn_best.fit(X_train, y_train)
knn_train_preds, knn_test_preds = knn_best.predict(X_train), knn_best.predict(X_test)

In [None]:
# KNN classification reports for the train and test splits.
print(f'Train:\n{classification_report(y_train, knn_train_preds)} \n')
print(f'Test:\n{classification_report(y_test, knn_test_preds)}')

In [None]:
# Persist the tuned KNN model's parameters to a JSON text file named after the model.
params_to_txt(knn_op)

## Random Forest base model

In [None]:
# Tune RandomForestClassifier, maximising weighted F1, over the search space listed below.
# Each tuple is (parameter name, type, low, high[, extra options]).
# NOTE(review): the test split is handed to the tuner; if it is used to score trials, the
# test metrics reported below are optimistically biased - confirm and consider a
# separate validation split.
rf_op = OptunaTuner(RandomForestClassifier, f1_score_average_weighted, direction='maximize', random_state=571, n_jobs=7)
rf_op.fit(
    100, X_train, y_train, X_test, y_test,
    ('n_estimators', 'int', 10, 200),
    ('max_depth', 'int', 30, 60),
    ('max_features', 'float', 0.05, 1.0, {'step': 0.05}),
    ('max_samples', 'float', 0.05, 1.0, {'step': 0.05}),
    ('min_samples_leaf', 'float', 1e-4, 1e-2, {'log': True}),
    ('min_samples_split', 'float', 1e-4, 1e-2, {'log': True})
)

In [None]:
# Loading previously saved parameters is currently disabled (kept for reference):
# try:
#     rf_best_params = load_params(RandomForestClassifier)
#     rf_best = RandomForestClassifier(**rf_best_params)
# except FileNotFoundError:
# The estimator is always rebuilt from the tuner's model parameters and refitted.
rf_best = RandomForestClassifier(**rf_op.model.get_params())
rf_best.fit(X_train, y_train)
rf_train_preds, rf_test_preds = rf_best.predict(X_train), rf_best.predict(X_test)

In [None]:
# Random forest classification reports for the train and test splits.
print(f'Train:\n{classification_report(y_train, rf_train_preds)} \n')
print(f'Test:\n{classification_report(y_test, rf_test_preds)}')

In [None]:
# Persist the tuned random forest's parameters to a JSON text file named after the model.
params_to_txt(rf_op)

## XGBoost base model

In [None]:
# Tune XGBClassifier, maximising weighted F1; the keyword arguments below are fixed for
# every trial, the tuples passed to fit describe the searched hyperparameters.
# NOTE(review): the test split is handed to the tuner; if it is used to score trials, the
# test metrics reported below are optimistically biased - confirm and consider a
# separate validation split.
xgb_op = OptunaTuner(
    XGBClassifier, f1_score_average_weighted, 'maximize', # class-specific arguments
    seed=571, predictor='cpu_predictor', verbosity=0, nthread=7, # model-specific technical parameters
    objective='binary:logistic', eval_metric='error', n_estimators=1000 # model-specific fixed hyperparameters
)
xgb_op.fit(
    100, X_train, y_train, X_test, y_test,
    ('reg_alpha', 'loguniform', 1e-3, 10.0),
    ('reg_lambda', 'loguniform', 1e-3, 10.0),
    ('colsample_bytree', 'loguniform', 0.2, 1.0),
    ('subsample', 'loguniform', 0.4, 1.0),
    ('learning_rate', 'loguniform', 1e-4, 0.5),
    ('max_depth', 'categorical', [5, 10, 20, 30])
)

In [None]:
# Prefer the parameters stored on disk; fall back to the tuner's model when no parameter
# file exists yet.
try:
    xgb_best_params = load_params(XGBClassifier)
    xgb_best_params['objective'] = 'binary:logistic'
except FileNotFoundError:
    # Bind xgb_best_params on this path too: later cells read it, and leaving it
    # undefined made them fail with NameError whenever the file was missing.
    xgb_best_params = xgb_op.model.get_params()
xgb_best = XGBClassifier(**xgb_best_params)
xgb_best.fit(X_train, y_train)
xgb_train_preds = xgb_best.predict(X_train)
xgb_test_preds = xgb_best.predict(X_test)

In [None]:
# import yaml
# with open('service/app_api/api_config.yaml', 'rb') as f:
#     config = yaml.safe_load(f)
# classification_model_artifact = utils.get_artifact(config['wandb_classification_model_project'], f"{config['wandb_classification_model_id']}_model.json")
# classification_model_artifact.download(config['model_path'])
# classification_model = XGBClassifier()
# classification_model.load_model(config['model_path']+f"/{config['wandb_classification_model_id']}_model.json")
# classification_model.fit(X_train, y_train)
# xgb_train_preds = classification_model.predict(X_train)
# xgb_test_preds = classification_model.predict(X_test)
# print('Train:')
# print(classification_report(y_train, xgb_train_preds), '\n')
# print('Test:')
# print(classification_report(y_test, xgb_test_preds))

In [None]:
# XGBoost classification reports for the train and test splits.
print(f'Train:\n{classification_report(y_train, xgb_train_preds)} \n')
print(f'Test:\n{classification_report(y_test, xgb_test_preds)}')

In [None]:
# Preview: the first feature name with everything from its last '_' onwards removed.
xgb_best.feature_names_in_[0][:xgb_best.feature_names_in_[0].rindex('_')]

In [None]:
import numpy as np  # kept so that `np` stays available to any later cell

# Per-column XGBoost importances, keyed by the feature name without its trailing suffix.
importance = pd.DataFrame({'feature': xgb_best.feature_names_in_, 'importance': xgb_best.feature_importances_})
# Strip the part after the last '_'; unlike str.rindex this does not raise for a name
# that contains no underscore (such a name is kept unchanged).
importance['feature'] = importance['feature'].str.rsplit('_', n=1).str[0]
# Exclude the 'previous' features with a boolean mask instead of a NaN sentinel followed
# by an in-place dropna; dropna() still removes rows with a missing value.
importance = importance[~importance['feature'].str.contains('previous', regex=False)].dropna()

In [None]:
# Summed importance per feature name, largest first.
importance.groupby('feature').sum().sort_values('importance', ascending=False)

In [None]:
# Persist the tuned XGBoost model's parameters to a JSON text file named after the model.
params_to_txt(xgb_op)

In [None]:
# Confusion matrix of the XGBoost test predictions, labelled with the model's classes.
cm = confusion_matrix(y_true=y_test, y_pred=xgb_test_preds)
cm_disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=xgb_best.classes_)

In [None]:
# Render the confusion matrix display built in the previous cell.
cm_disp.plot()

In [None]:
# Choose the configuration to log and train with: the tuner's model when tuning ran in
# this session, otherwise the parameters loaded from disk.
try:
    config = xgb_op.model.get_params()
    features = xgb_op.model.get_booster().feature_names
except NameError:
    config = xgb_best_params
    xgb_best_params['objective'] = 'binary:logistic'
    features = xgb_best.get_booster().feature_names

# Create w&b run for the training set
with utils.init_wandb_run(
    name=f'continuous_features_subperiod_{fe.subperiod}_customers_filtering_{fe.n_purchases}_visits',
    model=XGBClassifier,
    config=config,
    target_month=fe.target_month,
    group='parameters_tuning',
    job_type='tuning_train'
) as run:
    # Train from the same configuration that is logged to the run. Previously the model
    # was built from xgb_best_params, which may differ from `config` or be undefined.
    xgb_best = XGBClassifier(**config)
    xgb_best.fit(X_train, y_train, callbacks=[wandb.xgboost.WandbCallback(log_model=True)])

    # Refresh the predictions so both reports describe the model fitted (and uploaded)
    # in this run rather than an earlier fit.
    xgb_train_preds = xgb_best.predict(X_train)
    xgb_test_preds = xgb_best.predict(X_test)

    rep = utils.parse_classification_report(
        classification_report(y_train, xgb_train_preds, output_dict=True)
    )

    metadata = {
        'experiment': {
            'name': run.name,
        },
        'performance_report': rep,
        'features': features
    }

    artifact = wandb.Artifact(
        name='report_train',
        type='performance_metric',
        metadata=metadata
    )
    run.log_artifact(artifact)
    run.finish()

# Create w&b run for the test set
with utils.init_wandb_run(
    name=f'continuous_features_subperiod_{fe.subperiod}_customers_filtering_{fe.n_purchases}_visits',
    model=XGBClassifier,
    config=config,
    target_month=fe.target_month,
    group='parameters_tuning',
    job_type='tuning_test'
) as run:
    rep = utils.parse_classification_report(
        classification_report(y_test, xgb_test_preds, output_dict=True)
    )

    metadata = {
        'experiment': {
            'name': run.name
        },
        'performance_report': rep,
        'features': features
    }

    artifact = wandb.Artifact(
        name='report_test',
        type='performance_metric',
        metadata=metadata
    )
    run.log_artifact(artifact)
    run.finish()

In [None]:
# Fetch one specific W&B run through the public API and decode its stored JSON config.
# NOTE(review): the run id 'gnazdw35' is hard-coded - consider a named constant.
wandb_api = wandb.Api()
model_run = wandb_api.run(f"{settings.SETTINGS['WANDB_ENTITY']}/X-G-B-Classifier/gnazdw35")
model_config = json.loads(model_run.json_config)

In [None]:
# Download a model artifact from the 'X-G-B-Classifier' project into a local directory.
# NOTE(review): the artifact id ('00g8kx9u') differs from the run id and model file id
# ('gnazdw35') used in the neighbouring cells, and the target path starts with
# 'service/' whereas the imports use the 'services' package - confirm both.
art = utils.get_artifact('X-G-B-Classifier', '00g8kx9u_model.json')
art.download('service/app_api/configs/')

In [None]:
# Load a serialised XGBoost model from JSON and predict on both splits.
# NOTE: this overwrites xgb_train_preds / xgb_test_preds, so the cells below (reports,
# error overlap) use the loaded model's predictions, not those of xgb_best.
# NOTE(review): the file id 'gnazdw35' does not match the artifact id '00g8kx9u'
# downloaded in the previous cell - confirm which model is intended.
xgb_json_loaded = XGBClassifier()
xgb_json_loaded.load_model('service/app_api/configs/gnazdw35_model.json')
xgb_train_preds = xgb_json_loaded.predict(X_train)
xgb_test_preds = xgb_json_loaded.predict(X_test)

In [None]:
# Classification reports for the predictions of the model loaded from JSON.
print(f'Train:\n{classification_report(y_train, xgb_train_preds)} \n')
print(f'Test:\n{classification_report(y_test, xgb_test_preds)}')

In [None]:
# Compute SHAP values for the first training row with a tree explainer built on the
# model loaded from JSON.
# NOTE(review): this import sits mid-notebook; it belongs in the top import cell.
import shap
explainer = shap.TreeExplainer(xgb_json_loaded)
shapley_values = explainer.shap_values(X_train.head(1), y_train.head(1))

In [None]:
# Show the SHAP values as plain Python lists.
shapley_values.tolist()

## Identify overlaps in model errors

In [None]:
# One row per test sample: every base model's prediction next to the true label.
base_models_df = pd.DataFrame(dict(
    svc_preds=svc_test_preds,
    knn_preds=knn_test_preds,
    rf_preds=rf_test_preds,
    xgb_preds=xgb_test_preds,
    y_true=y_test,
))

In [None]:
# Flag every test prediction as 'correct' or 'error', one '<model>_accuracy' column per
# base model. A single vectorised comparison per model replaces four copy-pasted
# row-wise apply(axis=1) calls; the resulting columns and their order are unchanged.
base_models_df = base_models_df.assign(**{
    f'{model_key}_accuracy': base_models_df[f'{model_key}_preds']
    .ne(base_models_df['y_true'])
    .map({True: 'error', False: 'correct'})
    for model_key in ('svc', 'knn', 'rf', 'xgb')
})

In [None]:
# Index labels of the test samples that each base model misclassified.
svc_errors = base_models_df.loc[base_models_df['svc_accuracy'].eq('error')].index
knn_errors = base_models_df.loc[base_models_df['knn_accuracy'].eq('error')].index
rf_errors = base_models_df.loc[base_models_df['rf_accuracy'].eq('error')].index
xgb_errors = base_models_df.loc[base_models_df['xgb_accuracy'].eq('error')].index

In [None]:
# Overlap of all models: share of test samples misclassified by every base model.
len(set(svc_errors) & set(knn_errors) & set(rf_errors) & set(xgb_errors)) / len(base_models_df)

In [None]:
# Share of test samples misclassified by both the random forest and XGBoost.
len(set(rf_errors) & set(xgb_errors)) / len(base_models_df)

## Model Stacking

In [None]:
# Stack the four base models; a logistic regression is the final estimator and cv=5 is
# used for the cross-validated predictions it is trained on.
stacking_model = StackingClassifier(
    estimators=[
        ('svc', svc_best),
        ('knn', knn_best),
        ('rf', rf_best),
        ('xgb', xgb_best)
    ],
    final_estimator=LogisticRegression(),
    cv=5
)

In [None]:
# Fit the stacking ensemble and predict on both splits.
stacking_model.fit(X_train, y_train)
stacking_train_preds, stacking_test_preds = stacking_model.predict(X_train), stacking_model.predict(X_test)

In [None]:
# Stacking ensemble classification reports for the train and test splits.
print(f'Train:\n{classification_report(y_train, stacking_train_preds)} \n')
print(f'Test:\n{classification_report(y_test, stacking_test_preds)}')

## Voting Stacking

In [None]:
# Soft-voting ensemble of the four base models; soft voting combines predicted class
# probabilities, which is why the SVC above was created with probability=True.
voting_stacking_model = VotingClassifier(
    estimators=[
        ('svc', svc_best),
        ('knn', knn_best),
        ('rf', rf_best),
        ('xgb', xgb_best)
    ],
    voting='soft',
    n_jobs=7
)

In [None]:
# Fit the soft-voting ensemble and predict on both splits.
voting_stacking_model.fit(X_train, y_train)
voting_stacking_train_preds, voting_stacking_test_preds = (
    voting_stacking_model.predict(X_train),
    voting_stacking_model.predict(X_test),
)

In [None]:
# Soft-voting ensemble classification reports for the train and test splits.
print(f'Train:\n{classification_report(y_train, voting_stacking_train_preds)} \n')
print(f'Test:\n{classification_report(y_test, voting_stacking_test_preds)}')

In [None]:
# Candidate voting weights, in the order of the ensemble's estimators (svc, knn, rf, xgb).
params = dict(
    weights=[
        (1, 1, 3, 3),
        (1, 1, 2, 3),
        (1, 1, 3, 4),
        (1, 1, 2, 5),
        (1, 2, 2, 3),
        (1, 2, 3, 4),
        (1, 2, 3, 3),
    ]
)
# 5-fold grid search over the weights, scored by weighted F1.
grid = GridSearchCV(
    voting_stacking_model,
    params,
    scoring=make_scorer(f1_score, average='weighted'),
    cv=5,
    n_jobs=7,
)

In [None]:
# Run the grid search over the candidate voting weights on the training split.
grid.fit(X_train, y_train)

In [None]:
# Predict on both splits with the fitted grid search object.
tuned_voting_stacking_train_preds, tuned_voting_stacking_test_preds = (
    grid.predict(X_train),
    grid.predict(X_test),
)

In [None]:
# Classification reports for the weight-tuned voting ensemble on both splits.
print(f'Train:\n{classification_report(y_train, tuned_voting_stacking_train_preds)} \n')
print(f'Test:\n{classification_report(y_test, tuned_voting_stacking_test_preds)}')