In [13]:
# Enable IPython's autoreload extension so edits to imported project modules
# are picked up without restarting the kernel.
# NOTE(review): re-running this cell in a live kernel prints the
# "already loaded" notice seen in the output; it is harmless.
%load_ext autoreload
# Mode 2: reload all modules before executing each cell.
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [14]:
import logging

# Detach every handler already attached to the root logger so that the
# basicConfig call below actually takes effect (basicConfig does nothing
# when the root logger already has handlers).
# Iterate over a snapshot: removeHandler mutates logging.root.handlers, and
# removing from the live list while iterating over it skips every other handler.
for handler in list(logging.root.handlers):
    logging.root.removeHandler(handler)
logging.basicConfig(level=logging.INFO)

In [16]:
# Resolve the repository root, taken to be the parent of the current working
# directory, and fail fast if it does not look like the project checkout.
from pathlib import Path

PROJECT_ROOT = Path("..").resolve()

# Names of the entries directly under the candidate root.
PROJECT_ROOT_LS = [child.name for child in PROJECT_ROOT.iterdir()]

# Sanity check: the `featurologists` package directory must sit in the root.
assert (
    "featurologists" in PROJECT_ROOT_LS
), f"Not a project root? {PROJECT_ROOT}, pwd: {Path().resolve()}"

In [15]:
import pandas as pd
from customer_segmentation_toolkit.data_zoo import download_data_csv

from featurologists.data_transforms import build_client_clusters, clean_client_clusters
from featurologists.models.customer_segmentation import (
    calc_score_roc_auc,
    calc_score_accuracy,
    train_test_split,
    save_model,
    train_lightgbm,
    train_xgboost,
)

In [17]:
# Read the offline customer clusters table from the project's data directory.
offline_clusters_csv = PROJECT_ROOT/'data/output/offline_clusters.csv'
df = pd.read_csv(offline_clusters_csv)
# Trailing expression: let the notebook render the loaded frame.
df

Unnamed: 0,CustomerID,count,min,max,mean,sum,categ_0,categ_1,categ_2,categ_3,categ_4,LastPurchase,FirstPurchase,cluster
0,12346.0,1,77183.60,77183.60,77183.600000,77183.60,0.000000,0.000000,0.000000,0.000000,100.000000,255,255,2
1,12347.0,5,382.52,711.79,558.172000,2790.86,43.755688,12.621916,10.442659,3.790946,29.388791,59,297,6
2,12348.0,4,227.44,892.80,449.310000,1797.24,20.030714,0.000000,35.692506,2.323563,41.953217,5,288,0
3,12350.0,1,334.40,334.40,334.400000,334.40,11.961722,27.900718,5.592105,6.100478,48.444976,240,240,10
4,12352.0,7,120.33,840.30,313.472857,2194.31,78.004019,4.798775,0.464839,2.983170,13.749197,2,226,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3611,18280.0,1,180.60,180.60,180.600000,180.60,47.840532,41.140642,0.000000,11.018826,0.000000,207,207,3
3612,18281.0,1,80.82,80.82,80.820000,80.82,41.945063,18.930958,18.708241,0.000000,20.415739,110,110,6
3613,18282.0,1,100.21,100.21,100.210000,100.21,25.446562,40.564814,20.516914,13.471709,0.000000,56,56,3
3614,18283.0,10,2.50,192.80,108.683000,1086.83,9.789940,29.352337,15.616978,10.617116,34.623630,25,267,0


In [18]:
# Run the project's cluster clean-up on the loaded frame; per the cell output
# it logs the size of every cluster at INFO level.
# NOTE(review): `df` is rebound to the cleaned result, so re-running this cell
# feeds already-cleaned data back in -- confirm clean_client_clusters is
# idempotent, or re-run the loading cell first.
df = clean_client_clusters(df)
# Trailing expression: display the cleaned frame.
df

INFO:root:Cluster sizes:     cluster  cluster_size
0         0           577
1         1           663
2         2             1
3         3           324
4         4             7
5         5           181
6         6          1336
7         7            93
8         8             9
9         9           125
10       10           300


Unnamed: 0,CustomerID,count,min,max,mean,sum,categ_0,categ_1,categ_2,categ_3,categ_4,LastPurchase,FirstPurchase,cluster
0,12346.0,1,77183.60,77183.60,77183.600000,77183.60,0.000000,0.000000,0.000000,0.000000,100.000000,255,255,2
1,12347.0,5,382.52,711.79,558.172000,2790.86,43.755688,12.621916,10.442659,3.790946,29.388791,59,297,6
2,12348.0,4,227.44,892.80,449.310000,1797.24,20.030714,0.000000,35.692506,2.323563,41.953217,5,288,0
3,12350.0,1,334.40,334.40,334.400000,334.40,11.961722,27.900718,5.592105,6.100478,48.444976,240,240,10
4,12352.0,7,120.33,840.30,313.472857,2194.31,78.004019,4.798775,0.464839,2.983170,13.749197,2,226,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3612,18281.0,1,80.82,80.82,80.820000,80.82,41.945063,18.930958,18.708241,0.000000,20.415739,110,110,6
3613,18282.0,1,100.21,100.21,100.210000,100.21,25.446562,40.564814,20.516914,13.471709,0.000000,56,56,3
3614,18283.0,10,2.50,192.80,108.683000,1086.83,9.789940,29.352337,15.616978,10.617116,34.623630,25,267,0
3615,18287.0,1,765.28,765.28,765.280000,765.28,6.402885,17.601401,5.315701,0.000000,70.680013,131,131,10


In [19]:
# Split the cleaned frame into train/test features (X_*) and labels (Y_*)
# using the project's own train_test_split helper.
X_train, X_test, Y_train, Y_test = train_test_split(df)
# Show the shape of each of the four parts as a quick sanity check.
tuple(part.shape for part in (X_train, X_test, Y_train, Y_test))

((2893, 6), (724, 6), (2893,), (724,))

In [20]:
# Fit the XGBoost model on the training split. Hyper-parameters are chosen
# inside train_xgboost; the repr displayed below shows the resulting settings.
xgboost_model = train_xgboost(X_train, Y_train)
# Trailing expression: display the fitted estimator.
xgboost_model

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, eval_metric='aucpr',
              gamma=0, gpu_id=-1, importance_type='gain',
              interaction_constraints='', learning_rate=0.16, max_delta_step=0,
              max_depth=50, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=12,
              num_parallel_tree=1, objective='multi:softprob', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=None, subsample=1,
              tree_method='exact', use_label_encoder=False,
              validate_parameters=1, verbosity=None)

In [21]:
# Accuracy of the XGBoost model on X_test / Y_test. The value is kept in a
# variable because it is attached as metadata when the model is saved.
xgboost_score_accuracy = calc_score_accuracy(xgboost_model, X_test, Y_test)
# Report the score with six decimal places.
print(f"XGBoost score_accuracy: {xgboost_score_accuracy:.6f}")

XGBoost score_accuracy: 0.943370


In [22]:
# Fit the LightGBM model on the same training split; per the cell output the
# helper returns a lightgbm Booster object.
lightgbm_model = train_lightgbm(X_train, Y_train)
# Trailing expression: display the fitted model.
lightgbm_model



<lightgbm.basic.Booster at 0x7f1c44cc8050>

In [23]:
# Accuracy of the LightGBM model on X_test / Y_test. The unrounded value is
# kept in a variable because it is attached as metadata when the model is saved.
lightgbm_score_accuracy = calc_score_accuracy(lightgbm_model, X_test, Y_test)
# Print with six decimal places, the same format used for the XGBoost score,
# so the two numbers can be compared at a glance.
print(f"LightGBM score_accuracy: {lightgbm_score_accuracy:.6f}")

LightGBM score_accuracy: 0.9392265193370166


In [24]:
import shutil

# Directory that receives one sub-directory per trained model.
MODELS_DIR = PROJECT_ROOT / "models" / "customer_segmentation"

# Start from a clean slate so stale artifacts from earlier runs cannot linger.
# Pure-Python removal instead of `! rm -r {MODELS_DIR}`: it is portable, is not
# affected by spaces or shell metacharacters in the path, and stays silent on a
# fresh checkout where the directory does not exist yet. Genuine failures
# (e.g. permission errors) still raise instead of being printed and ignored.
if MODELS_DIR.exists():
    shutil.rmtree(MODELS_DIR)

# Persist each model together with its test accuracy as metadata.
save_model(
    xgboost_model, MODELS_DIR / "xgboost", metadata={"score_accuracy": xgboost_score_accuracy}
)
save_model(
    lightgbm_model, MODELS_DIR / "lightgbm", metadata={"score_accuracy": lightgbm_score_accuracy}
)

# List what was written; sorted because iterdir() yields entries in arbitrary order.
sorted(p.name for p in MODELS_DIR.iterdir())

['lightgbm', 'xgboost']