In [27]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [28]:
import logging

# Drop every handler already attached to the root logger so that
# `basicConfig` below actually takes effect (it is a no-op when the root
# logger still has handlers).
# Iterate over a copy: `removeHandler` mutates `logging.root.handlers`, and
# removing items from a list while iterating over it skips every other entry.
for handler in list(logging.root.handlers):
    logging.root.removeHandler(handler)
logging.basicConfig(level=logging.INFO)

In [29]:
from customer_segmentation_toolkit.data_zoo import download_data_csv

from featurologists.features.customer_segmentation import transform
from featurologists.models.customer_segmentation import (
    calc_score_roc_auc,
    train_test_split,
    save_model,
    train_lightgbm,
    train_xgboost,
)

In [32]:
def prepare_nltk():
    """Download the NLTK packages 'punkt' and 'averaged_perceptron_tagger'.

    NLTK is imported lazily, inside the function. The packages are requested
    one after another via `nltk.download`.
    """
    import nltk

    for package_name in ('punkt', 'averaged_perceptron_tagger'):
        nltk.download(package_name)

def _load_data():
    """Download the "no live data" CSV and return it after `transform`.

    The CSV is fetched with `download_data_csv`, with ``InvoiceDate`` passed
    as a datetime column, and the result is handed to `transform`.

    Returns:
        The object produced by `transform`. Expected columns:
        CustomerID, count, min, max, mean, sum, categ_0, categ_1, categ_2,
        categ_3, categ_4, LastPurchase, FirstPurchase, cluster.
    """
    csv_path = "data/output/01_data_split_offline_online/no_live_data.csv"
    offline_frame = download_data_csv(csv_path, datetime_columns=["InvoiceDate"])
    return transform(offline_frame)

In [33]:
# Fetch the NLTK packages first (presumably required by `transform`, which
# runs inside `_load_data` -- TODO confirm).
prepare_nltk()

# Download and transform the dataset, then preview its first rows.
df = _load_data()
df.head()

[nltk_data] Downloading package punkt to /home/ay/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/ay/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
INFO:root:Downloading dataset 'https://raw.githubusercontent.com/artemlops/customer-segmentation-toolkit/master/data/output/01_data_split_offline_online/no_live_data.csv'


Unnamed: 0,CustomerID,count,min,max,mean,sum,categ_0,categ_1,categ_2,categ_3,categ_4,LastPurchase,FirstPurchase,cluster
0,12347.0,5,382.52,711.79,558.172,2790.86,10.442659,29.836681,8.676179,36.519926,14.524555,59,297,2
1,12348.0,4,227.44,892.8,449.31,1797.24,38.016069,41.953217,0.0,20.030714,0.0,5,288,9
2,12350.0,1,334.4,334.4,334.4,334.4,11.692584,48.444976,0.0,11.961722,27.900718,240,240,1
3,12352.0,6,144.35,840.3,345.663333,2073.98,0.491808,12.89212,13.584991,69.660749,3.370331,2,226,4
4,12353.0,1,89.0,89.0,89.0,89.0,0.0,13.033708,67.078652,0.0,19.88764,134,134,0


In [34]:
from pathlib import Path

# Output directory for the transformed dataset. The path is relative, so it is
# resolved against the notebook's current working directory.
DATA = Path("../data/customer_segmentation")
# parents=True: also create "../data" when it does not exist yet; without it
# `mkdir` raises FileNotFoundError on a fresh checkout.
DATA.mkdir(parents=True, exist_ok=True)

# Persist the transformed frame (the index is written as the first column).
df.to_csv(DATA / "no_live_data__transformed.csv")

In [35]:
# Split the transformed frame with the project's own `train_test_split` helper
# (imported from featurologists.models.customer_segmentation). No options are
# passed here, so any split ratio or seeding is decided inside that helper.
# Presumably X_* are features and Y_* are labels -- TODO confirm.
X_train, X_test, Y_train, Y_test = train_test_split(df)

In [36]:
# Train the XGBoost model on the training split; the bare expression on the
# last line displays the fitted estimator and its hyperparameters.
xgboost_model = train_xgboost(X_train, Y_train)
xgboost_model

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, eval_metric='aucpr',
              gamma=0, gpu_id=-1, importance_type='gain',
              interaction_constraints='', learning_rate=0.16, max_delta_step=0,
              max_depth=50, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=12,
              num_parallel_tree=1, objective='multi:softprob', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=None, subsample=1,
              tree_method='exact', use_label_encoder=False,
              validate_parameters=1, verbosity=None)

In [37]:
# Score the XGBoost model on the test split with the project's
# `calc_score_roc_auc` helper and report it with six decimals.
xgboost_score = calc_score_roc_auc(xgboost_model, X_test, Y_test)
print(f"XGBoost score_roc_auc: {xgboost_score:.6f}")

XGBoost score_roc_auc: 0.929973


In [38]:
# Train the LightGBM model on the same training split; the bare expression on
# the last line displays the returned model object.
lightgbm_model = train_lightgbm(X_train, Y_train)
lightgbm_model

<lightgbm.basic.Booster at 0x7f56b2eec690>

In [39]:
# Score the LightGBM model on the test split with the same helper used for
# XGBoost, so the two printed scores are directly comparable.
lightgbm_score = calc_score_roc_auc(lightgbm_model, X_test, Y_test)
print(f"LightGBM score_roc_auc: {lightgbm_score:.6f}")

LightGBM score_roc_auc: 0.933648


In [40]:
import shutil
from pathlib import Path

# Show the working directory: the relative ".." below is resolved against it.
print(Path.cwd())
PROJECT_ROOT = Path("..").resolve()
PROJECT_ROOT_LS = [p.name for p in PROJECT_ROOT.iterdir()]
# Sanity check: refuse to continue (and to delete anything) unless the parent
# directory contains a "featurologists" entry.
assert "featurologists" in PROJECT_ROOT_LS, f"Not a project root? {PROJECT_ROOT}"

MODELS_DIR = PROJECT_ROOT / "models" / "customer_segmentation"
# Start from a clean models directory. Done in pure Python instead of
# `! rm -r {MODELS_DIR}`: no shell interpolation of the path (which would break
# on spaces), no error message on the first run when the directory does not
# exist yet, and it works on platforms without `rm`.
if MODELS_DIR.is_dir():
    shutil.rmtree(MODELS_DIR)

# Store each model together with its test-split ROC AUC score as metadata.
save_model(
    xgboost_model, MODELS_DIR / "xgboost", metadata={"score_roc_auc": xgboost_score}
)
save_model(
    lightgbm_model, MODELS_DIR / "lightgbm", metadata={"score_roc_auc": lightgbm_score}
)

# List what was written under the models directory.
[p.name for p in MODELS_DIR.iterdir()]

/plain/github/opensource/Featurologists/notebooks


['lightgbm', 'xgboost']