In [13]:
# Enable IPython's autoreload extension so edits to imported project modules are
# picked up without restarting the kernel. Mode 2 reloads all modules before
# each cell execution.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [14]:
import logging


def reset_root_logging(level=logging.INFO):
    """Drop every handler attached to the root logger and reconfigure it.

    ``logging.basicConfig`` does nothing when the root logger already has
    handlers, so any handlers that are already installed must be removed first
    for the requested level to take effect.

    Args:
        level: Logging level to apply to the root logger (default: INFO).
    """
    # Iterate over a snapshot: removing from the live ``handlers`` list while
    # looping over it skips every other handler and leaves some attached.
    for handler in list(logging.root.handlers):
        logging.root.removeHandler(handler)
    logging.basicConfig(level=level)


reset_root_logging()

In [15]:
# Treat the parent of the working directory as the project root and fail fast
# if it does not contain the `featurologists` entry.
from pathlib import Path

PROJECT_ROOT = Path("..").resolve()
PROJECT_ROOT_LS = [entry.name for entry in PROJECT_ROOT.iterdir()]
assert "featurologists" in PROJECT_ROOT_LS, (
    f"Not a project root? {PROJECT_ROOT}, pwd: {Path().resolve()}"
)

In [16]:
# Directories for model artifacts and for this workflow's data outputs.
MODELS_DIR = PROJECT_ROOT.joinpath("models/cancellation_prediction")
DATA_DIR = PROJECT_ROOT.joinpath("data/output/cancellation_prediction")

# Input files read by the cells below.
PREPROCESSED_DATA_PATH = DATA_DIR.joinpath("offline_preprocessed.csv")
COUNTRY_ENCODER_PATH = DATA_DIR.joinpath("country_encoder.npy")

In [17]:
import pandas as pd
from featurologists.cancellation_prediction import (
    build_country_encoder,
    save_country_encoder,
    load_country_encoder,
    preprocess_replace_invoice_date,
    train_test_split,
    train_xgboost,
    predict,
    calc_all_metrics,
    save_model,
)

import sklearn

In [18]:
# Load the preprocessed CSV into a DataFrame and display it.
data = pd.read_csv(PREPROCESSED_DATA_PATH)
data

Unnamed: 0,InvoiceDate,Quantity,UnitPrice,Country,IsCancelled
0,2010-12-01 08:26:00,6,2.55,34,0
1,2010-12-01 08:26:00,6,3.39,34,0
2,2010-12-01 08:26:00,8,2.75,34,0
3,2010-12-01 08:26:00,6,3.39,34,0
4,2010-12-01 08:26:00,6,3.39,34,0
...,...,...,...,...,...
263810,2011-09-30 15:52:00,4,4.25,34,0
263811,2011-09-30 15:52:00,4,4.25,34,0
263812,2011-09-30 15:52:00,12,2.10,34,0
263813,2011-09-30 15:52:00,12,2.10,34,0


In [19]:
# Load the saved country encoder and show the country names it knows.
# NOTE(review): presumably the position of a name in `classes_` is the integer
# stored in the `Country` column (e.g. 34 -> 'United Kingdom') — verify.
country_encoder = load_country_encoder(COUNTRY_ENCODER_PATH)
country_encoder.classes_

array(['Australia', 'Austria', 'Bahrain', 'Belgium', 'Brazil', 'Canada',
       'Channel Islands', 'Cyprus', 'Czech Republic', 'Denmark', 'EIRE',
       'European Community', 'Finland', 'France', 'Germany', 'Greece',
       'Iceland', 'Israel', 'Italy', 'Japan', 'Lebanon', 'Lithuania',
       'Malta', 'Netherlands', 'Norway', 'Poland', 'Portugal',
       'Saudi Arabia', 'Singapore', 'Spain', 'Sweden', 'Switzerland',
       'USA', 'United Arab Emirates', 'United Kingdom', 'Unspecified'],
      dtype='<U20')

In [24]:
# Replace the InvoiceDate column with separate InvoiceDate_year/month/day/hour/
# minute/second columns. The helper consumes InvoiceDate and the result is
# assigned back to `data`, so the call is skipped when the column is already
# gone; this keeps the cell safe to re-run.
if "InvoiceDate" in data.columns:
    data = preprocess_replace_invoice_date(data)
data

Unnamed: 0,Quantity,UnitPrice,Country,IsCancelled,InvoiceDate_year,InvoiceDate_month,InvoiceDate_day,InvoiceDate_hour,InvoiceDate_minute,InvoiceDate_second
0,6,2.55,34,0,2010,12,1,8,26,0
1,6,3.39,34,0,2010,12,1,8,26,0
2,8,2.75,34,0,2010,12,1,8,26,0
3,6,3.39,34,0,2010,12,1,8,26,0
4,6,3.39,34,0,2010,12,1,8,26,0
...,...,...,...,...,...,...,...,...,...,...
263810,4,4.25,34,0,2011,9,30,15,52,0
263811,4,4.25,34,0,2011,9,30,15,52,0
263812,12,2.10,34,0,2011,9,30,15,52,0
263813,12,2.10,34,0,2011,9,30,15,52,0


In [25]:
# List the columns available after the InvoiceDate expansion.
print(data.columns)

Index(['Quantity', 'UnitPrice', 'Country', 'IsCancelled', 'InvoiceDate_year',
       'InvoiceDate_month', 'InvoiceDate_day', 'InvoiceDate_hour',
       'InvoiceDate_minute', 'InvoiceDate_second'],
      dtype='object')


In [26]:
# Split into train/test features and labels with the project's own helper
# (imported from featurologists.cancellation_prediction, not scikit-learn).
# NOTE(review): no seed or split ratio is passed here — confirm the helper is
# deterministic so the metrics below are reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(data)

In [27]:
# Fit the XGBoost classifier on the training split. No hyperparameters are passed
# here, so the helper's own settings apply. Display the fitted estimator.
m = train_xgboost(X_train, Y_train)
m

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, eval_metric='aucpr',
              gamma=0, gpu_id=-1, importance_type='gain',
              interaction_constraints='', learning_rate=0.15, max_delta_step=0,
              max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=200, n_jobs=12,
              num_parallel_tree=1, random_state=0, reg_alpha=0, reg_lambda=1,
              scale_pos_weight=1, subsample=1, tree_method='exact',
              use_label_encoder=False, validate_parameters=1, verbosity=None)

In [28]:
# Evaluate the model on the held-out split. The resulting dict is kept as `meta`
# because it is passed as metadata when the model is saved further down.
# NOTE(review): the labels look heavily imbalanced, so accuracy alone can be
# misleading — read it together with score_roc_auc and the predicted/groundtruth counts.
meta = calc_all_metrics(m, X_test, Y_test)
meta

{'score_accuracy': 0.9819665295756496,
 'score_roc_auc': 0.5277552004917307,
 'error_mse': 0.018033470424350398,
 'error_r2': 0.02629827944691643,
 'confidence_least': 0.5018929243087769,
 'confidence_most': 0.9999920129776001,
 'predicted_positive_count': 133,
 'predicted_negative_count': 105393,
 'predicted_positive_to_negative_fraction': 0.0012619433928249505,
 'groundtruth_positive_count': 1992,
 'groundtruth_negative_count': 103534,
 'groundtruth_positive_to_negative_fraction': 0.019240056406591072,
 'predicted_positive_to_groundtruth_negative_fraction': 0.06676706827309237}

In [29]:
# Try row-by-row inference

In [30]:
# Take the first test example. The list indexer `[[0]]` keeps it as a one-row
# DataFrame rather than a Series.
test_row = X_test.iloc[[0]]
test_row

Unnamed: 0,Quantity,UnitPrice,Country,InvoiceDate_year,InvoiceDate_month,InvoiceDate_day,InvoiceDate_hour,InvoiceDate_minute,InvoiceDate_second
222970,2,0.79,34,2011,8,28,16,9,0


In [31]:
# Predict for the single row; with return_proba=True the helper returns the
# predicted labels together with the per-class probabilities.
predict(m, test_row, return_proba=True)

(array([0]), array([[0.99352753, 0.00647248]], dtype=float32))

In [32]:
# Repeat the check on a small batch: the first three rows of the test split.
test_rows = X_test.iloc[:3]
test_rows

Unnamed: 0,Quantity,UnitPrice,Country,InvoiceDate_year,InvoiceDate_month,InvoiceDate_day,InvoiceDate_hour,InvoiceDate_minute,InvoiceDate_second
222970,2,0.79,34,2011,8,28,16,9,0
184534,12,0.83,28,2011,7,18,11,55,0
2985,10,0.85,34,2010,12,2,14,9,0


In [33]:
# Predict for the three-row batch, again returning labels and class probabilities.
predict(m, test_rows, return_proba=True)

(array([0, 0, 0]),
 array([[0.99352753, 0.00647248],
        [0.97446704, 0.02553296],
        [0.9906428 , 0.00935723]], dtype=float32))

In [34]:
import shutil

MODELS_DIR = PROJECT_ROOT / "models/cancellation_prediction"

# Start from a clean output directory. Pure-Python removal replaces the shell
# `rm -r`: it works on any OS, is not affected by spaces in the path, and is
# skipped quietly when the directory does not exist yet (e.g. on a first run).
if MODELS_DIR.exists():
    shutil.rmtree(MODELS_DIR)

# Persist the trained model under "xgboost", with the evaluation metrics as metadata.
save_model(m, MODELS_DIR / "xgboost", metadata=meta)

# Show what was written; sorted so the listing is stable across runs.
sorted(p.name for p in MODELS_DIR.iterdir())

['xgboost']