In [1]:
# Enable IPython's autoreload extension. Mode 2 reloads imported modules before
# each cell runs, so edits to the project package are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import logging

# Detach every handler already installed on the root logger so that the
# basicConfig() call below actually takes effect (basicConfig does nothing
# when the root logger already has handlers).
# Iterate over a *copy* of the list: removeHandler() mutates
# logging.root.handlers, and removing items from a list while looping over it
# skips every other element, leaving stale handlers behind.
for handler in list(logging.root.handlers):
    logging.root.removeHandler(handler)
logging.basicConfig(level=logging.INFO)

In [3]:
# Set project root
from pathlib import Path

# The notebook is expected to run one level below the repository root:
# resolve the parent directory and sanity-check it by looking for the
# "featurologists" entry among its direct children.
PROJECT_ROOT = Path("..").resolve()
PROJECT_ROOT_LS = [child.name for child in PROJECT_ROOT.iterdir()]
assert "featurologists" in PROJECT_ROOT_LS, (
    f"Not a project root? {PROJECT_ROOT}, pwd: {Path().resolve()}"
)

In [16]:
# Directories used by this notebook: where the trained model is saved and
# where derived data artefacts live.
MODELS_DIR = PROJECT_ROOT.joinpath('models/cancellation_prediction')
DATA_DIR = PROJECT_ROOT.joinpath('data/output/cancellation_prediction')

# Individual artefact files inside DATA_DIR.
PREPROCESSED_DATA_PATH = DATA_DIR.joinpath('offline_preprocessed.csv')
COUNTRY_ENCODER_PATH = DATA_DIR.joinpath('country_encoder.npy')

In [19]:
import pandas as pd
from featurologists.cancellation_prediction import (
    build_country_encoder,
    save_country_encoder,
    load_country_encoder,
    preprocess,
    train_test_split,
    train_xgboost,
    predict,
    calc_all_metrics,
    save_model,
)

import sklearn

In [20]:
# Read the cleaned offline dataset, then show its dimensions and column names.
# NOTE(review): the recorded output begins with the tail of a warning; if it is
# a pandas DtypeWarning (mixed-type column), consider passing an explicit
# dtype= to read_csv — confirm by re-running.
df = pd.read_csv(PROJECT_ROOT.joinpath('data/output/offline_cleaned.csv'))
(df.shape, df.columns)

  interactivity=interactivity, compiler=compiler, result=result)


((263815, 10),
 Index(['InvoiceNo', 'StockCode', 'Description', 'Quantity', 'InvoiceDate',
        'UnitPrice', 'CustomerID', 'Country', 'QuantityCanceled', 'TotalPrice'],
       dtype='object'))

In [21]:
# Number of distinct stock codes in the dataset.
len(df['StockCode'].unique())

3509

In [22]:
# Number of distinct customer IDs in the dataset.
len(df['CustomerID'].unique())

3616

In [23]:
# Share of rows with a positive cancelled quantity.
# `non_null` counts the rows where QuantityCanceled > 0 and InvoiceNo is not null.
total = len(df)
non_null = df.loc[df['QuantityCanceled'] > 0, 'InvoiceNo'].count()
print(f'QuantityCancelled: {non_null} out of {total} ({non_null/total*100:.2f}%)')

QuantityCancelled: 4980 out of 263815 (1.89%)


In [24]:
# Number of distinct countries in the dataset.
len(df['Country'].unique())

36

In [25]:
# Build the country encoder from every distinct value of the Country column
# and display the classes it ends up with.
unique_countries = list(df['Country'].unique())
country_encoder = build_country_encoder(unique_countries)
country_encoder.classes_

array(['Australia', 'Austria', 'Bahrain', 'Belgium', 'Brazil', 'Canada',
       'Channel Islands', 'Cyprus', 'Czech Republic', 'Denmark', 'EIRE',
       'European Community', 'Finland', 'France', 'Germany', 'Greece',
       'Iceland', 'Israel', 'Italy', 'Japan', 'Lebanon', 'Lithuania',
       'Malta', 'Netherlands', 'Norway', 'Poland', 'Portugal',
       'Saudi Arabia', 'Singapore', 'Spain', 'Sweden', 'Switzerland',
       'USA', 'United Arab Emirates', 'United Kingdom', 'Unspecified'],
      dtype='<U20')

In [38]:
# Number of classes known to the encoder (classes_ is a 1-D array in the
# recorded output, so its size equals its length).
country_encoder.classes_.size

36

In [26]:
# Persist the encoder so that it can be reloaded later from COUNTRY_ENCODER_PATH.
# Create the target directory first: on a fresh checkout it may not exist yet.
# NOTE(review): this is redundant (but harmless) if save_country_encoder
# already creates parent directories itself — confirm.
COUNTRY_ENCODER_PATH.parent.mkdir(parents=True, exist_ok=True)
save_country_encoder(country_encoder, COUNTRY_ENCODER_PATH)

In [13]:
# Reload the encoder from disk, rebinding `country_encoder`, and display its
# classes so they can be compared with those of the freshly built encoder.
# NOTE(review): the recorded execution count of this cell (13) is lower than
# its neighbours', so its output comes from an earlier run — re-run the
# notebook top to bottom before relying on it.
country_encoder = load_country_encoder(COUNTRY_ENCODER_PATH)
country_encoder.classes_

array(['Australia', 'Austria', 'Bahrain', 'Belgium', 'Brazil', 'Canada',
       'Channel Islands', 'Cyprus', 'Czech Republic', 'Denmark', 'EIRE',
       'European Community', 'Finland', 'France', 'Germany', 'Greece',
       'Iceland', 'Israel', 'Italy', 'Japan', 'Lebanon', 'Lithuania',
       'Malta', 'Netherlands', 'Norway', 'Poland', 'Portugal',
       'Saudi Arabia', 'Singapore', 'Spain', 'Sweden', 'Switzerland',
       'USA', 'United Arab Emirates', 'United Kingdom', 'Unspecified'],
      dtype='<U20')

In [27]:
# Preprocess the raw frame with the country encoder and display the result.
# In the recorded output Country appears as an integer code, InvoiceDate is
# split into year/month/day/hour/minute/second columns, and IsCancelled is
# the last column (presumably the prediction target — confirm).
data = preprocess(df, country_encoder)
data

Unnamed: 0,Quantity,UnitPrice,Country,InvoiceDate_year,InvoiceDate_month,InvoiceDate_day,InvoiceDate_hour,InvoiceDate_minute,InvoiceDate_second,IsCancelled
0,6,2.55,34,2010,12,1,8,26,0,0
1,6,3.39,34,2010,12,1,8,26,0,0
2,8,2.75,34,2010,12,1,8,26,0,0
3,6,3.39,34,2010,12,1,8,26,0,0
4,6,3.39,34,2010,12,1,8,26,0,0
...,...,...,...,...,...,...,...,...,...,...
263810,4,4.25,34,2011,9,30,15,52,0,0
263811,4,4.25,34,2011,9,30,15,52,0,0
263812,12,2.10,34,2011,9,30,15,52,0,0
263813,12,2.10,34,2011,9,30,15,52,0,0


In [28]:
# Show the columns of the preprocessed frame. The cell's last expression is
# displayed by the notebook itself, so no print() wrapper is needed.
data.columns

Index(['Quantity', 'UnitPrice', 'Country', 'InvoiceDate_year',
       'InvoiceDate_month', 'InvoiceDate_day', 'InvoiceDate_hour',
       'InvoiceDate_minute', 'InvoiceDate_second', 'IsCancelled'],
      dtype='object')


In [29]:
# Split the preprocessed data into train/test features and labels.
# This is the project's own train_test_split (imported from
# featurologists.cancellation_prediction), called with the whole frame.
# NOTE(review): no seed is passed here — confirm the helper splits
# deterministically, otherwise the metrics below are not reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(data)

In [30]:
# Train the model on the training split and display the returned estimator
# (an XGBClassifier in the recorded output).
m = train_xgboost(X_train, Y_train)
m

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, eval_metric='aucpr',
              gamma=0, gpu_id=-1, importance_type='gain',
              interaction_constraints='', learning_rate=0.15, max_delta_step=0,
              max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=200, n_jobs=12,
              num_parallel_tree=1, random_state=0, reg_alpha=0, reg_lambda=1,
              scale_pos_weight=1, subsample=1, tree_method='exact',
              use_label_encoder=False, validate_parameters=1, verbosity=None)

In [31]:
# Evaluate the model on the held-out split; `meta` is later stored alongside
# the model as its metadata.
# NOTE(review): in the recorded run accuracy is ~0.98 while ROC AUC is only
# ~0.53. About 1.9% of the test labels are positive (1992 of 105526), so
# accuracy alone is misleading here.
# NOTE(review): 'predicted_positive_to_groundtruth_negative_fraction' equals
# 121 / 1992 in the recorded output, i.e. predicted positives over groundtruth
# *positives* — the key name looks wrong; confirm in calc_all_metrics.
meta = calc_all_metrics(m, X_test, Y_test)
meta

{'score_accuracy': 0.9819475769004795,
 'score_roc_auc': 0.5260223190364177,
 'error_mse': 0.018052423099520497,
 'error_r2': 0.02527494605695002,
 'confidence_least': 0.5028083324432373,
 'confidence_most': 0.9999750852584839,
 'predicted_positive_count': 121,
 'predicted_negative_count': 105405,
 'predicted_positive_to_negative_fraction': 0.0011479531331530762,
 'groundtruth_positive_count': 1992,
 'groundtruth_negative_count': 103534,
 'groundtruth_positive_to_negative_fraction': 0.019240056406591072,
 'predicted_positive_to_groundtruth_negative_fraction': 0.0607429718875502}

In [32]:
# Try row-by-row inference

In [33]:
# Take the first test sample; selecting with a list of positions keeps the
# result a one-row DataFrame rather than a Series.
test_row = X_test.iloc[[0], :]
test_row

Unnamed: 0,Quantity,UnitPrice,Country,InvoiceDate_year,InvoiceDate_month,InvoiceDate_day,InvoiceDate_hour,InvoiceDate_minute,InvoiceDate_second
241913,6,2.55,34,2011,9,15,18,5,0


In [34]:
# Run inference on the single row. With return_proba=True the recorded output
# is a pair: the predicted labels and an array with one probability per class.
predict(m, test_row, return_proba=True)

(array([0]), array([[0.9910913 , 0.00890869]], dtype=float32))

In [35]:
# Take the first three test samples for a small batch prediction.
test_rows = X_test.head(3)
test_rows

Unnamed: 0,Quantity,UnitPrice,Country,InvoiceDate_year,InvoiceDate_month,InvoiceDate_day,InvoiceDate_hour,InvoiceDate_minute,InvoiceDate_second
241913,6,2.55,34,2011,9,15,18,5,0
28297,20,1.65,34,2011,1,6,15,29,0
110088,2,5.95,34,2011,4,19,16,52,0


In [36]:
# Run inference on the three-row batch; same output shape as the single-row
# call above the batch selection (labels plus one probability row per sample).
predict(m, test_rows, return_proba=True)

(array([0, 0, 0]),
 array([[0.9910913 , 0.00890869],
        [0.8684514 , 0.1315486 ],
        [0.98157156, 0.01842844]], dtype=float32))

In [37]:
import shutil

MODELS_DIR = PROJECT_ROOT / "models/cancellation_prediction"

# Start from a clean models directory so that stale artefacts from earlier
# runs do not linger. Done in pure Python instead of `! rm -r {MODELS_DIR}`:
# no shell interpolation (paths with spaces or special characters are safe),
# portable across platforms, and silent when the directory does not exist yet
# (e.g. on the first run).
if MODELS_DIR.exists():
    shutil.rmtree(MODELS_DIR)

# Store the trained model together with its evaluation metrics as metadata.
save_model(m, MODELS_DIR / "xgboost", metadata=meta)

# List what was written as a quick check.
[p.name for p in MODELS_DIR.iterdir()]

['xgboost']