In [1]:
# Enable IPython's autoreload extension. Mode 2 reloads imported modules
# before each cell executes, so edits to project code are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import random
# Seed Python's global `random` generator with a fixed value so repeated
# runs of the notebook draw the same pseudo-random sequence.
random.seed(a=42)

In [3]:
import logging

# Detach every handler already attached to the root logger so that
# basicConfig() below actually takes effect (it does nothing when the root
# logger already has handlers).
# Iterate over a *copy* of the list: removeHandler() mutates
# logging.root.handlers, and removing entries while iterating the live list
# skips every other handler, leaving some attached.
for handler in list(logging.root.handlers):
    logging.root.removeHandler(handler)
logging.basicConfig(level=logging.INFO)

In [4]:
import datetime

In [5]:
# Set project root
from pathlib import Path

# Resolve the parent of the current working directory and verify that it is
# the project checkout by looking for a "featurologists" entry inside it.
PROJECT_ROOT = Path("..").resolve()
PROJECT_ROOT_LS = list(map(lambda entry: entry.name, PROJECT_ROOT.iterdir()))
assert "featurologists" in PROJECT_ROOT_LS, (
    f"Not a project root? {PROJECT_ROOT}, pwd: {Path().resolve()}"
)

In [6]:
from customer_segmentation_toolkit.data_zoo import download_data_csv

# Fetch the dataset through the toolkit helper, asking it to treat
# InvoiceDate as a datetime column.
# NOTE(review): the first argument looks like a path inside the toolkit's
# repository (see the URL in the log output) rather than a local file — confirm.
data = download_data_csv('data/data.csv', datetime_columns=['InvoiceDate'])
# Quick sanity check: print the frame's shape and its column names.
print(f'data: {data.shape}\n{list(data.columns)}')

INFO:root:Downloading dataset 'https://raw.githubusercontent.com/artemlops/customer-segmentation-toolkit/master/data/data.csv'


data: (541909, 8)
['InvoiceNo', 'StockCode', 'Description', 'Quantity', 'InvoiceDate', 'UnitPrice', 'CustomerID', 'Country']


In [7]:
# Display per-column dtypes, e.g. to check that InvoiceDate was parsed as a datetime.
data.dtypes

InvoiceNo              object
StockCode              object
Description            object
Quantity                int64
InvoiceDate    datetime64[ns]
UnitPrice             float64
CustomerID            float64
Country                object
dtype: object

In [8]:
# Write the downloaded frame to <PROJECT_ROOT>/data/data.csv, without the index column.
data_path = PROJECT_ROOT.joinpath("data/data.csv")
data_path.parent.mkdir(exist_ok=True)  # create data/ on first run; no error if it already exists
data.to_csv(path_or_buf=data_path, index=False)
print(f'Saved to {data_path}')

Saved to /plain/github/opensource/Featurologists/data/data.csv


## data.csv -> (offline_raw.csv + online_raw.csv):

In [9]:
from customer_segmentation_toolkit.load_split import split_by_invoice_date
# Cut-off date handed to the splitter: 1 October 2011.
DATE_SPLIT = datetime.date(year=2011, month=10, day=1)

# The helper returns two frames, unpacked here as the "offline" and "online" parts.
# NOTE(review): presumably offline = invoices before DATE_SPLIT and online = the rest — confirm.
offline_raw, online_raw = split_by_invoice_date(data, DATE_SPLIT)

# Report shape and column names of both parts.
print(
    f'offline_raw: {offline_raw.shape}\n{list(offline_raw.columns)}',
    f'online_raw: {online_raw.shape}\n{list(online_raw.columns)}',
    sep='\n',
)

offline_raw: (370931, 8)
['InvoiceNo', 'StockCode', 'Description', 'Quantity', 'InvoiceDate', 'UnitPrice', 'CustomerID', 'Country']
online_raw: (170978, 8)
['InvoiceNo', 'StockCode', 'Description', 'Quantity', 'InvoiceDate', 'UnitPrice', 'CustomerID', 'Country']


In [10]:
# Both raw splits are written to the same directory, so create it once.
# parents=True lets this cell run even when data/ does not exist yet; a plain
# mkdir(exist_ok=True) raises FileNotFoundError in that case.
offline_raw_path = PROJECT_ROOT / "data/output/offline_raw.csv"
online_raw_path = PROJECT_ROOT / "data/output/online_raw.csv"
offline_raw_path.parent.mkdir(parents=True, exist_ok=True)

# Write each split without the index column and report where it went.
offline_raw.to_csv(offline_raw_path, index=False)
print(f'Saved to {offline_raw_path}')

online_raw.to_csv(online_raw_path, index=False)
print(f'Saved to {online_raw_path}')

Saved to /plain/github/opensource/Featurologists/data/output/offline_raw.csv
Saved to /plain/github/opensource/Featurologists/data/output/online_raw.csv


## offline_raw.csv -> offline_cleaned.csv:

In [11]:
from customer_segmentation_toolkit.clean_rows import clean_data_rows

# Run the toolkit's row-cleaning step on the offline split.
# NOTE(review): per the printed output the result has fewer rows and two extra
# columns (QuantityCanceled, TotalPrice); the helper is opaque from here —
# confirm exactly which rows it drops.
offline_cleaned = clean_data_rows(offline_raw)
# Report the cleaned frame's shape and column names.
print(f'offline_cleaned: {offline_cleaned.shape}\n{list(offline_cleaned.columns)}')

offline_cleaned: (263815, 10)
['InvoiceNo', 'StockCode', 'Description', 'Quantity', 'InvoiceDate', 'UnitPrice', 'CustomerID', 'Country', 'QuantityCanceled', 'TotalPrice']


In [12]:
# Destination for the cleaned offline data. parents=True makes this cell
# self-sufficient: it no longer relies on an earlier cell having created
# data/output (plain mkdir(exist_ok=True) raises FileNotFoundError otherwise).
offline_cleaned_path = PROJECT_ROOT / "data/output/offline_cleaned.csv"
offline_cleaned_path.parent.mkdir(parents=True, exist_ok=True)

# Write the frame without the index column and report where it went.
offline_cleaned.to_csv(offline_cleaned_path, index=False)
print(f'Saved to {offline_cleaned_path}')

Saved to /plain/github/opensource/Featurologists/data/output/offline_cleaned.csv


# cancellation_prediction

## offline_cleaned.csv -> offline_preprocessed.csv:

In [22]:
from featurologists.cancellation_prediction import build_country_encoder, preprocess, save_country_encoder

# Build the country encoder from the distinct Country values present in the
# cleaned offline data.
country_encoder = build_country_encoder(list(offline_cleaned['Country'].unique()))

# Preprocess the cleaned frame using that encoder.
# NOTE(review): judging by the displayed output, Country becomes an integer
# code and InvoiceDate is expanded into year/month/day/hour/minute/second
# columns plus an IsCancelled column — confirm against the helper.
offline_preprocessed = preprocess(offline_cleaned, country_encoder)
# Bare expression as last line: renders the frame as the cell output.
offline_preprocessed

Unnamed: 0,Quantity,UnitPrice,Country,InvoiceDate_year,InvoiceDate_month,InvoiceDate_day,InvoiceDate_hour,InvoiceDate_minute,InvoiceDate_second,IsCancelled
0,6,2.55,34,2010,12,1,8,26,0,0
1,6,3.39,34,2010,12,1,8,26,0,0
2,8,2.75,34,2010,12,1,8,26,0,0
3,6,3.39,34,2010,12,1,8,26,0,0
4,6,3.39,34,2010,12,1,8,26,0,0
...,...,...,...,...,...,...,...,...,...,...
370675,4,4.25,34,2011,9,30,15,52,0,0
370676,4,4.25,34,2011,9,30,15,52,0,0
370677,12,2.10,34,2011,9,30,15,52,0,0
370678,12,2.10,34,2011,9,30,15,52,0,0


In [24]:
# Output directory for the cancellation-prediction artifacts. parents=True
# lets this cell run even if data/output has not been created yet; a plain
# mkdir(exist_ok=True) raises FileNotFoundError in that case.
cancellation_prediction_root = PROJECT_ROOT / "data/output/cancellation_prediction"
cancellation_prediction_root.mkdir(parents=True, exist_ok=True)

offline_preprocessed_path = cancellation_prediction_root / "offline_preprocessed.csv"
country_encoder_path = cancellation_prediction_root / 'country_encoder.npy'

# Persist the preprocessed frame (without its index column), then the encoder
# next to it.
offline_preprocessed.to_csv(offline_preprocessed_path, index=False)
save_country_encoder(country_encoder, country_encoder_path)

print(f'Saved to {cancellation_prediction_root}')
# List the directory in pure Python rather than `!ls {path}`: interpolating an
# unquoted path into a shell command breaks on paths containing spaces and is
# not portable to platforms without `ls`.
print(*sorted(entry.name for entry in cancellation_prediction_root.iterdir()), sep='  ')

Saved to /plain/github/opensource/Featurologists/data/output/cancellation_prediction
country_encoder.npy  offline_preprocessed.csv
