In [1]:
# Load IPython's autoreload extension so edits to imported project modules are
# picked up without restarting the kernel.
%load_ext autoreload
# Mode 2: reload all imported modules before each cell execution.
%autoreload 2

In [105]:
import random

# Single, named seed for Python's built-in RNG so it can be found and changed in
# one place instead of being a magic number inside the call.
RANDOM_SEED = 42
# NOTE(review): this seeds only the stdlib `random` module. If downstream steps
# draw randomness from another library, they need their own seeding — confirm.
random.seed(RANDOM_SEED)

In [2]:
import logging

# Detach every handler already attached to the root logger so the
# basicConfig() call below takes effect (basicConfig does nothing when the
# root logger already has handlers).
# Iterate over a copy: removeHandler() mutates `logging.root.handlers`, and
# removing items from the live list while looping over it skips elements,
# leaving some handlers attached.
for handler in list(logging.root.handlers):
    logging.root.removeHandler(handler)
logging.basicConfig(level=logging.INFO)

In [3]:
import datetime

In [4]:
# Locate the project root: the parent of the current working directory
from pathlib import Path

PROJECT_ROOT = Path("..").resolve()
# Names of the entries that sit directly under the candidate root
PROJECT_ROOT_LS = [entry.name for entry in PROJECT_ROOT.iterdir()]
# Fail fast when no `featurologists` entry is found directly under the root
assert (
    "featurologists" in PROJECT_ROOT_LS
), f"Not a project root? {PROJECT_ROOT}, pwd: {Path().resolve()}"

In [22]:
from customer_segmentation_toolkit.data_zoo import download_data_csv

# Fetch the raw dataset through the toolkit helper (the recorded log output shows
# it downloading from a remote URL, so this cell needs network access).
# `datetime_columns` names the 'InvoiceDate' column — presumably so the helper
# parses it as datetimes; confirm against download_data_csv.
data = download_data_csv('data/data.csv', datetime_columns=['InvoiceDate'])
# Print the frame's shape and column names as a quick sanity check.
print(f'data: {data.shape}\n{list(data.columns)}')

INFO:root:Downloading dataset 'https://raw.githubusercontent.com/artemlops/customer-segmentation-toolkit/master/data/data.csv'


data: (541909, 8)
['InvoiceNo', 'StockCode', 'Description', 'Quantity', 'InvoiceDate', 'UnitPrice', 'CustomerID', 'Country']


In [23]:
# Persist the downloaded dataset under the project's data directory.
data_path = PROJECT_ROOT / "data/data.csv"
# parents=True keeps directory creation robust if an intermediate directory is
# missing; exist_ok=True makes the cell safe to re-run.
data_path.parent.mkdir(parents=True, exist_ok=True)
# index=False: do not write the DataFrame index as an extra CSV column.
data.to_csv(data_path, index=False)
print(f'Saved to {data_path}')

Saved to /plain/github/opensource/Featurologists/data/data.csv


## data.csv -> (offline_raw.csv + online_raw.csv):

In [24]:
from customer_segmentation_toolkit.load_split import split_by_invoice_date

# Cut-off date handed to split_by_invoice_date
DATE_SPLIT = datetime.date(year=2011, month=10, day=1)

# The helper's result is unpacked as a pair: (offline part, online part)
offline_raw, online_raw = split_by_invoice_date(data, DATE_SPLIT)

# Sanity check: shape and column names of each part
print(f'offline_raw: {offline_raw.shape}\n{list(offline_raw.columns)}')
print(f'online_raw: {online_raw.shape}\n{list(online_raw.columns)}')

offline_raw: (370931, 8)
['InvoiceNo', 'StockCode', 'Description', 'Quantity', 'InvoiceDate', 'UnitPrice', 'CustomerID', 'Country']
online_raw: (170978, 8)
['InvoiceNo', 'StockCode', 'Description', 'Quantity', 'InvoiceDate', 'UnitPrice', 'CustomerID', 'Country']


In [25]:
# Write both raw splits to <project root>/data/output.
# parents=True: `data/output` is nested, so also create `data/` if it does not
# exist yet instead of failing with FileNotFoundError; exist_ok=True makes the
# cell safe to re-run.
offline_raw_path = PROJECT_ROOT / "data/output/offline_raw.csv"
offline_raw_path.parent.mkdir(parents=True, exist_ok=True)
# index=False: do not write the DataFrame index as an extra CSV column.
offline_raw.to_csv(offline_raw_path, index=False)
print(f'Saved to {offline_raw_path}')

online_raw_path = PROJECT_ROOT / "data/output/online_raw.csv"
online_raw_path.parent.mkdir(parents=True, exist_ok=True)
online_raw.to_csv(online_raw_path, index=False)
print(f'Saved to {online_raw_path}')

Saved to /plain/github/opensource/Featurologists/data/output/offline_raw.csv
Saved to /plain/github/opensource/Featurologists/data/output/online_raw.csv


## offline_raw.csv -> offline_cleaned.csv:

In [115]:
from customer_segmentation_toolkit.clean_rows import clean_data_rows

# Run the toolkit's row-cleaning step on the offline split, binding the result to
# a new name so `offline_raw` is not rebound by this cell.
# In the recorded run the result has fewer rows than the input and two extra
# columns ('QuantityCanceled', 'TotalPrice'); the cleaning rules themselves live
# in clean_data_rows and are not visible here.
offline_cleaned = clean_data_rows(offline_raw)
# Print shape and column names as a quick sanity check.
print(f'offline_cleaned: {offline_cleaned.shape}\n{list(offline_cleaned.columns)}')

offline_cleaned: (263815, 10)
['InvoiceNo', 'StockCode', 'Description', 'Quantity', 'InvoiceDate', 'UnitPrice', 'CustomerID', 'Country', 'QuantityCanceled', 'TotalPrice']


In [116]:
# Write the cleaned offline data to <project root>/data/output.
offline_cleaned_path = PROJECT_ROOT / "data/output/offline_cleaned.csv"
# parents=True: `data/output` is nested, so create missing intermediate
# directories too, so the cell does not depend on an earlier cell having
# created them; exist_ok=True makes the cell safe to re-run.
offline_cleaned_path.parent.mkdir(parents=True, exist_ok=True)

# index=False: do not write the DataFrame index as an extra CSV column.
offline_cleaned.to_csv(offline_cleaned_path, index=False)
print(f'Saved to {offline_cleaned_path}')

Saved to /plain/github/opensource/Featurologists/data/output/offline_cleaned.csv


## offline_cleaned.csv -> offline_clusters.csv:

In [148]:
from featurologists.data_transforms import build_client_clusters

# Build the per-customer cluster table from the cleaned offline data.
# In the recorded run the result has 14 columns, including 'CustomerID' and a
# 'cluster' column; how clusters are computed is inside build_client_clusters
# and is not visible here.
# NOTE(review): if the helper uses randomness, results may vary between runs
# unless the relevant RNG is seeded — confirm.
offline_clusters = build_client_clusters(offline_cleaned)
# Print shape and column names as a quick sanity check.
print(f'offline_clusters: {offline_clusters.shape}\n{list(offline_clusters.columns)}')

offline_clusters: (3616, 14)
['CustomerID', 'count', 'min', 'max', 'mean', 'sum', 'categ_0', 'categ_1', 'categ_2', 'categ_3', 'categ_4', 'LastPurchase', 'FirstPurchase', 'cluster']


In [149]:
# Write the cluster table to <project root>/data/output.
offline_clusters_path = PROJECT_ROOT / "data/output/offline_clusters.csv"
# parents=True: `data/output` is nested, so create missing intermediate
# directories too, so the cell does not depend on an earlier cell having
# created them; exist_ok=True makes the cell safe to re-run.
offline_clusters_path.parent.mkdir(parents=True, exist_ok=True)

# index=False: do not write the DataFrame index as an extra CSV column.
offline_clusters.to_csv(offline_clusters_path, index=False)
print(f'Saved to {offline_clusters_path}')

Saved to /plain/github/opensource/Featurologists/data/output/offline_clusters.csv
