Setup and imports

In [1]:
import sys, os
project_root = os.path.abspath(os.path.join(os.getcwd(), ".."))
if project_root not in sys.path:
    sys.path.insert(0, project_root)

# Auto-reload
%load_ext autoreload
%autoreload 2

# Imports
import pandas as pd
from sklearn.model_selection import train_test_split
from src.classifier import train_model, evaluate_model


Load data and create label

In [2]:
df = pd.read_csv("../data/online_retail_cleaned.csv")
df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'])

# Recent purchase label
cutoff = df['InvoiceDate'].max() - pd.Timedelta(days=30)
df['RecentPurchase'] = (df['InvoiceDate'] > cutoff).astype(int)

# Aggregate to customer level
X = df.groupby('CustomerID').agg({
    'Quantity': 'sum',
    'UnitPrice': 'mean',
    'TotalPrice': 'sum',
    'InvoiceNo': 'nunique',
    'RecentPurchase': 'max'
}).reset_index()

X = X.rename(columns={'InvoiceNo': 'Frequency'})
y = X.pop('RecentPurchase')


Train-test split

In [3]:
X_train, X_test, y_train, y_test = train_test_split(
    X.drop('CustomerID', axis=1),
    y,
    test_size=0.2,
    stratify=y,
    random_state=42
)

Train model and evaluate

In [4]:
model = train_model(X_train, y_train, model_type='random_forest')

cm, cr = evaluate_model(model, X_test, y_test, report_path="../results/prediction_report.txt")

print("Confusion Matrix:\n", cm)
print("\nClassification Report:\n", cr)


Confusion Matrix:
 [[449  89]
 [165 165]]

Classification Report:
               precision    recall  f1-score   support

           0       0.73      0.83      0.78       538
           1       0.65      0.50      0.57       330

    accuracy                           0.71       868
   macro avg       0.69      0.67      0.67       868
weighted avg       0.70      0.71      0.70       868

