In [None]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import RandomizedSearchCV
import joblib

In [3]:
df = pd.read_csv("../data/online_retail_cleaned.csv")

In [4]:

df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'])
cutoff = df['InvoiceDate'].max() - pd.Timedelta(days=30)
df['RecentPurchase'] = (df['InvoiceDate'] > cutoff).astype(int)

In [5]:
# Create customer-level data
X = df.groupby('CustomerID').agg({
    'Quantity': 'sum',
    'UnitPrice': 'mean',
    'TotalPrice': 'sum',
    'InvoiceNo': 'nunique',
    'RecentPurchase': 'max'
}).reset_index()

X = X.rename(columns={'InvoiceNo': 'Frequency'})
y = X.pop('RecentPurchase')

X.head()

Unnamed: 0,CustomerID,Quantity,UnitPrice,TotalPrice,Frequency
0,12346.0,74215,1.04,77183.6,1
1,12347.0,2458,2.644011,4310.0,7
2,12348.0,2341,5.764839,1797.24,4
3,12349.0,631,8.289041,1757.55,1
4,12350.0,197,3.841176,334.4,1


In [6]:
X_train, X_test, y_train, y_test = train_test_split(
    X.drop('CustomerID', axis=1),
    y,
    test_size=0.2,
    stratify=y,
    random_state=42
)

In [15]:
model = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

In [16]:
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

with open("../results/prediction_report.txt", "w") as f:
    f.write(classification_report(y_test, y_pred))

print("✅ Report saved to 'results/prediction_report.txt'")

Confusion Matrix:
 [[451  87]
 [165 165]]

Classification Report:
               precision    recall  f1-score   support

           0       0.73      0.84      0.78       538
           1       0.65      0.50      0.57       330

    accuracy                           0.71       868
   macro avg       0.69      0.67      0.67       868
weighted avg       0.70      0.71      0.70       868

✅ Report saved to 'results/prediction_report.txt'


In [None]:

param_dist = {
    'n_estimators': [200, 500, 1000],
    'max_depth': [None, 10, 20, 30],
    'min_samples_leaf': [1, 2, 5, 10],
    'max_features': ['auto', 'sqrt', 0.5],
    'class_weight': ['balanced', {0:1, 1:2}, {0:1,1:3}]
}

clf = RandomizedSearchCV(
    RandomForestClassifier(random_state=42),
    param_distributions=param_dist,
    n_iter=50,
    scoring='f1',
    cv=5,
    verbose=1,
    n_jobs=-1
)
clf.fit(X_train, y_train)
print("Best params:", clf.best_params_)


Fitting 5 folds for each of 50 candidates, totalling 250 fits


105 fits failed out of a total of 250.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
37 fits failed with the following error:
Traceback (most recent call last):
  File "d:\Dokumenty\PythonProjects\CustomerSegmentationAndPrediction\venv\Lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "d:\Dokumenty\PythonProjects\CustomerSegmentationAndPrediction\venv\Lib\site-packages\sklearn\base.py", line 1382, in wrapper
    estimator._validate_params()
  File "d:\Dokumenty\PythonProjects\CustomerSegmentationAndPrediction\venv\Lib\site-packages\sklearn\base.py", line 436, in _validate_params
    validate_parameter_constraints(
  File "d:\Dokumenty\PythonPr

Best params: {'n_estimators': 500, 'min_samples_leaf': 10, 'max_features': 'sqrt', 'max_depth': 10, 'class_weight': {0: 1, 1: 3}}


In [18]:
best_params = {
    'n_estimators': 500,
    'max_depth': 10,
    'min_samples_leaf': 10,
    'max_features': 'sqrt',
    'class_weight': {0:1, 1:3},
    'random_state': 42
}

opt_rf = RandomForestClassifier(**best_params)
opt_rf.fit(X_train, y_train)

y_pred = opt_rf.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[304 234]
 [ 63 267]]
              precision    recall  f1-score   support

           0       0.83      0.57      0.67       538
           1       0.53      0.81      0.64       330

    accuracy                           0.66       868
   macro avg       0.68      0.69      0.66       868
weighted avg       0.72      0.66      0.66       868



In [None]:
y_pred = model.predict(X_test)
df_test = X_test.copy()

# attach predictions to your test‐set
df_test['purchase_pred'] = y_pred
df_test.to_csv('purchase_predictions.csv', index=False)

# serialize the trained model
joblib.dump(model, 'purchase_model.pkl')

['purchase_model.pkl']