In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import xgboost as xgb
from sklearn.metrics import mean_squared_error
color_pal = sns.color_palette()
plt.style.use('fivethirtyeight')

import random

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

In [None]:
file_path = "sample_data.csv"
# Read every column as a string; numeric and date columns are converted explicitly below.
# Malformed lines are skipped rather than aborting the load.
df = pd.read_csv(file_path, header=None, delimiter=';', on_bad_lines='skip', dtype=str)

# The file has no header row, so column names are assigned positionally.
df.columns = [
    "OrderID", "Date", "CustomerID", "Country", "City",
    "?", "Quantity", "ProductCode", "Description",
    "Category", "SubCategory", "Format"
]
# Convert quantity to numeric; values that cannot be parsed become NaN
df["Quantity"] = pd.to_numeric(df["Quantity"], errors='coerce')

# Drop rows with missing essential fields
df = df.dropna(subset=["Date", "Description", "Quantity"])

# Group and aggregate: one row per (Date, ProductCode) with the summed quantity;
# all descriptive columns keep the first value seen in the group.
# NOTE(review): grouping happens on the raw Date string, so two spellings of the
# same timestamp would stay in separate groups — confirm the export format is uniform.
df = df.groupby(["Date", "ProductCode"]).agg({ 
    "OrderID": "first",  
    "CustomerID": "first",
    "Country": "first",
    "City": "first",
    "?": "first",
    "Quantity": "sum",
    "Description": "first",
    "Category": "first",
    "SubCategory": "first",
    "Format": "first"
}).reset_index()

# Cap extreme quantities at the 95th percentile (computed over all products).
df["Quantity"] = df["Quantity"].clip(upper=df["Quantity"].quantile(0.95))

df["Date"] = pd.to_datetime(df["Date"], errors='coerce')
# errors='coerce' turns unparseable dates into NaT. The earlier dropna ran on the
# raw strings, so those rows must be removed here or NaT ends up in the index.
df = df.dropna(subset=["Date"])

df = df.set_index("Date")

# Keep only rows with a real category ("GEEN" is presumably a "none" placeholder — TODO confirm).
df = df[(df["Category"] != "GEEN") & (df["Category"].notna()) & (df["Category"] != "")]

# Exclude leaflet items (case-insensitive match on the description).
df = df[~df["Description"].str.contains("leaflet", case=False, na=False)]

df = df.sort_index()

product_codes = df["ProductCode"].unique()

# Placeholder list of the product codes to model; replace with real codes.
random_codes = ["sampleOfProductCodes"]

In [None]:
def create_features(df):
    """Return a copy of ``df`` with calendar columns derived from its index.

    The index must expose the datetime accessors used below (i.e. be a
    ``DatetimeIndex``). The input frame is left untouched.

    Added (or overwritten) columns: ``hour``, ``dayofweek``, ``quarter``,
    ``month``, ``year``, ``dayofyear``.
    """
    stamps = df.index
    # assign() builds a new frame, so the caller's frame is never modified.
    return df.assign(
        hour=stamps.hour,
        dayofweek=stamps.dayofweek,
        quarter=stamps.quarter,
        month=stamps.month,
        year=stamps.year,
        dayofyear=stamps.dayofyear,
    )
# Attach the calendar columns to the full dataset (create_features returns a new frame).
df = create_features(df)

In [None]:
# Model input columns; the list order is the column order handed to the model.
FEATURES = [
    # calendar attributes derived from the datetime index
    'hour',
    'dayofweek',
    'quarter',
    'month',
    'year',
    'dayofyear',
    # per-product lagged quantities
    'lag_1',
    'lag_7',
    'lag_30',
    # per-product rolling statistics
    'rolling_mean_7',
    'rolling_std_7',
    'rolling_mean_30',
    'rolling_diff_7',
    'pct_change_7',
]

# Column the model is trained to predict.
TARGET = 'Quantity'

In [None]:
def add_lags(df, lags=(1, 7, 30)):
    """Return a copy of ``df`` with per-product lagged ``Quantity`` columns.

    For every ``lag`` in ``lags`` a column ``lag_<lag>`` is added that holds
    the ``Quantity`` found ``lag`` rows earlier within the same
    ``ProductCode``. The shift is positional (rows, not calendar days), so the
    frame should already be sorted chronologically.

    Parameters
    ----------
    df : DataFrame with ``ProductCode`` and ``Quantity`` columns.
    lags : iterable of int, row offsets to generate (default ``(1, 7, 30)``).

    Returns
    -------
    A new DataFrame; the input frame is not modified, matching
    ``create_features``.
    """
    # Work on a copy so the caller's frame is not mutated as a side effect.
    out = df.copy()
    quantity_by_product = out.groupby("ProductCode")["Quantity"]
    for lag in lags:
        out[f'lag_{lag}'] = quantity_by_product.shift(lag)
    return out

def add_rolling_stats(df, shift=1):
    """Return a copy of ``df`` with per-product rolling statistics of ``Quantity``.

    Added columns (all computed within each ``ProductCode``, positionally, so
    the frame should already be sorted chronologically):

    * ``rolling_mean_7`` / ``rolling_std_7`` – mean / std over 7 rows
    * ``rolling_mean_30`` – mean over 30 rows
    * ``rolling_diff_7`` – row-to-row change of ``rolling_mean_7``
    * ``pct_change_7`` – relative change versus 7 rows earlier

    Parameters
    ----------
    df : DataFrame with ``ProductCode`` and ``Quantity`` columns.
    shift : int, default 1
        Number of rows ``Quantity`` is shifted before the windows are
        computed. With the default of 1 every statistic only uses rows
        strictly before the current one, so the current ``Quantity`` (the
        prediction target) does not leak into its own features. ``shift=0``
        reproduces windows that include the current row.

    Returns
    -------
    A new DataFrame; the input frame is not modified.
    """
    # Work on a copy so the caller's frame is not mutated as a side effect.
    out = df.copy()
    quantity_by_product = out.groupby("ProductCode")["Quantity"]

    out['rolling_mean_7'] = quantity_by_product.transform(
        lambda s: s.shift(shift).rolling(7).mean()
    )
    out['rolling_std_7'] = quantity_by_product.transform(
        lambda s: s.shift(shift).rolling(7).std()
    )
    out['rolling_mean_30'] = quantity_by_product.transform(
        lambda s: s.shift(shift).rolling(30).mean()
    )

    # Difference within each product; an ungrouped diff would subtract the
    # previous row's value even when that row belongs to another product.
    out['rolling_diff_7'] = out.groupby("ProductCode")['rolling_mean_7'].diff()

    def _pct_change_7(s):
        history = s.shift(shift)
        change = history / history.shift(7) - 1
        # A zero denominator yields +/-inf; treat it as missing instead.
        return change.replace([np.inf, -np.inf], np.nan)

    out['pct_change_7'] = quantity_by_product.transform(_pct_change_7)
    return out

In [None]:
# Thresholds below which a product is considered to carry too little signal.
MIN_ROWS = 100
MIN_TOTAL_QUANTITY = 300
MIN_DISTINCT_QUANTITIES = 4
# Share of the product's date range used for training (the rest is the test set).
TRAIN_FRACTION = 0.8
# Share of the training rows (the most recent ones) held out for early stopping.
VALIDATION_FRACTION = 0.2


def evaluate(y_true, y_pred, label=""):
    """Print and return (RMSE, MAE, R²) for a pair of actual/predicted series.

    R² is reported as NaN when fewer than two samples are available, because
    it is undefined in that case.
    """
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred) if len(y_true) >= 2 else float("nan")
    print(f"{label} RMSE: {rmse:.2f}, MAE: {mae:.2f}, R²: {r2:.4f}")
    return rmse, mae, r2


for code in random_codes:
    print(f"\nEvaluating ProductCode: {code}")
    df_product = df[df["ProductCode"] == code].copy().sort_index()

    if (
        len(df_product) < MIN_ROWS
        or df_product["Quantity"].sum() < MIN_TOTAL_QUANTITY
        or df_product["Quantity"].nunique() < MIN_DISTINCT_QUANTITIES
    ):
        print("Product skipped — not enough signal to model.")
        continue

    df_product = add_lags(df_product)
    df_product = add_rolling_stats(df_product)
    df_product = create_features(df_product)

    # Infinite feature values (e.g. a percentage change from zero) are treated as missing.
    df_product[FEATURES] = df_product[FEATURES].replace([np.inf, -np.inf], np.nan)
    # Only the model columns decide whether a row is usable; a bare dropna()
    # would also discard rows for gaps in unrelated metadata columns.
    df_product = df_product.dropna(subset=FEATURES + [TARGET])

    if df_product.empty:
        print("Product skipped — no valid date range.")
        continue

    # Chronological split: the first TRAIN_FRACTION of the date range trains, the rest tests.
    first_date = df_product.index.min()
    last_date = df_product.index.max()
    split_date = first_date + pd.Timedelta(
        days=round(TRAIN_FRACTION * (last_date - first_date).days)
    )
    # Copies, so adding the prediction column below does not write into a slice of df_product.
    train = df_product[df_product.index < split_date].copy()
    test = df_product[df_product.index >= split_date].copy()

    # Early stopping must not look at the test set, otherwise the reported test
    # metrics are optimistically biased. Hold out the most recent training rows instead.
    n_fit = int(len(train) * (1 - VALIDATION_FRACTION))
    fit_part = train.iloc[:n_fit]
    val_part = train.iloc[n_fit:]
    if fit_part.empty or val_part.empty or test.empty:
        print("Product skipped — not enough data to split.")
        continue

    X_train = train[FEATURES]
    X_test = test[FEATURES]
    # The model is fitted on log1p(Quantity); predictions are mapped back with expm1.
    # NOTE(review): log1p is undefined for Quantity <= -1 — confirm negative
    # quantities (e.g. returns) cannot occur in this data.
    y_train = np.log1p(train[TARGET])
    y_test = np.log1p(test[TARGET])

    reg = xgb.XGBRegressor(n_estimators=1000, early_stopping_rounds=50, learning_rate=0.1)
    reg.fit(
        fit_part[FEATURES],
        np.log1p(fit_part[TARGET]),
        # Early stopping monitors the last entry of eval_set, i.e. the validation rows.
        eval_set=[
            (fit_part[FEATURES], np.log1p(fit_part[TARGET])),
            (val_part[FEATURES], np.log1p(val_part[TARGET])),
        ],
        verbose=False,
    )

    train["prediction"] = np.expm1(reg.predict(X_train))  # reverse log1p
    test["prediction"] = np.expm1(reg.predict(X_test))

    print("Evaluation on Train Set:")
    evaluate(train[TARGET], train["prediction"], label="Train")

    print("Evaluation on Test Set:")
    rmse, mae, r2 = evaluate(test[TARGET], test["prediction"], label="Test")
    if len(test) < 2:
        print("Not enough test samples to calculate R².")

    fig, ax = plt.subplots(figsize=(15, 5))
    ax.plot(test.index, test[TARGET], label="Actual", color="blue")
    ax.plot(test.index, test["prediction"], label="Predicted", color="orange")
    ax.set_title(f"Forecast vs. Actual for ProductCode {code}")
    ax.set_xlabel("Date")
    ax.set_ylabel("Quantity")
    ax.legend()
    plt.show()
    # Release the figure so looping over many products does not accumulate open figures.
    plt.close(fig)
