In [1]:
import numpy as np
import pandas as pd

# Synthetic weekly panel: one row per (customer, week) with Poisson issue
# counts and a random-walk GSAR series per customer.
np.random.seed(42)

n_customers = 50
n_weeks = 104  # 2 years of weekly data
customer_ids = [f"CUST{i+1:03}" for i in range(n_customers)]
weeks = pd.date_range("2023-01-02", periods=n_weeks, freq="W-MON")

# Week labels are the same for every customer, so build them once.
# ISO year/week is used instead of strftime("%U"): "%U" counts the days before
# the first Sunday of a year as week 00, which labelled 2024-01-01 as "2024-W00"
# while 2023-01-02 was "2023-W01" -- the same seasonal position ended up with
# different week numbers in the two years. ISO numbering gives 01..52 for both.
week_labels = []
for ts in weeks:
    iso_year, iso_week, _ = ts.isocalendar()
    week_labels.append(f"{iso_year}-W{iso_week:02d}")

data = []
for cust in customer_ids:
    # NOTE: the order of the random draws below is kept fixed so that the seed
    # reproduces the same numbers.
    base_gsar = np.random.uniform(1000, 5000)
    gsar = base_gsar + np.cumsum(np.random.normal(0, 25, size=n_weeks))
    issue_delivery = np.random.poisson(lam=2, size=n_weeks)
    issue_billing = np.random.poisson(lam=1, size=n_weeks)
    issue_service = np.random.poisson(lam=0.5, size=n_weeks)
    data.extend(
        [cust, label, n_delivery, n_billing, n_service, gsar_value]
        for label, n_delivery, n_billing, n_service, gsar_value in zip(
            week_labels, issue_delivery, issue_billing, issue_service, gsar
        )
    )

df = pd.DataFrame(data, columns=[
    "CUSTOMER_ID", "FISCAL_WEEK_YEAR", "ISSUE_DELIVERY", "ISSUE_BILLING", "ISSUE_SERVICE", "GSAR"
])
# Convert week to number for seasonality (last two characters of "YYYY-Www")
df['FISCAL_WEEK'] = df['FISCAL_WEEK_YEAR'].str[-2:].astype(int)


In [4]:
# Feature engineering on the weekly panel.
# Every per-customer operation groups on this key so that lags, windows and
# differences are computed within one customer's rows only. The row order of
# `df` is used as-is (no sorting happens here).
cust_key = df['CUSTOMER_ID']

# Lagged copies (1 and 4 rows back) of the issue counts and of GSAR
lag_specs = [
    (col, lag)
    for col in ['ISSUE_DELIVERY', 'ISSUE_BILLING', 'ISSUE_SERVICE', 'GSAR']
    for lag in [1, 4]
]
for col, lag in lag_specs:
    df[f'{col}_lag_{lag}'] = df[col].groupby(cust_key).shift(lag)

# Trailing 4- and 8-row sums of delivery and billing issues; min_periods=1
# means early rows use however many observations are available.
roll_specs = [
    (window, col)
    for window in [4, 8]
    for col in ['ISSUE_DELIVERY', 'ISSUE_BILLING']
]
for window, col in roll_specs:
    windowed = df[col].groupby(cust_key).rolling(window, min_periods=1).sum()
    # Drop the customer level so the result lines up with df's own index.
    df[f'{col}_roll_sum_{window}'] = windowed.droplevel(0)

# Delivery-to-billing ratio; the +1 keeps the denominator non-zero
billing_plus_one = df['ISSUE_BILLING'] + 1
df['DELIVERY_BILLING_RATIO'] = df['ISSUE_DELIVERY'] / billing_plus_one

# Cyclical encoding of the week number on a 52-week circle
week_angle = 2 * np.pi * df['FISCAL_WEEK'] / 52
df['WEEK_SIN'] = np.sin(week_angle)
df['WEEK_COS'] = np.cos(week_angle)

# Tenure: 0-based running row count within each customer
df['CUSTOMER_TENURE'] = df.groupby(cust_key).cumcount()

# Target: row-over-row change in GSAR within each customer
df['GSAR_CHANGE'] = df['GSAR'].groupby(cust_key).diff()

# The lag/diff columns are NaN for each customer's first rows; drop those rows
df = df.dropna()
df = df.reset_index(drop=True)

In [5]:
# Preview the first rows of the engineered frame (rows containing NaNs were
# already removed by the preceding dropna()).
df.head()

Unnamed: 0,CUSTOMER_ID,FISCAL_WEEK_YEAR,ISSUE_DELIVERY,ISSUE_BILLING,ISSUE_SERVICE,GSAR,FISCAL_WEEK,ISSUE_DELIVERY_lag_1,ISSUE_DELIVERY_lag_4,ISSUE_BILLING_lag_1,...,GSAR_lag_4,ISSUE_DELIVERY_roll_sum_4,ISSUE_BILLING_roll_sum_4,ISSUE_DELIVERY_roll_sum_8,ISSUE_BILLING_roll_sum_8,DELIVERY_BILLING_RATIO,WEEK_SIN,WEEK_COS,CUSTOMER_TENURE,GSAR_CHANGE
0,CUST001,2023-W05,1,2,0,2496.052988,5,2.0,1.0,0.0,...,2470.363472,5.0,8.0,6.0,8.0,0.333333,0.568065,0.822984,4,-14.521953
1,CUST001,2023-W06,3,1,0,2482.923743,6,1.0,0.0,2.0,...,2478.336027,8.0,7.0,9.0,9.0,1.5,0.663123,0.748511,5,-13.129245
2,CUST001,2023-W07,2,1,0,2468.639239,7,3.0,2.0,1.0,...,2485.312059,8.0,4.0,11.0,10.0,1.0,0.748511,0.663123,6,-14.284504
3,CUST001,2023-W08,3,1,1,2445.537168,8,2.0,2.0,1.0,...,2510.574941,9.0,5.0,14.0,11.0,1.5,0.822984,0.568065,7,-23.102071
4,CUST001,2023-W09,2,0,0,2380.223443,9,3.0,1.0,1.0,...,2496.052988,10.0,3.0,15.0,11.0,2.0,0.885456,0.464723,8,-65.313725


In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_absolute_error, root_mean_squared_error

# Model inputs: current-week issue counts, their lags and rolling sums,
# lagged GSAR, the delivery/billing ratio, seasonality and tenure.
feature_cols = [
    'ISSUE_DELIVERY', 'ISSUE_BILLING', 'ISSUE_SERVICE',
    'ISSUE_DELIVERY_lag_1', 'ISSUE_BILLING_lag_1', 'ISSUE_SERVICE_lag_1', 'GSAR_lag_1',
    'ISSUE_DELIVERY_lag_4', 'ISSUE_BILLING_lag_4', 'ISSUE_SERVICE_lag_4', 'GSAR_lag_4',
    'ISSUE_DELIVERY_roll_sum_4', 'ISSUE_BILLING_roll_sum_4',
    'ISSUE_DELIVERY_roll_sum_8', 'ISSUE_BILLING_roll_sum_8',
    'DELIVERY_BILLING_RATIO',
    'WEEK_SIN', 'WEEK_COS', 'CUSTOMER_TENURE'
]

# Keep the frame in a deterministic (customer, week) order.
df = df.sort_values(by=["CUSTOMER_ID", "FISCAL_WEEK_YEAR"]).reset_index(drop=True)

X = df[feature_cols]
y = df['GSAR_CHANGE']

# 3. TimeSeriesSplit Cross-Validation
# The frame is a panel ordered by customer first, so splitting on row position
# would separate folds by customer (training rows would contain weeks that are
# later than the validation weeks). Instead, split the sorted list of unique
# weeks and map each fold back to rows: every fold trains on all customers'
# earlier weeks and validates on all customers' later weeks.
# The zero-padded "YYYY-Www" labels sort lexicographically in time order.
n_splits = 5
tscv = TimeSeriesSplit(n_splits=n_splits)
week_labels = np.array(sorted(df["FISCAL_WEEK_YEAR"].unique()))
mae_scores = []
rmse_scores = []

for fold, (train_week_idx, val_week_idx) in enumerate(tscv.split(week_labels)):
    train_mask = df["FISCAL_WEEK_YEAR"].isin(week_labels[train_week_idx])
    val_mask = df["FISCAL_WEEK_YEAR"].isin(week_labels[val_week_idx])
    X_train, X_val = X.loc[train_mask], X.loc[val_mask]
    y_train, y_val = y.loc[train_mask], y.loc[val_mask]

    model = RandomForestRegressor(
        n_estimators=100,
        max_depth=10,
        min_samples_split=5,
        random_state=42,
        n_jobs=-1
    )
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    mae = mean_absolute_error(y_val, y_pred)
    rmse = root_mean_squared_error(y_val, y_pred)
    mae_scores.append(mae)
    rmse_scores.append(rmse)
    # The second metric is RMSE (it was previously printed under the label "R2").
    print(f"Fold {fold+1}: MAE={mae:.2f}, RMSE={rmse:.3f}")

print(f"\nAverage MAE over {n_splits} splits: {np.mean(mae_scores):.2f}")
print(f"Average RMSE over {n_splits} splits: {np.mean(rmse_scores):.3f}")

# Optional: Feature importance from the final fold's model
# (`model` here is whatever was fitted in the last loop iteration).
importances = model.feature_importances_
print("\nFeature Importances (last fold):")
for col, imp in sorted(zip(feature_cols, importances), key=lambda x: -x[1]):
    print(f"{col}: {imp:.3f}")
