In [1]:
import numpy as np
import pandas as pd

np.random.seed(42)

n_customers = 50
n_weeks = 104  # 2 years of weekly data
customer_ids = [f"CUST{i+1:03}" for i in range(n_customers)]
weeks = pd.date_range("2023-01-02", periods=n_weeks, freq="W-MON")

data = []
for cust in customer_ids:
    base_gsar = np.random.uniform(1000, 5000)
    gsar = base_gsar + np.cumsum(np.random.normal(0, 25, size=n_weeks))
    issue_delivery = np.random.poisson(lam=2, size=n_weeks)
    issue_billing = np.random.poisson(lam=1, size=n_weeks)
    issue_service = np.random.poisson(lam=0.5, size=n_weeks)
    for i in range(n_weeks):
        data.append([
            cust,
            weeks[i].strftime("%Y-W%U"),
            issue_delivery[i],
            issue_billing[i],
            issue_service[i],
            gsar[i]
        ])

df = pd.DataFrame(data, columns=[
    "CUSTOMER_ID", "FISCAL_WEEK_YEAR", "ISSUE_DELIVERY", "ISSUE_BILLING", "ISSUE_SERVICE", "GSAR"
])


In [2]:
# Convert week to number for seasonality
df['FISCAL_WEEK'] = df['FISCAL_WEEK_YEAR'].str[-2:].astype(int)

# Lag features
for col in ['ISSUE_DELIVERY', 'ISSUE_BILLING', 'ISSUE_SERVICE', 'GSAR']:
    for lag in [1, 4]:
        df[f'{col}_lag_{lag}'] = df.groupby('CUSTOMER_ID')[col].shift(lag)

# Rolling sums
for window in [4, 8]:
    for col in ['ISSUE_DELIVERY', 'ISSUE_BILLING']:
        df[f'{col}_roll_sum_{window}'] = (
            df.groupby('CUSTOMER_ID')[col]
            .rolling(window, min_periods=1)
            .sum()
            .reset_index(0, drop=True)
        )

# Ratio feature
df['DELIVERY_BILLING_RATIO'] = df['ISSUE_DELIVERY'] / (df['ISSUE_BILLING'] + 1)

# Seasonal (cyclical) features: sine/cosine of week number
df['WEEK_SIN'] = np.sin(2 * np.pi * df['FISCAL_WEEK'] / 52)
df['WEEK_COS'] = np.cos(2 * np.pi * df['FISCAL_WEEK'] / 52)

# Customer tenure
df['CUSTOMER_TENURE'] = df.groupby('CUSTOMER_ID').cumcount()

# Target variable: weekly change in sales
df['GSAR_CHANGE'] = df.groupby('CUSTOMER_ID')['GSAR'].diff()

# Remove initial NaNs
df = df.dropna().reset_index(drop=True)


In [5]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_absolute_error, root_mean_squared_error

feature_cols = [
    'ISSUE_DELIVERY', 'ISSUE_BILLING', 'ISSUE_SERVICE',
    'ISSUE_DELIVERY_lag_1', 'ISSUE_BILLING_lag_1', 'ISSUE_SERVICE_lag_1', 'GSAR_lag_1',
    'ISSUE_DELIVERY_lag_4', 'ISSUE_BILLING_lag_4', 'ISSUE_SERVICE_lag_4', 'GSAR_lag_4',
    'ISSUE_DELIVERY_roll_sum_4', 'ISSUE_BILLING_roll_sum_4',
    'ISSUE_DELIVERY_roll_sum_8', 'ISSUE_BILLING_roll_sum_8',
    'DELIVERY_BILLING_RATIO',
    'WEEK_SIN', 'WEEK_COS', 'CUSTOMER_TENURE'
]

X = df[feature_cols]
y = df['GSAR_CHANGE']

# Chronological split: last 20% for the test set
split_idx = int(0.8 * len(df))
X_train, X_test = X.iloc[:split_idx], X.iloc[split_idx:]
y_train, y_test = y.iloc[:split_idx], y.iloc[split_idx:]

# Model
rf = RandomForestRegressor(
    n_estimators=100,
    max_depth=10,
    min_samples_split=5,
    random_state=42,
    n_jobs=-1
)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

print("MAE:", mean_absolute_error(y_test, y_pred))
print("RMSE:", root_mean_squared_error(y_test, y_pred))

# Feature importance inspection
importances = rf.feature_importances_
for col, imp in sorted(zip(feature_cols, importances), key=lambda x: -x[1]):
    print(f"{col}: {imp:.3f}")


MAE: 19.874794193484778
RMSE: 24.781916599914478
GSAR_lag_1: 0.139
GSAR_lag_4: 0.113
CUSTOMER_TENURE: 0.080
ISSUE_DELIVERY_roll_sum_8: 0.073
WEEK_SIN: 0.067
ISSUE_DELIVERY_roll_sum_4: 0.062
ISSUE_BILLING_roll_sum_8: 0.061
WEEK_COS: 0.061
DELIVERY_BILLING_RATIO: 0.047
ISSUE_BILLING_roll_sum_4: 0.045
ISSUE_DELIVERY_lag_4: 0.040
ISSUE_DELIVERY_lag_1: 0.040
ISSUE_BILLING_lag_4: 0.028
ISSUE_SERVICE_lag_1: 0.028
ISSUE_DELIVERY: 0.026
ISSUE_BILLING: 0.024
ISSUE_BILLING_lag_1: 0.024
ISSUE_SERVICE: 0.020
ISSUE_SERVICE_lag_4: 0.020
