In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import TimeSeriesSplit, RandomizedSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler
import matplotlib.pyplot as plt
import pickle

# 1. Read the merged orders/products export; order_date is parsed as datetime
df = pd.read_csv("orders_products_merged.csv", parse_dates=["order_date"])

# 2. Sum `total` per (date, product) group. The descriptive product columns
#    are part of the grouping key so they survive as columns in the result.
#    Rows are then ordered by product and, within a product, by date.
group_keys = ['order_date', 'product_id', 'title', 'price', 'product_type', 'vendor']
revenue = df.groupby(group_keys)[['total']].sum()
daily = revenue.reset_index().sort_values(['product_id', 'order_date'])

# Lag features: `total` of the same product 1, 3, 7, 14 and 28 rows earlier.
# NOTE(review): shift() counts rows, not calendar days -- a lag equals
# "N days ago" only if every product has a row for every date; confirm.
for lag in [1, 3, 7, 14, 28]:
    daily[f'lag_{lag}'] = daily.groupby('product_id')['total'].shift(lag)

# Rolling mean / std over the `window` rows that PRECEDE the current row of
# the same product. The series is shifted by one row before rolling so the
# window ends at the previous row; rolling over the unshifted series would
# put the current row's `total` (the prediction target) inside its own
# feature window.
for window in [3, 7, 14, 28]:
    daily[f'rolling_mean_{window}'] = daily.groupby('product_id')['total'].transform(
        lambda s: s.shift(1).rolling(window).mean()
    )
    daily[f'rolling_std_{window}'] = daily.groupby('product_id')['total'].transform(
        lambda s: s.shift(1).rolling(window).std()
    )

# Calendar features derived from the order date
order_dates = daily['order_date']
daily['dayofweek'] = order_dates.dt.dayofweek
daily['month'] = order_dates.dt.month
daily['weekofyear'] = order_dates.dt.isocalendar().week  # ISO week number

# Holiday flags (add more if necessary): 1 when the order date is exactly one
# of the listed dates, otherwise 0.
holidays = [
    '2023-01-01', '2024-01-01', '2025-01-01',
    '2023-02-14', '2024-02-14', '2025-02-14',
    '2023-11-23', '2024-11-28', '2025-11-27',
    '2023-12-25', '2024-12-25', '2025-12-25',
]
holiday_stamps = [pd.Timestamp(day) for day in holidays]
daily['is_holiday'] = order_dates.isin(holiday_stamps).astype(int)

# Product baselines: running mean / std of each product's `total`, computed
# only from the rows that precede the current one (shift by one row, then an
# expanding window).
# A single per-product aggregate over the whole frame would fold the current
# row's target -- and rows that later end up in the held-out tail -- into
# every row's `prod_avg` / `prod_std`.
# Relies on `daily` being ordered by date within each product, which is how
# it was sorted when it was built.
daily['prod_avg'] = daily.groupby('product_id')['total'].transform(
    lambda s: s.shift(1).expanding().mean()
)
daily['prod_std'] = daily.groupby('product_id')['total'].transform(
    lambda s: s.shift(1).expanding().std()
)
# Give the frame a clean 0..n-1 index in its current row order.
daily = daily.reset_index(drop=True)

# One-hot encode the categorical product attributes.
# handle_unknown='ignore' makes categories unseen at fit time encode as all
# zeros instead of raising; sparse_output=False yields a dense array.
cat_cols = ['product_type', 'vendor']
enc = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
enc.fit(daily[cat_cols])
cat_features = enc.transform(daily[cat_cols])
cat_feature_names = enc.get_feature_names_out(cat_cols)

# Wrap the encoded matrix in a frame that shares `daily`'s index so the
# column-wise concat lines rows up one-to-one.
cat_df = pd.DataFrame(
    cat_features,
    columns=cat_feature_names,
    index=daily.index,
)
daily = pd.concat([daily, cat_df], axis=1)

# Drop rows with NaN in any column (e.g. rows whose lag / rolling features
# are not defined yet because there is not enough earlier history).
daily = daily.dropna()

# Put the rows in date order across ALL products before splitting. Until now
# the frame is ordered by product first, so slicing off its tail would hold
# out the last products rather than the most recent dates. A stable sort
# keeps the existing relative order of rows that share a date.
daily = daily.sort_values('order_date', kind='stable')

# 3. Prepare features and target
feature_cols = (
    ['price', 'prod_avg', 'prod_std', 'lag_1', 'lag_3', 'lag_7', 'lag_14', 'lag_28',
     'rolling_mean_3', 'rolling_mean_7', 'rolling_mean_14', 'rolling_mean_28',
     'rolling_std_3', 'rolling_std_7', 'rolling_std_14', 'rolling_std_28',
     'dayofweek', 'month', 'weekofyear', 'is_holiday']
    + list(cat_feature_names)
)
X = daily[feature_cols]
y = daily['total']

# Chronological train-test split: the most recent 20% of rows are held out.
test_size = int(0.2 * len(X))
if test_size == 0:
    # With test_size == 0 the slice [:-0] would silently produce an EMPTY
    # training set, so fail loudly instead.
    raise ValueError(
        "Not enough rows left after dropna() to hold out a 20% test set"
    )
X_train_df, X_test_df = X.iloc[:-test_size], X.iloc[-test_size:]
y_train, y_test = y.iloc[:-test_size], y.iloc[-test_size:]

# Scale features. The scaler is fitted on the training rows only so that the
# held-out rows do not influence the mean / variance used for scaling.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_df)
X_test = scaler.transform(X_test_df)

# Full scaled matrix (train rows followed by test rows), kept so that any
# code expecting `X_scaled` continues to work.
X_scaled = scaler.transform(X)

# 4. Hyperparameter tuning
# Candidate values sampled by the randomized search.
param_dist = {
    'n_estimators': [100, 200, 300, 400],
    'max_depth': [8, 10, 12, 15, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4, 10],
    'max_features': ['sqrt', 'log2', None]
}
# Expanding-window CV: each fold trains on an initial slice of the rows and
# validates on the slice that follows it.
# NOTE(review): this is only a time-ordered validation if the rows of
# X_train are in chronological order -- confirm upstream ordering.
tscv = TimeSeriesSplit(n_splits=3)
rf = RandomForestRegressor(random_state=42, n_jobs=-1)
rand_search = RandomizedSearchCV(
    rf,
    param_distributions=param_dist,
    n_iter=20,
    cv=tscv,
    verbose=2,
    n_jobs=-1,
    scoring='neg_mean_absolute_error',
    # Seed the candidate sampling too; seeding only the forest still lets
    # the 20 sampled parameter sets (and thus the chosen model) vary
    # from run to run.
    random_state=42,
)
rand_search.fit(X_train, y_train)

# 5. Evaluation and Visualization
# Score the refitted best estimator on the held-out rows.
best_model = rand_search.best_estimator_
y_pred = best_model.predict(X_test)
print("Best Params:", rand_search.best_params_)
for label, metric in (
    ("MAE:", mean_absolute_error),
    ("MSE:", mean_squared_error),
    ("R2 Score:", r2_score),
):
    print(label, metric(y_test, y_pred))

# Overlay actual and predicted revenue, in held-out row order.
fig, ax = plt.subplots(figsize=(12, 5))
ax.plot(y_test.values, label="Actual Sales Revenue")
ax.plot(y_pred, label="Predicted Sales Revenue")
ax.set_title("Actual vs Predicted Sales Revenue (Tuned Model)")
ax.legend()
fig.tight_layout()
plt.show()

# Feature importance: list the ten features the fitted forest weights most.
importances = best_model.feature_importances_
sorted_idx = np.flip(np.argsort(importances))  # indices, most important first
print("Top features:")
for col_idx in sorted_idx[:10]:
    name, weight = feature_cols[col_idx], importances[col_idx]
    print(f"{name}: {weight:.3f}")

# 6. Save everything needed to rebuild model inputs at inference time.
# Besides the model and scaler, the fitted one-hot encoder and the exact
# feature column order are persisted; without them the categorical columns
# and the column layout the scaler/model were fitted on cannot be reproduced.
# SECURITY: pickle files execute code on load -- only unpickle these
# artifacts from a trusted location.
artifacts = {
    'sales_forecast_model.pkl': best_model,
    'feature_scaler.pkl': scaler,
    'onehot_encoder.pkl': enc,
    'feature_columns.pkl': list(feature_cols),
}
for artifact_path, artifact in artifacts.items():
    with open(artifact_path, 'wb') as f:
        pickle.dump(artifact, f)
