In [None]:
import os
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
import joblib

# Add feature_engineering module to path
sys.path.append("../feature_engineering")
from feature_utils import make_features

# Create the output directory for saved figures (relative to the notebook's
# working directory); exist_ok=True makes re-running this cell a no-op.
os.makedirs("images", exist_ok=True)

# Apply seaborn's "whitegrid" style to the plots drawn below
sns.set_style("whitegrid")
%matplotlib inline

## 1. Load Data for Multiple Store–Dept Combinations

Instead of filtering to a single Store–Dept pair, we'll load **multiple combinations** to train a global model.

In [None]:
# Load full training data
df = pd.read_csv("../baseline_prophet_forecast/data/train.csv", parse_dates=["Date"])

# Select a representative sample of Store-Dept combinations
# For demo, use Store 1 with multiple departments + Store 2 with some departments
selected_combos = [
    (1, 1), (1, 2), (1, 3), (1, 4), (1, 5),  # Store 1, multiple depts
    (2, 1), (2, 2), (2, 3),                   # Store 2, multiple depts
    (3, 1), (3, 2),                           # Store 3, few depts (simulate limited history)
]

# Filter to selected combinations.
# Build a (Store, Dept) MultiIndex and test membership in a single vectorized
# call instead of evaluating a Python lambda once per row of the full dataset.
combo_mask = pd.MultiIndex.from_frame(df[['Store', 'Dept']]).isin(selected_combos)
df_filtered = df[combo_mask].copy()

# Row count per entity, computed once and reused by both summaries below
records_per_combo = df_filtered.groupby(['Store', 'Dept']).size()

print(f"Total records: {len(df_filtered):,}")
print(f"\nStore-Dept combinations: {records_per_combo.shape[0]}")
print("\nRecords per combination:")
print(records_per_combo)

## 2. Feature Engineering with Entity IDs

Global models require:
1. **Time-series features** (lags, rolling stats) computed per entity
2. **Entity identifiers** (Store, Dept) as categorical features for cross-learning

In [None]:
def make_global_features(df):
    """
    Build one stacked feature table covering every Store-Dept pair in ``df``.

    Each (Store, Dept) group is sorted by ``Date``, indexed by ``Date`` and
    passed to ``make_features`` with ``target='Weekly_Sales'``. The resulting
    frame then receives the entity identifiers (``Store``, ``Dept``) and the
    ``Weekly_Sales`` target, aligned on the date index. All per-entity frames
    are concatenated row-wise and returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``Store``, ``Dept``, ``Date`` and
        ``Weekly_Sales``.

    Returns
    -------
    pandas.DataFrame
        Row-wise concatenation of the per-entity feature frames.
    """
    def _entity_frame(store, dept, rows):
        # Chronological, date-indexed view of a single entity
        by_date = rows.sort_values('Date').set_index('Date')

        # Time-series features come from the shared helper
        feats = make_features(by_date, target='Weekly_Sales')

        # Identifiers first, then the target (same column order as before)
        feats['Store'] = store
        feats['Dept'] = dept
        feats['Weekly_Sales'] = by_date['Weekly_Sales']
        return feats

    per_entity = [
        _entity_frame(store, dept, rows)
        for (store, dept), rows in df.groupby(['Store', 'Dept'])
    ]

    # Stack all entities into a single frame
    return pd.concat(per_entity, axis=0)

# Build the combined feature table for every selected Store-Dept pair
features_df = make_global_features(df_filtered)

# Quick look at the result: dimensions, the first ten column names, first rows
leading_columns = features_df.columns[:10].tolist()
print(f"Global feature dataset shape: {features_df.shape}")
print(f"\nFeature columns: {leading_columns}...")
print("\nSample data:")
print(features_df.head())

## 3. Train-Test Split

For time-series forecasting:
- **Training**: Earlier time periods (e.g., first 80% of data per entity)
- **Testing**: Recent time periods (last 20%) to simulate forward-looking forecasts

In [None]:
# Chronological 80/20 split done separately for each Store-Dept entity:
# the first 80% of an entity's rows (by index order) go to train, the rest to
# test. The cutoff is positional, not a single shared calendar date.
train_list, test_list = [], []

for entity_key, entity_rows in features_df.groupby(['Store', 'Dept']):
    chronological = entity_rows.sort_index()
    split_at = int(len(chronological) * 0.8)

    train_list.append(chronological.iloc[:split_at])
    test_list.append(chronological.iloc[split_at:])

train_df = pd.concat(train_list)
test_df = pd.concat(test_list)

train_start, train_end = train_df.index.min(), train_df.index.max()
test_start, test_end = test_df.index.min(), test_df.index.max()

print(f"Training samples: {len(train_df):,}")
print(f"Testing samples: {len(test_df):,}")
print(f"\nDate range - Train: {train_start} to {train_end}")
print(f"Date range - Test: {test_start} to {test_end}")

## 4. Prepare Features and Target

Remove any rows with NaN values (from lagging/rolling window startup) and separate features from target.

In [None]:
# Drop NaN values created by lagging/rolling features
train_clean = train_df.dropna()
test_clean = test_df.dropna()

# Separate features and target
target_col = 'Weekly_Sales'
feature_cols = [col for col in train_clean.columns if col != target_col]

# .copy() gives the feature matrices their own data, so later cells can change
# them (e.g. dtype conversions) without writing into a slice of
# train_clean / test_clean, which would raise SettingWithCopyWarning.
X_train = train_clean[feature_cols].copy()
y_train = train_clean[target_col]

X_test = test_clean[feature_cols].copy()
y_test = test_clean[target_col]

print(f"Training set: {X_train.shape[0]} samples, {X_train.shape[1]} features")
print(f"Testing set: {X_test.shape[0]} samples")
print("\nFeature dtypes:")
print(X_train.dtypes.value_counts())

## 5. Train Global LightGBM Model

Key hyperparameters for global models:
- **Categorical features**: Store and Dept encoded as categories for efficient tree splits
- **More trees**: Global models benefit from higher `n_estimators` due to diverse patterns
- **Regularization**: Higher `reg_alpha`/`reg_lambda` to prevent overfitting to dominant entities

In [None]:
# Convert Store and Dept to categorical for LightGBM.
# astype() with a per-column mapping returns a new frame, so nothing is
# assigned into a slice of train_clean / test_clean (no SettingWithCopyWarning)
# and re-running this cell is harmless.
categorical_features = ['Store', 'Dept']
category_dtypes = {col: 'category' for col in categorical_features}
X_train = X_train.astype(category_dtypes)
X_test = X_test.astype(category_dtypes)

# Train global LightGBM model
model = LGBMRegressor(
    n_estimators=800,           # More trees for global model
    learning_rate=0.05,
    max_depth=6,
    num_leaves=31,
    min_child_samples=20,
    subsample=0.8,
    # LightGBM only performs row subsampling (bagging) when subsample_freq > 0;
    # with the default of 0 the subsample=0.8 setting above is silently ignored.
    subsample_freq=1,
    colsample_bytree=0.8,
    reg_alpha=0.1,              # L1 regularization
    reg_lambda=0.1,             # L2 regularization
    random_state=42,
    verbose=-1
)

print("Training global LightGBM model...")
model.fit(
    X_train,
    y_train,
    categorical_feature=categorical_features
)

print("‚úì Training complete!")

print("‚úì Training complete!")

## 6. Evaluate Global Model Performance

Evaluate both:
- **Overall metrics**: Aggregated across all Store-Dept combinations
- **Per-entity metrics**: Performance breakdown by Store-Dept to identify weak performers

In [None]:
# Generate predictions
y_pred = model.predict(X_test)

# Overall metrics
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

# MAPE is undefined where the actual value is 0: the division yields inf and
# poisons the mean. Average only over rows with non-zero actual sales, and
# report NaN if there are none.
actual_values = y_test.to_numpy()
nonzero_actual = actual_values != 0
if nonzero_actual.any():
    pct_errors = np.abs(
        (actual_values[nonzero_actual] - y_pred[nonzero_actual])
        / actual_values[nonzero_actual]
    )
    mape = pct_errors.mean() * 100
else:
    mape = np.nan

print("=" * 50)
print("GLOBAL MODEL PERFORMANCE")
print("=" * 50)
print(f"MAE:  ${mae:,.2f}")
print(f"RMSE: ${rmse:,.2f}")
print(f"MAPE: {mape:.2f}%")
print("=" * 50)

In [None]:
# Per-entity performance analysis
results_df = X_test.copy()
results_df['Actual'] = y_test.values
results_df['Predicted'] = y_pred
results_df['Error'] = results_df['Actual'] - results_df['Predicted']
results_df['Abs_Error'] = np.abs(results_df['Error'])

# Calculate per-entity metrics.
# Store and Dept are categorical at this point; observed=True restricts the
# result to Store-Dept pairs that actually occur. Without it pandas emits one
# row per combination of categories, adding NaN-MAE / zero-count rows for
# pairs that were never selected and distorting the rankings printed below.
entity_metrics = (
    results_df
    .groupby(['Store', 'Dept'], observed=True)
    .agg(MAE=('Abs_Error', 'mean'), Test_Samples=('Actual', 'count'))
    .round(2)
    .sort_values('MAE', ascending=False)
)

print("\nPer-Entity Performance (Top 10 by MAE):")
print(entity_metrics.head(10))

print("\nPer-Entity Performance (Bottom 10 by MAE):")
print(entity_metrics.tail(10))

## 7. Feature Importance Analysis

Understanding which features drive predictions in a global model:
- **Entity features** (Store, Dept): High importance indicates entity-specific patterns
- **Time-series features** (lags, rolling stats): High importance indicates temporal patterns
- **Calendar features** (month, weekofyear): High importance indicates seasonality

In [None]:
# Pair each training column with the importance score reported by the fitted
# model, highest score first
importance_df = (
    pd.DataFrame({
        'Feature': X_train.columns,
        'Importance': model.feature_importances_,
    })
    .sort_values('Importance', ascending=False)
)

# Horizontal bar chart of the 20 highest-ranked features
top_features = importance_df.head(20)

fig, ax = plt.subplots(figsize=(10, 8))
sns.barplot(data=top_features, y='Feature', x='Importance', palette='viridis', ax=ax)
ax.set_title('Top 20 Feature Importances (Global Model)', fontsize=14, fontweight='bold')
ax.set_xlabel('Importance Score')
ax.set_ylabel('Feature')
fig.tight_layout()
fig.savefig('images/global_feature_importance.png', dpi=150, bbox_inches='tight')
plt.show()

print("\nTop 15 Features:")
print(importance_df.head(15))

## 8. Visualize Predictions for Selected Entities

Compare actual vs predicted for a few Store-Dept combinations to assess model quality.

In [None]:
# Entities to visualize; one subplot row is drawn per entry
viz_entities = [(1, 1), (2, 1), (3, 1)]

# Derive the grid from the list so adding or removing an entity does not break
# the axes indexing. squeeze=False always returns a 2-D axes array (also for a
# single entity); the height scales with the panel count and equals the
# original 10 inches for three panels.
n_panels = len(viz_entities)
fig, axes = plt.subplots(
    n_panels, 1, figsize=(14, 10 * n_panels / 3), squeeze=False
)
axes = axes[:, 0]

for ax, (store, dept) in zip(axes, viz_entities):
    # Filter results for this entity
    entity_results = results_df[
        (results_df['Store'] == store) & (results_df['Dept'] == dept)
    ].sort_index()

    # Actual vs. predicted, with the gap between them shaded
    ax.plot(entity_results.index, entity_results['Actual'],
            label='Actual', linewidth=2, marker='o', markersize=4)
    ax.plot(entity_results.index, entity_results['Predicted'],
            label='Predicted', linewidth=2, marker='s', markersize=4, alpha=0.7)
    ax.fill_between(entity_results.index,
                    entity_results['Actual'],
                    entity_results['Predicted'],
                    alpha=0.2, color='red')

    entity_mae = entity_metrics.loc[(store, dept), 'MAE']
    ax.set_title(f'Store {store}, Dept {dept} | MAE: ${entity_mae:,.2f}',
                 fontsize=12, fontweight='bold')
    ax.set_ylabel('Weekly Sales ($)')
    ax.legend()
    ax.grid(True, alpha=0.3)

axes[-1].set_xlabel('Date')
plt.tight_layout()
plt.savefig('images/global_model_predictions.png', dpi=150, bbox_inches='tight')
plt.show()

## 9. Cold-Start Scenario Simulation

**Key advantage of global models**: Can predict for new entities with **zero historical data** by leveraging patterns from similar entities.

### Simulation:
1. Pretend Store 3, Dept 2 is a **brand new entity** with no training data
2. Remove it from training set
3. Train a new global model
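4. Predict for Store 3, Dept 2 with the model that never saw this entity during training (the inputs are its entity IDs plus its engineered features for the test period)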
5. Compare to actual values

In [None]:
# Store-Dept pair that plays the role of a never-before-seen entity
cold_start_entity = (3, 2)
cold_store, cold_dept = cold_start_entity

# Training rows of every entity except the cold-start one
train_no_coldstart = train_clean[
    (train_clean['Store'] != cold_store) | (train_clean['Dept'] != cold_dept)
]

# Test-period rows that belong to the cold-start entity
is_cold_test_row = (test_clean['Store'] == cold_store) & (test_clean['Dept'] == cold_dept)
cold_start_test = test_clean[is_cold_test_row]

print(f"Cold-start entity: Store {cold_store}, Dept {cold_dept}")
print(f"Training samples (excluding cold-start): {len(train_no_coldstart):,}")
print(f"Cold-start test samples: {len(cold_start_test)}")

In [None]:
# Train model without cold-start entity.
# astype() returns a new frame, so the categorical conversion never writes
# into a slice of train_no_coldstart (avoids SettingWithCopyWarning).
X_train_no_cold = train_no_coldstart[feature_cols].astype(
    {col: 'category' for col in categorical_features}
)
y_train_no_cold = train_no_coldstart[target_col]

# Reuse the exact hyperparameters of the main global model so that any
# difference in accuracy reflects the missing entity rather than a different
# model configuration.
model_no_cold = LGBMRegressor(**model.get_params())

print("Training model WITHOUT cold-start entity...")
model_no_cold.fit(
    X_train_no_cold,
    y_train_no_cold,
    categorical_feature=categorical_features
)
print("‚úì Training complete!")

In [None]:
# Predict for cold-start entity.
# astype() returns a new frame, so the categorical conversion never writes
# into a slice of cold_start_test (avoids SettingWithCopyWarning).
# NOTE(review): feature_cols holds every non-target column, so if it includes
# lag/rolling features of Weekly_Sales the prediction also draws on this
# entity's own recent sales - confirm before presenting this as a forecast
# that needs no history at all.
X_cold_start = cold_start_test[feature_cols].astype(
    {col: 'category' for col in categorical_features}
)
y_cold_start = cold_start_test[target_col]

y_cold_pred = model_no_cold.predict(X_cold_start)

# Evaluate cold-start performance
cold_mae = mean_absolute_error(y_cold_start, y_cold_pred)
cold_rmse = np.sqrt(mean_squared_error(y_cold_start, y_cold_pred))

# MAPE is undefined where the actual value is 0 (division yields inf), so
# average only over rows with non-zero actual sales; NaN if there are none.
cold_actual = y_cold_start.to_numpy()
cold_nonzero = cold_actual != 0
if cold_nonzero.any():
    cold_pct_errors = np.abs(
        (cold_actual[cold_nonzero] - y_cold_pred[cold_nonzero])
        / cold_actual[cold_nonzero]
    )
    cold_mape = cold_pct_errors.mean() * 100
else:
    cold_mape = np.nan

print("=" * 50)
print(f"COLD-START PERFORMANCE (Store {cold_start_entity[0]}, Dept {cold_start_entity[1]})")
print("=" * 50)
print(f"MAE:  ${cold_mae:,.2f}")
print(f"RMSE: ${cold_rmse:,.2f}")
print(f"MAPE: {cold_mape:.2f}%")
print("=" * 50)
print("\n‚úì Model successfully predicted for an entity with ZERO training history!")
print("  This demonstrates the power of global models for handling new stores/products.")

In [None]:
# Actual vs. predicted weekly sales for the held-out entity, drawn on an
# explicit Axes object
held_out_store, held_out_dept = cold_start_entity
plot_dates = cold_start_test.index

fig, ax = plt.subplots(figsize=(14, 5))
ax.plot(plot_dates, y_cold_start,
        label='Actual (New Entity)', linewidth=2, marker='o', markersize=6)
ax.plot(plot_dates, y_cold_pred,
        label='Predicted (Global Model)', linewidth=2, marker='s', markersize=6, alpha=0.7)

# Shade the gap between the two curves
ax.fill_between(plot_dates, y_cold_start, y_cold_pred, alpha=0.2, color='orange')

ax.set_title(
    f'Cold-Start Prediction: Store {held_out_store}, Dept {held_out_dept} (Zero Training History)\nMAE: ${cold_mae:,.2f}',
    fontsize=14, fontweight='bold',
)
ax.set_xlabel('Date')
ax.set_ylabel('Weekly Sales ($)')
ax.legend()
ax.grid(True, alpha=0.3)

fig.tight_layout()
fig.savefig('images/cold_start_prediction.png', dpi=150, bbox_inches='tight')
plt.show()

## 10. Save Global Model

Save the trained global model for production deployment.

In [None]:
# Save the full global model (trained on all entities)
model_path = "../../artifacts/global_lgbm_model.pkl"
# Neither joblib.dump nor open() creates missing parent directories, so make
# sure the artifacts folder exists before writing into it.
os.makedirs(os.path.dirname(model_path), exist_ok=True)
joblib.dump(model, model_path)
print(f"‚úì Global model saved to: {model_path}")

# Save feature list for inference pipeline (one column name per line, in the
# column order used for training)
feature_list_path = "../../artifacts/global_model_features.txt"
os.makedirs(os.path.dirname(feature_list_path), exist_ok=True)
with open(feature_list_path, 'w', encoding='utf-8') as f:
    f.write('\n'.join(feature_cols))
print(f"‚úì Feature list saved to: {feature_list_path}")

## Key Takeaways

### ✅ Advantages of Global Models
1. **Cross-learning**: Model learns from patterns across all entities (e.g., holiday spikes are similar across stores)
2. **Cold-start handling**: Can predict for new entities with zero historical data
3. **Scalability**: One model serves thousands of entities vs. managing thousands of models
4. **Consistency**: Same methodology applied across all forecasts
5. **Maintenance**: Single model to retrain, monitor, and debug

### ⚠️ Considerations
1. **Entity diversity**: Works best when entities share similar patterns (e.g., all grocery stores)
2. **Feature engineering**: Requires careful treatment of entity-specific features
3. **Imbalanced entities**: Dominant entities (high volume stores) can overshadow smaller ones
4. **Model complexity**: Needs more hyperparameter tuning than per-entity models

### 🚀 Production Extensions
- **Hierarchical features**: Add Store Type, Region, Department Category
- **Entity embeddings**: Learn dense representations of Store/Dept IDs
- **Hybrid approach**: Global model for most entities + specialized models for critical SKUs
- **Online learning**: Incrementally update model as new data arrives
- **Multi-horizon forecasting**: Predict 1-week, 4-week, 13-week ahead simultaneously