# Demand Forecasting - Quick Exploration

This notebook generates a sample demand dataset, explores it, and then trains and compares two forecasting models (Prophet and XGBoost) on a 90-day hold-out period.

In [None]:
import sys
sys.path.append('../src')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from generate_data import generate_sample_data
from data_prep import load_data, clean_data, train_test_split
from features import create_all_features

# Set style
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")
%matplotlib inline

## 1. Generate and Load Data

In [None]:
# One location for the sample CSV, shared by the generator and the loader
SAMPLE_DATA_PATH = '../data/sample_data.csv'

# Generate sample data (days=730 requested; presumably written to output_path)
generate_sample_data(days=730, output_path=SAMPLE_DATA_PATH)

# Load the CSV and clean it in a single expression
df = clean_data(load_data(SAMPLE_DATA_PATH))

print(f"Dataset shape: {df.shape}")
# Last expression of the cell: shown as the cell's rich output
df.head()

## 2. Data Exploration

In [None]:
# Summary statistics of the demand column
print("\nBasic Statistics:")
demand_summary = df['demand'].describe()
demand_summary

In [None]:
# Demand time series: a single line of df['demand'] against df['date']
fig, ax = plt.subplots(figsize=(14, 6))
ax.plot(df['date'], df['demand'], linewidth=1.5)

# Cosmetics: faint grid, axis labels, then the title
ax.grid(True, alpha=0.3)
ax.set_xlabel('Date', fontsize=12)
ax.set_ylabel('Demand', fontsize=12)
ax.set_title('Demand Over Time', fontsize=16)

fig.tight_layout()
plt.show()

In [None]:
# Distribution of demand: histogram on the left, box plot on the right
demand_values = df['demand']
fig, (hist_ax, box_ax) = plt.subplots(1, 2, figsize=(14, 5))

# Left panel: 50-bin histogram
hist_ax.hist(demand_values, bins=50, edgecolor='black', alpha=0.7)
hist_ax.set_title('Demand Distribution', fontsize=14)
hist_ax.set_xlabel('Demand', fontsize=12)
hist_ax.set_ylabel('Frequency', fontsize=12)

# Right panel: vertical box plot of the same values
box_ax.boxplot(demand_values, vert=True)
box_ax.set_title('Demand Box Plot', fontsize=14)
box_ax.set_ylabel('Demand', fontsize=12)

fig.tight_layout()
plt.show()

## 3. Feature Engineering

In [None]:
# Create features
# Record the input columns before feature engineering so the summary below is
# computed from the data instead of being hard-coded.
original_cols = df.columns.tolist()
df_features = create_all_features(df.copy())  # pass a copy so df itself is not handed to the helper

# Columns present after feature engineering that were not in the input frame
engineered_cols = [col for col in df_features.columns if col not in original_cols]

print(f"\nOriginal features: {len(original_cols)} ({', '.join(map(str, original_cols))})")
print(f"Engineered features: {len(engineered_cols)}")
print("\nAll features:")
print(df_features.columns.tolist())

In [None]:
# Correlation heatmap
# Cap the number of columns so the annotated heatmap stays readable.
MAX_HEATMAP_FEATURES = 15

# Select numeric columns only
numeric_features = df_features.select_dtypes(include=[np.number])

if 'demand' in numeric_features.columns:
    # Rank columns by absolute correlation with the target so that "top"
    # in the title is accurate; demand itself ranks first (correlation 1).
    target_strength = (
        numeric_features.corrwith(numeric_features['demand'])
        .abs()
        .sort_values(ascending=False)  # NaN correlations are placed last
    )
    numeric_cols = target_strength.index[:MAX_HEATMAP_FEATURES]
    selection_label = f'Top {len(numeric_cols)} Features by |Correlation| with Demand'
else:
    # No numeric target column to rank against: fall back to column order
    # and label the plot accordingly.
    numeric_cols = numeric_features.columns[:MAX_HEATMAP_FEATURES]
    selection_label = f'First {len(numeric_cols)} Numeric Features'

correlation_matrix = numeric_features[numeric_cols].corr()

plt.figure(figsize=(12, 10))
sns.heatmap(correlation_matrix, annot=True, fmt='.2f', cmap='coolwarm', center=0)
plt.title(f'Feature Correlation Heatmap ({selection_label})', fontsize=14)
plt.tight_layout()
plt.show()

## 4. Train Models

In [None]:
from train_prophet import train_prophet_model, predict_prophet
from train_xgboost import prepare_xgboost_data, train_xgboost_model, predict_xgboost
from evaluate import calculate_metrics, print_metrics

# Hold out data for evaluation (test_days=90 is passed to the project's splitter)
train_df, test_df = train_test_split(df, test_days=90)

# Report the size of each partition
for split_name, split_frame in (("Training set", train_df), ("Test set", test_df)):
    print(f"{split_name}: {len(split_frame)} samples")

### Prophet Model

In [None]:
# Fit Prophet on train_df, then produce predictions for test_df
prophet_model = train_prophet_model(train_df)
prophet_results = predict_prophet(prophet_model, test_df)

# Evaluate: pull the actual/predicted columns out as arrays and score them
prophet_actual = prophet_results['actual'].values
prophet_predicted = prophet_results['predicted'].values
prophet_metrics = calculate_metrics(prophet_actual, prophet_predicted)
print_metrics(prophet_metrics, 'Prophet')

In [None]:
# Prophet: actual vs predicted demand, with the lower/upper bound band
fig, ax = plt.subplots(figsize=(14, 6))

# Both series share every line property except column, legend label and marker
for column, series_label, series_marker in (
    ('actual', 'Actual', 'o'),
    ('predicted', 'Predicted', 'x'),
):
    ax.plot(prophet_results['date'], prophet_results[column],
            label=series_label, marker=series_marker, linewidth=2, markersize=4)

# Shaded region between the lower_bound and upper_bound columns
ax.fill_between(
    prophet_results['date'],
    prophet_results['lower_bound'],
    prophet_results['upper_bound'],
    alpha=0.2,
    label='Confidence Interval',
)

ax.set_title('Prophet: Actual vs Predicted Demand', fontsize=16)
ax.set_xlabel('Date', fontsize=12)
ax.set_ylabel('Demand', fontsize=12)
ax.legend(fontsize=10)
ax.grid(True, alpha=0.3)
fig.tight_layout()
plt.show()

### XGBoost Model

In [None]:
# Build XGBoost inputs for each partition
# NOTE(review): features are prepared independently for train_df and test_df.
# If prepare_xgboost_data derives lag/rolling features, the first test rows
# may lack history from the training period — confirm in train_xgboost.py.
train_inputs = prepare_xgboost_data(train_df)
test_inputs = prepare_xgboost_data(test_df)
X_train, y_train, _ = train_inputs  # third element is not needed for training
X_test, y_test, dates_test = test_inputs

# Fit on the training inputs and predict for the test inputs
xgb_model = train_xgboost_model(X_train, y_train)
xgb_predictions = predict_xgboost(xgb_model, X_test)

# Evaluate
xgb_metrics = calculate_metrics(y_test.values, xgb_predictions)
print_metrics(xgb_metrics, 'XGBoost')

In [None]:
# XGBoost: actual vs predicted demand over the test dates
fig, ax = plt.subplots(figsize=(14, 6))

# Both series share every line property except values, legend label and marker
for series_values, series_label, series_marker in (
    (y_test.values, 'Actual', 'o'),
    (xgb_predictions, 'Predicted', 'x'),
):
    ax.plot(dates_test, series_values,
            label=series_label, marker=series_marker, linewidth=2, markersize=4)

ax.set_title('XGBoost: Actual vs Predicted Demand', fontsize=16)
ax.set_xlabel('Date', fontsize=12)
ax.set_ylabel('Demand', fontsize=12)
ax.legend(fontsize=10)
ax.grid(True, alpha=0.3)
fig.tight_layout()
plt.show()

## 5. Model Comparison

In [None]:
from evaluate import compare_models

# Collect (actual, predicted) pairs per model, keyed by display name.
# Insertion order is Prophet first, then XGBoost.
results_dict = dict(
    Prophet=(prophet_results['actual'].values, prophet_results['predicted'].values),
    XGBoost=(y_test.values, xgb_predictions),
)

comparison_df = compare_models(results_dict)
print("\nModel Comparison:")
# Last expression of the cell: shown as the cell's rich output
comparison_df

In [None]:
# Grouped bar chart: one group per model, one bar per metric
fig, ax = plt.subplots(figsize=(10, 6))
x = np.arange(len(comparison_df))
width = 0.2

# (legend label, bar heights) in left-to-right order within each group.
# R2 is multiplied by 100 before plotting, as the legend label states.
metric_bars = [
    ('MAE', comparison_df['MAE']),
    ('RMSE', comparison_df['RMSE']),
    ('MAPE', comparison_df['MAPE']),
    ('R²×100', comparison_df['R2']*100),
]
for slot, (metric_label, heights) in enumerate(metric_bars):
    # slot - 1.5 yields offsets -1.5, -0.5, +0.5, +1.5 bar-widths around x
    ax.bar(x + width*(slot - 1.5), heights, width, label=metric_label)

ax.set_xlabel('Model', fontsize=12)
ax.set_ylabel('Metric Value', fontsize=12)
ax.set_title('Model Performance Comparison', fontsize=14)
ax.set_xticks(x)
ax.set_xticklabels(comparison_df['Model'])
ax.legend()
ax.grid(True, alpha=0.3, axis='y')
fig.tight_layout()
plt.show()

## 6. Feature Importance (XGBoost)

In [None]:
from train_xgboost import get_feature_importance

# Feature importance for the fitted XGBoost model, labelled with the training column names
importance_df = get_feature_importance(xgb_model, X_train.columns.tolist())

# Plot the first 15 rows
# NOTE(review): assumes importance_df is already sorted by importance, descending — confirm.
top_features = importance_df.head(15)
bar_positions = range(len(top_features))

fig, ax = plt.subplots(figsize=(10, 8))
ax.barh(bar_positions, top_features['importance'])
ax.set_yticks(bar_positions)
ax.set_yticklabels(top_features['feature'])
ax.invert_yaxis()  # first row of top_features is drawn at the top
ax.set_xlabel('Importance', fontsize=12)
ax.set_title('Top 15 Most Important Features (XGBoost)', fontsize=14)
ax.grid(True, alpha=0.3, axis='x')
fig.tight_layout()
plt.show()

print("\nTop 10 Most Important Features:")
# Last expression of the cell: shown as the cell's rich output
importance_df.head(10)

## 7. Residual Analysis

In [None]:
# Residuals are actual minus predicted for each model
prophet_residuals = prophet_results['actual'].values - prophet_results['predicted'].values
xgb_residuals = y_test.values - xgb_predictions

fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# One row of panels per model: (name used in titles, x-axis dates, residuals)
residual_panels = (
    ('Prophet', prophet_results['date'], prophet_residuals),
    ('XGBoost', dates_test, xgb_residuals),
)

for (model_name, panel_dates, residuals), (time_ax, dist_ax) in zip(residual_panels, axes):
    # Left column: residuals over time, with a dashed zero reference line
    time_ax.plot(panel_dates, residuals, marker='o', linestyle='-', alpha=0.7)
    time_ax.axhline(y=0, color='r', linestyle='--', linewidth=2)
    time_ax.set_title(f'{model_name}: Residuals Over Time', fontsize=12)
    time_ax.set_xlabel('Date', fontsize=10)
    time_ax.set_ylabel('Residual', fontsize=10)
    time_ax.grid(True, alpha=0.3)

    # Right column: histogram of the same residuals
    dist_ax.hist(residuals, bins=30, edgecolor='black', alpha=0.7)
    dist_ax.set_title(f'{model_name}: Residuals Distribution', fontsize=12)
    dist_ax.set_xlabel('Residual', fontsize=10)
    dist_ax.set_ylabel('Frequency', fontsize=10)
    dist_ax.grid(True, alpha=0.3)

fig.tight_layout()
plt.show()

## Conclusion

This notebook demonstrated:
1. Generating, loading, and exploring sample demand data
2. Feature engineering for time series
3. Training Prophet and XGBoost models
4. Evaluating and comparing model performance
5. Analyzing feature importance and residuals

Next steps:
- Fine-tune model hyperparameters
- Try additional features or external data
- Experiment with ensemble methods
- Deploy the best model using the Streamlit app