# ⏰ Time Series Forecasting

Complete time series analysis and forecasting workflow.

**Level**: Intermediate  
**Time Required**: ~45 minutes

In [None]:
import sys
sys.path.insert(0, '../../')

from data_science_master_system import DataLoader, Plotter
from data_science_master_system.features.engineering.feature_factory import FeatureFactory
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
import warnings
# Silence library warnings so the notebook output stays readable
warnings.filterwarnings('ignore')

# The check-mark emoji was mis-decoded in the original literal; restored here
print("✅ Ready!")

In [None]:
# Read the sales CSV and parse the 'date' column into datetimes
loader = DataLoader()
df = loader.read('../data/csv/sales_timeseries.csv')
df['date'] = pd.to_datetime(df['date'])

# Report the table dimensions and the time span covered by the data
first_date, last_date = df['date'].min(), df['date'].max()
print(f"Dataset: {df.shape}")
print(f"Date range: {first_date} to {last_date}")

# Last expression of the cell: preview of the first rows
df.head()

## 1. Time Series Visualization

In [None]:
# Keep only the Electronics rows and put them in chronological order
is_electronics = df['product_category'] == 'Electronics'
electronics = (
    df[is_electronics]
    .copy()
    .sort_values('date')
    .reset_index(drop=True)
)

# 7-row moving average of sales; undefined until a full window is available
electronics['sales_7d_avg'] = electronics['sales'].rolling(7).mean()

# Two stacked panels: raw daily series on top, smoothed overlay below
fig, axes = plt.subplots(2, 1, figsize=(14, 8))
daily_ax, smoothed_ax = axes

daily_ax.plot(electronics['date'], electronics['sales'], color='steelblue', alpha=0.7)
daily_ax.set_title('Daily Electronics Sales')
daily_ax.set_xlabel('Date')
daily_ax.set_ylabel('Sales ($)')

smoothed_ax.plot(electronics['date'], electronics['sales'], alpha=0.3, label='Daily')
smoothed_ax.plot(electronics['date'], electronics['sales_7d_avg'], color='red', label='7-day MA')
smoothed_ax.set_title('Sales with 7-day Moving Average')
smoothed_ax.legend()

plt.tight_layout()
plt.show()

## 2. Seasonality Analysis

In [None]:
# Average sales per weekday, arranged Monday through Sunday
# NOTE(review): assumes 'day_of_week' holds full English day names -- confirm
day_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
dow_sales = electronics.groupby('day_of_week')['sales'].mean().reindex(day_order)

# Average sales per calendar month; the 'month' column stays on the frame
electronics['month'] = electronics['date'].dt.month
monthly_sales = electronics.groupby('month')['sales'].mean()

# Side-by-side bar charts: weekday effect on the left, month effect on the right
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
weekday_ax, month_ax = axes

dow_sales.plot(kind='bar', ax=weekday_ax, color='coral')
weekday_ax.set_title('Average Sales by Day of Week')
weekday_ax.set_xlabel('')
weekday_ax.tick_params(axis='x', rotation=45)

monthly_sales.plot(kind='bar', ax=month_ax, color='steelblue')
month_ax.set_title('Average Sales by Month')

plt.tight_layout()
plt.show()

## 3. Feature Engineering for Time Series

In [None]:
# Create time-based features
factory = FeatureFactory()

# Calendar features derived from the date column, appended column-wise
datetime_features = factory.generate_datetime_features(electronics['date'], prefix='date')
electronics = pd.concat([electronics, datetime_features], axis=1)

# Lag features: the sales value observed `lag` rows earlier
for lag in [1, 7, 14, 30]:
    electronics[f'sales_lag_{lag}'] = electronics['sales'].shift(lag)

# Rolling features are computed on the series shifted by one row, so every
# window covers only *previous* rows. Rolling over the unshifted series would
# include the current row's sales -- the very value being forecast -- and leak
# the target into the features, inflating the measured accuracy.
past_sales = electronics['sales'].shift(1)
for window in [7, 14, 30]:
    past_window = past_sales.rolling(window)
    electronics[f'sales_rolling_mean_{window}'] = past_window.mean()
    electronics[f'sales_rolling_std_{window}'] = past_window.std()

# Drop rows left incomplete by lagging/rolling (any NaN in any column)
electronics = electronics.dropna()

print(f"Features created: {electronics.shape[1]}")
print(electronics.columns.tolist()[-15:])

## 4. Train Forecasting Model

In [None]:
# Every column except these becomes a model input; 'sales' is the target
excluded = {'date', 'product_category', 'day_of_week', 'sales', 'sales_7d_avg'}
feature_cols = [col for col in electronics.columns if col not in excluded]

X = electronics[feature_cols]
y = electronics['sales']

# Chronological 80/20 split by row position: the earliest rows train the
# model and the most recent rows are held out (no shuffling)
train_size = int(len(X) * 0.8)
X_train, y_train = X.iloc[:train_size], y.iloc[:train_size]
X_test, y_test = X.iloc[train_size:], y.iloc[train_size:]

print(f"Training: {X_train.shape}, Test: {X_test.shape}")

In [None]:
# Train a random forest on the chronologically earlier portion of the data
model = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
model.fit(X_train, y_train)

# Predict the held-out (most recent) rows
y_pred = model.predict(X_test)

# Evaluate
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

# MAPE divides by the actual value, so rows whose actual sales are zero are
# left out; otherwise a single zero would turn the metric into inf/NaN.
actual = y_test.to_numpy()
nonzero = actual != 0
mape = np.mean(np.abs((actual[nonzero] - y_pred[nonzero]) / actual[nonzero])) * 100

# The chart emoji was mis-decoded in the original literal; restored here
print(f"\n📊 Forecast Performance:")
print(f"  MAE: ${mae:.2f}")
print(f"  RMSE: ${rmse:.2f}")
print(f"  MAPE: {mape:.1f}%")

In [None]:
# Dates of the held-out rows, re-indexed from zero
test_dates = electronics['date'].iloc[train_size:].reset_index(drop=True)

# Shaded band extending one RMSE on either side of the prediction
band_low = y_pred - rmse
band_high = y_pred + rmse

plt.figure(figsize=(14, 6))
plt.plot(test_dates, y_test.values, color='blue', label='Actual')
plt.plot(test_dates, y_pred, color='red', alpha=0.7, label='Predicted')
plt.fill_between(test_dates, band_low, band_high, color='red', alpha=0.2)
plt.title('Sales Forecast vs Actual')
plt.xlabel('Date')
plt.ylabel('Sales ($)')
plt.legend()
plt.tight_layout()
plt.show()

In [None]:
# Pair each feature name with the fitted model's importance score,
# then keep the 15 highest-scoring features
importance = pd.DataFrame(
    {'feature': feature_cols, 'importance': model.feature_importances_}
)
importance = importance.nlargest(15, 'importance')

# Horizontal bar chart of the selected features
plt.figure(figsize=(10, 6))
plt.barh(importance['feature'], importance['importance'], color='steelblue')
plt.gca().invert_yaxis()  # flip the axis so the first (largest) bar is on top
plt.xlabel('Importance')
plt.title('Top 15 Features for Sales Forecasting')
plt.tight_layout()
plt.show()

## 🎯 Key Takeaways

1. Time-based train/test split (not random)
2. Lag features capture autocorrelation
3. Rolling statistics smooth noise
4. Datetime components capture seasonality
5. Recent lags are typically the most important features for short-term forecasting