# TSA_ch8_random_forest

## Random Forest for Time Series Forecasting

**Data**: Germany Daily Electricity Consumption

**Source**: ENTSO-E / Open Power System Data

**Author**: Daniel Traian PELE

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import TimeSeriesSplit
import warnings
# Silence every warning for the rest of the session to keep cell output clean.
# NOTE(review): this also hides deprecation and numerical warnings; narrow the
# filter when debugging.
warnings.filterwarnings('ignore')

# Default plot style and figure size (individual figures may pass their own figsize)
plt.style.use('seaborn-v0_8-whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)

## 1. Load Germany Electricity Data

In [None]:
# Hourly single-index time series file published by Open Power System Data
url = "https://data.open-power-system-data.org/time_series/2020-10-06/time_series_60min_singleindex.csv"

print("Downloading Germany electricity data (this may take a moment)...")
df_raw = pd.read_csv(url, parse_dates=['utc_timestamp'], low_memory=False)

# Keep the timestamp and Germany's actual load, give them short names,
# and discard rows where either value is missing
column_names = {
    'utc_timestamp': 'datetime',
    'DE_load_actual_entsoe_transparency': 'load_mw',
}
df = df_raw[list(column_names)].rename(columns=column_names).dropna()

# Sum the hourly readings that share a calendar date of the timestamp column
df['date'] = df['datetime'].dt.date
daily = df.groupby('date', as_index=False)['load_mw'].sum()
daily['date'] = pd.to_datetime(daily['date'])
daily['consumption_gwh'] = daily['load_mw'] / 1000  # MWh -> GWh

# Restrict to 2015-01-01 .. 2019-12-31 (both ends inclusive) and renumber rows
in_study_period = daily['date'].between('2015-01-01', '2019-12-31')
daily = daily.loc[in_study_period].reset_index(drop=True)

first_day = daily['date'].min().date()
last_day = daily['date'].max().date()
print(f"\nData loaded: {first_day} to {last_day}")
print(f"Observations: {len(daily)} days")
print(f"Mean consumption: {daily['consumption_gwh'].mean():.1f} GWh/day")

In [None]:
# Two stacked panels: the full daily series on top, the weekday profile below
fig, axes = plt.subplots(2, 1, figsize=(14, 8))
series_ax, weekday_ax = axes

# Top panel: daily consumption over the whole sample
series_ax.plot(daily['date'], daily['consumption_gwh'], linewidth=0.5, color='steelblue')
series_ax.set_title('Germany Daily Electricity Consumption (2015-2019)', fontsize=12, fontweight='bold')
series_ax.set_ylabel('Consumption (GWh)')
series_ax.grid(True, alpha=0.3)

# Bottom panel: mean consumption per weekday (pandas dayofweek: 0 = Monday)
weekly_avg = daily.groupby(daily['date'].dt.dayofweek)['consumption_gwh'].mean()
day_positions = range(7)
day_labels = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
weekday_ax.bar(day_positions, weekly_avg.values, color='steelblue')
weekday_ax.set_xticks(day_positions)
weekday_ax.set_xticklabels(day_labels)
weekday_ax.set_title('Average Consumption by Day of Week', fontsize=12, fontweight='bold')
weekday_ax.set_ylabel('Consumption (GWh)')
weekday_ax.grid(True, alpha=0.3, axis='y')

plt.tight_layout()
plt.show()

## 2. Feature Engineering

In [None]:
def create_features(df, *, lags=(1, 2, 3, 7, 14, 21, 28), windows=(7, 14, 30)):
    """Build the supervised-learning table used for forecasting.

    The input is copied, so the caller's frame is never modified. Rows are
    assumed to be in chronological order with one row per period, because
    every lag and rolling feature is computed by row position, not by date.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a datetime-typed ``date`` column and a numeric
        ``consumption_gwh`` column.
    lags : iterable of int, keyword-only
        Row offsets for the ``lag_<k>`` columns. Each must be >= 1 so that
        only past values are used.
    windows : iterable of int, keyword-only
        Window lengths for the ``roll_mean_<w>`` / ``roll_std_<w>`` columns.
        Each must be >= 1.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the target ``y``, lag, rolling, calendar and
        cyclical columns added, and every row containing a NaN removed
        (which drops the warm-up rows at the start of the series).

    Raises
    ------
    ValueError
        If any lag or window is smaller than 1.
    """
    lags = tuple(lags)
    windows = tuple(windows)
    # A lag of 0 or less would copy the current/future target into the features.
    if any(lag < 1 for lag in lags):
        raise ValueError("all lags must be >= 1 to avoid leaking the target")
    if any(window < 1 for window in windows):
        raise ValueError("all rolling windows must be >= 1")

    df = df.copy()

    # Target
    df['y'] = df['consumption_gwh']

    # Lag features (past values only)
    for lag in lags:
        df[f'lag_{lag}'] = df['y'].shift(lag)

    # Rolling statistics over the series shifted by one row, so the window
    # ending at row t never includes y[t] itself (no leakage)
    previous = df['y'].shift(1)
    for window in windows:
        rolling = previous.rolling(window)
        df[f'roll_mean_{window}'] = rolling.mean()
        df[f'roll_std_{window}'] = rolling.std()

    # Calendar features
    dates = df['date'].dt
    df['day_of_week'] = dates.dayofweek
    df['month'] = dates.month
    df['day_of_year'] = dates.dayofyear
    df['week_of_year'] = dates.isocalendar().week.astype(int)
    df['is_weekend'] = (df['day_of_week'] >= 5).astype(int)

    # Cyclical encoding: map weekday and month onto the unit circle so the
    # last and first values of each cycle end up adjacent
    dow_angle = 2 * np.pi * df['day_of_week'] / 7
    month_angle = 2 * np.pi * df['month'] / 12
    df['dow_sin'] = np.sin(dow_angle)
    df['dow_cos'] = np.cos(dow_angle)
    df['month_sin'] = np.sin(month_angle)
    df['month_cos'] = np.cos(month_angle)

    return df.dropna()

# Build the modelling table from the daily series
df_features = create_features(daily)
print(f"Features created: {len(df_features)} samples")
print("\nFeature columns:")

# Everything except the date, the raw inputs and the target is a predictor
non_feature_cols = {'date', 'load_mw', 'consumption_gwh', 'y'}
feature_cols = [col for col in df_features.columns if col not in non_feature_cols]
print(feature_cols)

## 3. Train/Validation/Test Split (Temporal)

In [None]:
# Chronological partition by row position: first 70% train, next 15% validation,
# final 15% test (no shuffling)
n = len(df_features)
train_end = int(n * 0.70)
val_end = int(n * 0.85)

train, val, test = (
    df_features.iloc[:train_end],
    df_features.iloc[train_end:val_end],
    df_features.iloc[val_end:],
)

# Predictor matrices and target vectors for each partition
X_train, X_val, X_test = (part[feature_cols] for part in (train, val, test))
y_train, y_val, y_test = (part['y'] for part in (train, val, test))

# Report size and date span of each partition (labels padded to a common width)
for label, part in (("Training:", train), ("Validation:", val), ("Test:", test)):
    first_day = part['date'].min().date()
    last_day = part['date'].max().date()
    print(f"{label:<12}{len(part)} days ({first_day} to {last_day})")

## 4. Train Random Forest

In [None]:
# Forest hyperparameters; random_state is fixed so repeated runs use the same seed
rf_params = {
    'n_estimators': 200,
    'max_depth': 15,
    'min_samples_leaf': 5,
    'random_state': 42,
    'n_jobs': -1,
}
rf = RandomForestRegressor(**rf_params)
rf.fit(X_train, y_train)

# Predictions for the training, validation and test partitions
y_pred_train, y_pred_val, y_pred_test = (
    rf.predict(features) for features in (X_train, X_val, X_test)
)

# Metrics
def mape(y_true, y_pred):
    """Return the mean absolute percentage error, expressed in percent.

    Both inputs are converted to float NumPy arrays before the arithmetic, so
    values are compared by position. This matters for pandas inputs: two
    Series with different index labels would otherwise be aligned by label
    and yield NaN. Plain lists and tuples are accepted as well.

    Parameters
    ----------
    y_true : array-like
        Observed values. The result is not finite if any entry is zero,
        because each error is divided by the corresponding observed value.
    y_pred : array-like
        Predicted values, broadcastable against ``y_true``.

    Returns
    -------
    float
        ``100 * mean(|y_true - y_pred| / |y_true|)``.
    """
    actual = np.asarray(y_true, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)
    return 100 * np.mean(np.abs((actual - predicted) / actual))

banner = "=" * 60
print(banner)
print("RANDOM FOREST PERFORMANCE")
print(banner)

# Same three metrics for every partition, printed in train -> validation -> test order
evaluation_sets = (
    ("Training", y_train, y_pred_train),
    ("Validation", y_val, y_pred_val),
    ("Test", y_test, y_pred_test),
)
for set_name, actual, predicted in evaluation_sets:
    print(f"\n{set_name} Set:")
    print(f"  RMSE: {np.sqrt(mean_squared_error(actual, predicted)):.2f} GWh")
    print(f"  MAE:  {mean_absolute_error(actual, predicted):.2f} GWh")
    print(f"  MAPE: {mape(actual, predicted):.2f}%")

## 5. Feature Importance

In [None]:
# Pair each predictor name with the importance reported by the fitted forest,
# then rank from most to least important
importance = pd.DataFrame({'feature': feature_cols, 'importance': rf.feature_importances_})
importance = importance.sort_values('importance', ascending=False)

print("\nTop 10 Most Important Features:")
top10_text = importance.head(10).to_string(index=False)
print(top10_text)

In [None]:
# 2x2 diagnostic figure for the test partition
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
(fit_ax, importance_ax), (residual_ax, scatter_ax) = axes
test_dates = test['date']

# Top-left: actual and predicted series over time
fit_ax.plot(test_dates, y_test, 'b-', linewidth=1, label='Actual')
fit_ax.plot(test_dates, y_pred_test, 'r-', linewidth=1, alpha=0.7, label='Predicted')
fit_ax.set_title('Random Forest: Predictions vs Actual (Test Set)', fontweight='bold')
fit_ax.set_xlabel('Date')
fit_ax.set_ylabel('Consumption (GWh)')
fit_ax.legend()
fit_ax.grid(True, alpha=0.3)

# Top-right: horizontal bars for the 15 highest-ranked features; the colour
# ramp is reversed so the top-ranked bar is darkest, and the y axis is
# inverted so it appears first
top15 = importance.head(15)
bar_positions = range(len(top15))
colors = plt.cm.Blues(np.linspace(0.4, 0.9, len(top15)))
importance_ax.barh(bar_positions, top15['importance'].values, color=colors[::-1])
importance_ax.set_yticks(bar_positions)
importance_ax.set_yticklabels(top15['feature'].values)
importance_ax.invert_yaxis()
importance_ax.set_xlabel('Importance')
importance_ax.set_title('Feature Importance (Top 15)', fontweight='bold')
importance_ax.grid(True, alpha=0.3, axis='x')

# Bottom-left: residuals (actual minus predicted) with a zero reference line
residuals = y_test.values - y_pred_test
residual_ax.plot(test_dates, residuals, 'g-', linewidth=0.8)
residual_ax.axhline(y=0, color='red', linestyle='--', linewidth=1)
residual_ax.set_title('Prediction Residuals (Test Set)', fontweight='bold')
residual_ax.set_xlabel('Date')
residual_ax.set_ylabel('Residual (GWh)')
residual_ax.grid(True, alpha=0.3)

# Bottom-right: predicted against actual, with the y = x line spanning the
# range of the actual values
scatter_ax.scatter(y_test, y_pred_test, alpha=0.5, s=20)
actual_range = [y_test.min(), y_test.max()]
scatter_ax.plot(actual_range, actual_range, 'r--', linewidth=2)
scatter_ax.set_xlabel('Actual (GWh)')
scatter_ax.set_ylabel('Predicted (GWh)')
scatter_ax.set_title('Actual vs Predicted', fontweight='bold')
scatter_ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('ch8_random_forest_results.pdf', bbox_inches='tight', dpi=150)
plt.show()

## 6. Time Series Cross-Validation

In [None]:
# Cross-validate on the full feature table
X, y = df_features[feature_cols], df_features['y']

tscv = TimeSeriesSplit(n_splits=5)
cv_scores = []

print("Time Series Cross-Validation (5 folds):")
for fold, (train_idx, val_idx) in enumerate(tscv.split(X)):
    # NOTE(review): this forest uses n_estimators=100 and no min_samples_leaf,
    # unlike the main model (200 trees, min_samples_leaf=5); confirm that the
    # difference is intentional.
    rf_cv = RandomForestRegressor(n_estimators=100, max_depth=15, random_state=42, n_jobs=-1)
    rf_cv.fit(X.iloc[train_idx], y.iloc[train_idx])

    # Score the held-out rows of this fold
    fold_mape = mape(y.iloc[val_idx], rf_cv.predict(X.iloc[val_idx]))
    cv_scores.append(fold_mape)
    print(f"  Fold {fold + 1}: MAPE = {fold_mape:.2f}%")

mean_mape, std_mape = np.mean(cv_scores), np.std(cv_scores)
print(f"\nMean CV MAPE: {mean_mape:.2f}% (+/- {std_mape:.2f}%)")