In [None]:
!pip install ucimlrepo

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from ucimlrepo import fetch_ucirepo
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.multioutput import MultiOutputRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Fetch dataset id 235 through ucimlrepo and join its feature and target tables column-wise
ds = fetch_ucirepo(id=235)
X_raw, y_raw = ds.data.features, ds.data.targets
df = pd.concat([X_raw, y_raw], axis=1)

# Preprocessing: keep only the 'Global_active_power' column, turn entries that cannot be
# parsed as numbers into NaN, then replace every NaN with the mean of the valid values.
power = pd.to_numeric(df['Global_active_power'], errors='coerce')
df = power.fillna(power.mean()).to_frame()

# Sliding Window
def create_sliding_window(df, window_size=30, forecast_horizon=120):
    """Turn the 'Global_active_power' series into supervised (X, y) windows.

    Sample k uses ``data[k : k + window_size]`` as features and the
    ``forecast_horizon`` values that immediately follow it,
    ``data[k + window_size : k + window_size + forecast_horizon]``, as targets.
    There is no gap between the last feature and the first target, so target
    column 0 is exactly one step after the end of the input window.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a 'Global_active_power' column.
    window_size : int, default 30
        Number of consecutive past values used as features.
    forecast_horizon : int, default 120
        Number of consecutive future values used as targets.

    Returns
    -------
    X : numpy.ndarray of shape (n_samples, window_size)
    y : numpy.ndarray of shape (n_samples, forecast_horizon)
        ``n_samples = len(df) - window_size - forecast_horizon + 1``; when the
        series is too short both arrays have zero rows but keep their 2-D shape.

    Raises
    ------
    ValueError
        If ``window_size`` or ``forecast_horizon`` is smaller than 1.
    """
    if window_size < 1 or forecast_horizon < 1:
        raise ValueError("window_size and forecast_horizon must both be >= 1")

    data = df['Global_active_power'].to_numpy()
    n_samples = len(data) - window_size - forecast_horizon + 1
    if n_samples <= 0:
        # Not enough data for a single window: return correctly shaped empty arrays
        # so downstream code still sees 2-D inputs.
        return (
            np.empty((0, window_size), dtype=data.dtype),
            np.empty((0, forecast_horizon), dtype=data.dtype),
        )

    # One strided view over [features | targets], built without a Python-level loop.
    windows = np.lib.stride_tricks.sliding_window_view(data, window_size + forecast_horizon)
    # Copy so callers receive independent, writable arrays rather than read-only views.
    X = windows[:, :window_size].copy()
    y = windows[:, window_size:].copy()
    return X, y

# Build the supervised dataset: 30 past values as features, 120 future values as targets
window_size, forecast_horizon = 30, 120
X, y = create_sliding_window(df, window_size=window_size, forecast_horizon=forecast_horizon)

# Split & Scale
# Chronological split (no shuffling): the last 20% of the windows form the test set
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=False, test_size=0.2)

# Keep only the first 5000 training windows, as per paper/original notebook, to manage compute time
X_train, y_train = X_train[:5000], y_train[:5000]

# Standardization is crucial for SVM; the scaler statistics come from the training windows only
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("Data prepared. Train shape:", X_train_scaled.shape, "Test shape:", X_test_scaled.shape)

In [None]:
# Initialize Models

# 1. SVM (SVR)
# SVR does not support multi-output natively, so it is wrapped in MultiOutputRegressor,
# which fits one independent SVR per target column (one per future time step).
# RBF kernel is generally best for non-linear time series.
print("Initializing SVM (MultiOutput SVR)...")
svm_model = MultiOutputRegressor(SVR(kernel='rbf', C=100, gamma=0.1, epsilon=.1), n_jobs=-1)

# 2. Random Forest
# RandomForestRegressor supports multi-output regression natively, but it is wrapped in
# MultiOutputRegressor as well so that both models follow the same
# "one model per forecast step" setup (one independent forest per target column).
# Parallelism is applied at the wrapper level only: the inner forest uses n_jobs=1,
# because requesting all cores on both levels oversubscribes the CPU. The fitted trees
# do not depend on n_jobs since random_state is fixed.
print("Initializing Random Forest (MultiOutput)... ")
rf_model = MultiOutputRegressor(RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=1), n_jobs=-1)

print("Training SVM (this may take a few minutes for 5000 samples x 120 outputs)...")
svm_model.fit(X_train_scaled, y_train)
print("SVM Trained.")

print("Training Random Forest...")
rf_model.fit(X_train_scaled, y_train)
print("Random Forest Trained.")

In [None]:
# Predictions on the held-out test windows
print("Predicting SVM...")
y_pred_svm = svm_model.predict(X_test_scaled)
print("Predicting Random Forest...")
y_pred_rf = rf_model.predict(X_test_scaled)


def _global_metrics(y_true, y_pred):
    """Return (MAE, MSE, R2) using scikit-learn's default multi-output averaging."""
    return (
        mean_absolute_error(y_true, y_pred),
        mean_squared_error(y_true, y_pred),
        r2_score(y_true, y_pred),
    )


def _report(model_name, mae, mse, r2):
    """Print one summary line with the three global metrics of a model."""
    print(f"{model_name} - MAE: {mae:.4f}, MSE: {mse:.4f}, R2: {r2:.4f}")


# Evaluation Metrics (Global)
mae_svm, mse_svm, r2_svm = _global_metrics(y_test, y_pred_svm)
mae_rf, mse_rf, r2_rf = _global_metrics(y_test, y_pred_rf)

_report("SVM", mae_svm, mse_svm, r2_svm)
_report("Random Forest", mae_rf, mse_rf, r2_rf)

In [None]:
# Visualization: per-forecast-step error statistics of both models on a 2x2 grid.
# Every panel shares the same x axis (steps numbered 1..forecast_horizon) so the
# curves in different panels line up on the same step.
steps = np.arange(1, forecast_horizon + 1)

# Residuals are computed once and reused by the variance, MAE and MSE panels.
residuals = {
    'SVM': y_test - y_pred_svm,
    'Random Forest': y_test - y_pred_rf,
}
predictions = {
    'SVM': y_pred_svm,
    'Random Forest': y_pred_rf,
}


def _plot_per_step(ax, values_by_model, title, ylabel, marker=None):
    """Draw one curve per model against the forecast step on the given axes."""
    for model_name, values in values_by_model.items():
        ax.plot(steps, values, label=model_name, marker=marker)
    ax.set_title(title)
    ax.set_xlabel('Forecast Span [minutes]')
    ax.set_ylabel(ylabel)
    ax.legend()
    ax.grid(True, alpha=0.3)


fig, axes = plt.subplots(2, 2, figsize=(18, 12))

# 1. Error Variance per forecast step
_plot_per_step(
    axes[0, 0],
    {name: np.var(res, axis=0) for name, res in residuals.items()},
    'Error Variance per Forecast Step (Lower is better)',
    'Variance',
    marker='.',
)

# 2. MAE per step
_plot_per_step(
    axes[0, 1],
    {name: np.mean(np.abs(res), axis=0) for name, res in residuals.items()},
    'MAE per Forecast Step (Lower is better)',
    'MAE',
)

# 3. MSE per step
_plot_per_step(
    axes[1, 0],
    {name: np.mean(res ** 2, axis=0) for name, res in residuals.items()},
    'MSE per Forecast Step (Lower is better)',
    'MSE',
)

# 4. R2 per step: multioutput='raw_values' returns one score per target column
_plot_per_step(
    axes[1, 1],
    {name: r2_score(y_test, pred, multioutput='raw_values') for name, pred in predictions.items()},
    'R2 per Forecast Step (Higher is better)',
    'R2 Score',
)

fig.tight_layout()
plt.show()