# Energy Consumption Forecasting

Forecast household energy consumption using time series features.

**Dataset:** [https://www.kaggle.com/datasets/uciml/electric-power-consumption-data-set](https://www.kaggle.com/datasets/uciml/electric-power-consumption-data-set)  
**Target:** `Global_active_power`  
**Type:** Time Series Forecasting

> **TODO:** Download the dataset, place it in `../../data/raw/`, then update `DATA_PATH`, `DATE_COL`, and `TARGET` below.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
# Apply seaborn's 'whitegrid' theme once, up front, for the plots drawn below.
sns.set_theme(style='whitegrid')

## 1. Load & Parse Dates

In [None]:
DATA_PATH = "../../data/raw/household_power_consumption.txt"
DATE_COL = "Date"  # TODO: verify date column name
TARGET = "Global_active_power"      # TODO: verify target column name

# The household power file is ';'-delimited, marks missing readings with '?',
# and writes dates day-first (dd/mm/yyyy). With read_csv's defaults the header
# is read as a single column (so DATE_COL is not found) and '?' would leave
# the measurement columns as strings. Adjust these options if your file differs.
df = pd.read_csv(
    DATA_PATH,
    sep=';',
    na_values=['?'],
    parse_dates=[DATE_COL],
    dayfirst=True,
)

# Several rows can share the same date; a stable sort keeps their original
# file order so the row-based lag/rolling features built later stay
# chronological within each day.
df = df.sort_values(DATE_COL, kind='stable').reset_index(drop=True)
print(f'Shape: {df.shape}')
print(f'Date range: {df[DATE_COL].min()} → {df[DATE_COL].max()}')
df.head()

## 2. Time Series EDA

In [None]:
# Overall trend: one line chart of the target across the full date range
fig, ax = plt.subplots(figsize=(14, 4))
ax.plot(df[DATE_COL], df[TARGET], linewidth=0.8)
ax.set_title(f'{TARGET} over time')
ax.set_xlabel(DATE_COL)
ax.set_ylabel(TARGET)
fig.tight_layout()
plt.show()

# Summary statistics of the target column
target_summary = df[TARGET].describe()
print(target_summary)

In [None]:
# Seasonal patterns: derive calendar parts from the date column, then plot
# the mean target per month and per day of week side by side.
date_parts = df[DATE_COL].dt
df['year'] = date_parts.year
df['month'] = date_parts.month
df['dayofweek'] = date_parts.dayofweek

fig, axes = plt.subplots(1, 2, figsize=(14, 4))
panels = [
    ('month', 'Average by Month'),
    ('dayofweek', 'Average by Day of Week'),
]
for panel_ax, (period, title) in zip(axes, panels):
    period_mean = df.groupby(period)[TARGET].mean()
    period_mean.plot(ax=panel_ax)
    panel_ax.set_title(title)
plt.tight_layout()
plt.show()

## 3. Feature Engineering

In [None]:
# Calendar features taken from the date column
date_parts = df[DATE_COL].dt
df['quarter'] = date_parts.quarter
df['weekofyear'] = date_parts.isocalendar().week.astype(int)

# Lag features: the target shifted down by N rows.
# Adjust the window sizes to your data frequency.
target_series = df[TARGET]
for lag in (1, 7, 14, 28):
    df[f'lag_{lag}'] = target_series.shift(lag)

# Rolling statistics over the previous rows only: shifting by one row first
# keeps the current row's target out of its own window.
previous_target = target_series.shift(1)
for window in (7, 14):
    trailing = previous_target.rolling(window)
    df[f'rolling_mean_{window}'] = trailing.mean()
    df[f'rolling_std_{window}'] = trailing.std()

# Rows with any missing value (including the warm-up rows of the lag and
# rolling columns) are removed before modelling.
df = df.dropna()
df = df.reset_index(drop=True)
print(f'Shape after feature engineering: {df.shape}')

## 4. Time-Based Train / Test Split

In [None]:
# Use last 20% of time as test set (never shuffle time series!)
split_idx = int(len(df) * 0.8)
train_df = df.iloc[:split_idx]
test_df = df.iloc[split_idx:]

# Only numeric columns can be fed to the scikit-learn regressors; any text
# column left in the frame (e.g. a raw time-of-day string) would make
# `fit` fail, so it is excluded here and reported instead of crashing later.
drop_cols = [TARGET, DATE_COL]
numeric_cols = df.select_dtypes(include='number').columns
feature_cols = [c for c in numeric_cols if c not in drop_cols]

skipped_cols = [c for c in df.columns
                if c not in feature_cols and c not in drop_cols]
if skipped_cols:
    print(f'Skipping non-numeric columns: {skipped_cols}')

# NOTE(review): every remaining numeric column is used as a feature — confirm
# that each one is actually known at forecast time for your dataset.
X_train, y_train = train_df[feature_cols], train_df[TARGET]
X_test, y_test = test_df[feature_cols], test_df[TARGET]
print(f'Train: {X_train.shape}, Test: {X_test.shape}')

## 5. Model Training

In [None]:
def mape(y_true, y_pred):
    """Return the mean absolute percentage error, in percent.

    Both inputs are converted to float NumPy arrays first, so lists, tuples,
    arrays and pandas Series are all accepted and are compared by position
    (never by pandas index alignment).

    Args:
        y_true: Observed values.
        y_pred: Predicted values, same length as ``y_true``.

    Returns:
        The mean of ``|y_true - y_pred| / |y_true|`` multiplied by 100.
        Zero entries of ``y_true`` are divided by 1e-9 instead of 0, so they
        contribute a very large (but finite) term rather than inf/NaN.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    # Substitute a tiny denominator where the true value is exactly zero.
    denom = np.where(y_true == 0, 1e-9, y_true)
    return np.mean(np.abs((y_true - y_pred) / denom)) * 100

# Candidate regressors, keyed by the label used in printed output and plots
models = {
    'Linear Regression': LinearRegression(),
    'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(n_estimators=100, random_state=42),
}

# Fit each model on the training split, score it on the test split, and keep
# the fitted model together with its test predictions for later cells.
results = {}
for label, estimator in models.items():
    estimator.fit(X_train, y_train)
    y_hat = estimator.predict(X_test)

    mae_score = mean_absolute_error(y_test, y_hat)
    rmse_score = np.sqrt(mean_squared_error(y_test, y_hat))
    mape_score = mape(y_test.values, y_hat)

    results[label] = {'model': estimator, 'preds': y_hat}
    print(f'{label}: MAE={mae_score:.3f}  RMSE={rmse_score:.3f}  MAPE={mape_score:.2f}%')

## 6. Forecast Plot

In [None]:
# Pick the model whose test predictions have the lowest mean squared error
best_name = min(
    results,
    key=lambda k: mean_squared_error(y_test, results[k]['preds'])
)
best_preds = results[best_name]['preds']

# Overlay actual and predicted values across the test period
plt.figure(figsize=(14, 5))
plt.plot(test_df[DATE_COL].values, y_test.values, label='Actual', linewidth=1)
plt.plot(test_df[DATE_COL].values, best_preds, label=f'Predicted ({best_name})',
         linewidth=1, linestyle='--')
plt.title(f'Forecast vs Actual — {best_name}')
# Label the x-axis from DATE_COL (as the trend plot does) so it stays correct
# when the date column is renamed.
plt.xlabel(DATE_COL); plt.ylabel(TARGET)
plt.legend(); plt.tight_layout(); plt.show()

## 7. Conclusion

| Model | MAE | RMSE | MAPE |
|---|---|---|---|
| *(fill after running)* | | | |

**Observations:**
- *(fill after running)*

**Next steps:**
- Add more lag windows
- Try cross-validation with TimeSeriesSplit
- Explore SARIMA / Prophet for pure time-series approaches