In [2]:
import pandas as pd
import matplotlib.pyplot as plt

# Load the dataset
df = pd.read_csv(r'D:\Projects_25\Energy_Ananlysis\energy\Datasets\eia_training_data_2023.csv')
df.head()


Unnamed: 0,datetime,load_mw
0,2023-01-01 00:00:00,-7211
1,2023-01-01 00:00:00,29324
2,2023-01-01 00:00:00,22239
3,2023-01-01 00:00:00,28753
4,2023-01-01 01:00:00,30063


In [3]:
# Check shape and nulls
print(df.shape)
print(df.isnull().sum())

# Check datatypes
print(df.dtypes)

# Optional: check unique values
print(df.nunique())


(34948, 2)
datetime    0
load_mw     0
dtype: int64
datetime    object
load_mw      int64
dtype: object
datetime     8737
load_mw     20491
dtype: int64


In [4]:
# Convert datetime column
df['datetime'] = pd.to_datetime(df['datetime'], errors='coerce')

# Convert load_mw to numeric (if not already), force errors to NaN
df['load_mw'] = pd.to_numeric(df['load_mw'], errors='coerce')

# Drop rows with any nulls in datetime or load_mw
df = df.dropna(subset=['datetime', 'load_mw'])

# Set datetime as index
df = df.sort_values('datetime')
df = df.set_index('datetime')

# Final check
print(df.shape)
print(df.isna().sum())
print(df.dtypes)


(34948, 1)
load_mw    0
dtype: int64
load_mw    int64
dtype: object


In [5]:
# Feature Engineering
df['hour'] = df.index.hour
df['dayofweek'] = df.index.dayofweek
df['month'] = df.index.month
df['is_weekend'] = df['dayofweek'].apply(lambda x: 1 if x >= 5 else 0)

# Lag Features
df['lag_1'] = df['load_mw'].shift(1)
df['lag_2'] = df['load_mw'].shift(2)
df['lag_24'] = df['load_mw'].shift(24)

# Rolling Features
df['rolling_mean_3'] = df['load_mw'].rolling(window=3).mean()
df['rolling_mean_24'] = df['load_mw'].rolling(window=24).mean()

# Drop rows with NaNs (due to lag/rolling windows)
df = df.dropna()

# Final check
print(df.shape)
print(df.head())


(34924, 10)
                     load_mw  hour  dayofweek  month  is_weekend    lag_1  \
datetime                                                                    
2023-01-01 06:00:00    29441     6          6      1           1  30447.0   
2023-01-01 06:00:00    19700     6          6      1           1  29441.0   
2023-01-01 06:00:00    28880     6          6      1           1  19700.0   
2023-01-01 06:00:00    -8371     6          6      1           1  28880.0   
2023-01-01 07:00:00    28658     7          6      1           1  -8371.0   

                       lag_2   lag_24  rolling_mean_3  rolling_mean_24  
datetime                                                                
2023-01-01 06:00:00  29688.0  -7211.0    29858.666667     20435.416667  
2023-01-01 06:00:00  30447.0  29324.0    26529.333333     20034.416667  
2023-01-01 06:00:00  29441.0  22239.0    26007.000000     20311.125000  
2023-01-01 06:00:00  19700.0  28753.0    13403.000000     18764.291667  
2023-01-01

In [6]:
from sklearn.model_selection import train_test_split

# Define features and target
features = ['hour', 'dayofweek', 'month', 'is_weekend',
            'lag_1', 'lag_2', 'lag_24', 'rolling_mean_3', 'rolling_mean_24']
target = 'load_mw'

X = df[features]
y = df[target]

# Split data (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, shuffle=False  # no shuffle to preserve time order
)


In [8]:
from sklearn.ensemble import RandomForestRegressor

# Initialize and train model
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Predictions
y_pred = model.predict(X_test)


In [9]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

mae = mean_absolute_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred) ** 0.5  # Manually compute RMSE
r2 = r2_score(y_test, y_pred)

print(f"📊 MAE: {mae:.2f}")
print(f"📊 RMSE: {rmse:.2f}")
print(f"📊 R² Score: {r2:.4f}")


📊 MAE: 608.48
📊 RMSE: 1140.39
📊 R² Score: 0.9932


In [10]:
import joblib

joblib.dump(model, '../models/rf_model_eia_2023.pkl')


['../models/rf_model_eia_2023.pkl']