In [1]:
import pandas as pd
import matplotlib.pyplot as plt

# Load the dataset
df = pd.read_csv(r'D:\Projects_25\Energy_Ananlysis\energy\Datasets\eia_training_data_2023.csv')
df.head()


Unnamed: 0,datetime,load_mw,hour,dayofweek,month,is_weekend,lag_1,lag_2,lag_24,rolling_mean_3,rolling_mean_24
0,2023-01-06 00:00:00,-4573,0,4,1,0,-4232.0,-4814.0,-6605.0,-4539.666667,4215.958333
1,2023-01-06 01:00:00,30667,1,4,1,0,-4573.0,-4232.0,-7379.0,7287.333333,5801.208333
2,2023-01-06 02:00:00,-6895,2,4,1,0,30667.0,-4573.0,32462.0,6399.666667,4161.333333
3,2023-01-06 03:00:00,-7032,3,4,1,0,-6895.0,30667.0,-7757.0,5580.0,4191.541667
4,2023-01-06 04:00:00,33271,4,4,1,0,-7032.0,-6895.0,33515.0,6448.0,4181.375


In [2]:
# Check shape and nulls
print(df.shape)
print(df.isnull().sum())

# Check datatypes
print(df.dtypes)

# Optional: check unique values
print(df.nunique())


(8810, 11)
datetime           0
load_mw            0
hour               0
dayofweek          0
month              0
is_weekend         0
lag_1              0
lag_2              0
lag_24             0
rolling_mean_3     0
rolling_mean_24    0
dtype: int64
datetime            object
load_mw              int64
hour                 int64
dayofweek            int64
month                int64
is_weekend           int64
lag_1              float64
lag_2              float64
lag_24             float64
rolling_mean_3     float64
rolling_mean_24    float64
dtype: object
datetime           8810
load_mw            7203
hour                 24
dayofweek             7
month                12
is_weekend            2
lag_1              7203
lag_2              7203
lag_24             7197
rolling_mean_3     8136
rolling_mean_24    8607
dtype: int64


In [3]:
# Convert datetime column
df['datetime'] = pd.to_datetime(df['datetime'], errors='coerce')

# Convert load_mw to numeric, force errors to NaN
df['load_mw'] = pd.to_numeric(df['load_mw'], errors='coerce')

# Drop rows with any nulls in datetime or load_mw
df = df.dropna(subset=['datetime', 'load_mw'])

# Set datetime as index
df = df.sort_values('datetime')
df = df.set_index('datetime')

print(df.shape)
print(df.isna().sum())
print(df.dtypes)


(8810, 10)
load_mw            0
hour               0
dayofweek          0
month              0
is_weekend         0
lag_1              0
lag_2              0
lag_24             0
rolling_mean_3     0
rolling_mean_24    0
dtype: int64
load_mw              int64
hour                 int64
dayofweek            int64
month                int64
is_weekend           int64
lag_1              float64
lag_2              float64
lag_24             float64
rolling_mean_3     float64
rolling_mean_24    float64
dtype: object


In [4]:
# Feature Engineering
df['hour'] = df.index.hour
df['dayofweek'] = df.index.dayofweek
df['month'] = df.index.month
df['is_weekend'] = df['dayofweek'].apply(lambda x: 1 if x >= 5 else 0)

# Lag Features
df['lag_1'] = df['load_mw'].shift(1)
df['lag_2'] = df['load_mw'].shift(2)
df['lag_24'] = df['load_mw'].shift(24)

# Rolling Features
df['rolling_mean_3'] = df['load_mw'].rolling(window=3).mean()
df['rolling_mean_24'] = df['load_mw'].rolling(window=24).mean()

# Drop rows with NaNs (due to lag/rolling windows)
df = df.dropna()

# Final check
print(df.shape)
print(df.head())


(8786, 10)
                     load_mw  hour  dayofweek  month  is_weekend    lag_1  \
datetime                                                                    
2023-01-07 00:00:00    23673     0          5      1           1  27676.0   
2023-01-07 01:00:00    -7347     1          5      1           1  23673.0   
2023-01-07 02:00:00    -7895     2          5      1           1  -7347.0   
2023-01-07 03:00:00    32692     3          5      1           1  -7895.0   
2023-01-07 04:00:00    -7535     4          5      1           1  32692.0   

                       lag_2   lag_24  rolling_mean_3  rolling_mean_24  
datetime                                                                
2023-01-07 00:00:00  -2934.0  -4573.0    16138.333333      8539.708333  
2023-01-07 01:00:00  27676.0  30667.0    14667.333333      6955.791667  
2023-01-07 02:00:00  23673.0  -6895.0     2810.333333      6914.125000  
2023-01-07 03:00:00  -7347.0  -7032.0     5816.666667      8569.291667  
2023-01-07 

In [5]:
from sklearn.model_selection import train_test_split

# Define features and target
features = ['hour', 'dayofweek', 'month', 'is_weekend',
            'lag_1', 'lag_2', 'lag_24', 'rolling_mean_3', 'rolling_mean_24']
target = 'load_mw'

X = df[features]
y = df[target]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, shuffle=False  # no shuffle to preserve time order
)


In [6]:
from sklearn.ensemble import RandomForestRegressor

model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)


In [7]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

mae = mean_absolute_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred) ** 0.5  
r2 = r2_score(y_test, y_pred)

print(f"ðŸ“Š MAE: {mae:.2f}")
print(f"ðŸ“Š RMSE: {rmse:.2f}")
print(f"ðŸ“Š RÂ² Score: {r2:.4f}")


ðŸ“Š MAE: 814.66
ðŸ“Š RMSE: 1406.75
ðŸ“Š RÂ² Score: 0.9931


In [9]:
import joblib

joblib.dump(model, '../models/rf_model_eia_2023.pkl')


['../models/rf_model_eia_2023.pkl']