In [8]:
import sys, os
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))

import pandas as pd
import numpy as np
import os
import joblib
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder

from src.data_utils import create_window

df = pd.read_csv('../data/raw/electricity_marketing_dataset.csv')
df.head()

Unnamed: 0,timestamp,temperature,humidity,is_weekend,is_holiday,consumer_type,price_signal,historical_avg_demand,voltage_level,grid_frequency,energy_source_mix,demand_category
0,2024-01-01 00:00:00,0.526922,0.265203,0,1,residential,0.469818,0.080994,0.538437,0.228592,0.44054,Medium
1,2024-01-01 01:00:00,0.437412,0.539677,0,1,residential,0.431329,0.080842,0.412133,0.478937,0.312881,Medium
2,2024-01-01 02:00:00,0.548204,0.632031,0,1,commercial,0.712748,0.43479,0.784449,0.211724,0.127635,High
3,2024-01-01 03:00:00,0.671595,0.254353,0,1,commercial,0.708677,0.641761,0.596124,0.483553,0.468987,High
4,2024-01-01 04:00:00,0.423895,0.135163,0,1,residential,0.537836,0.585632,0.535468,0.396965,0.121962,Medium


In [9]:
df['timestamp'] = pd.to_datetime(df['timestamp'])
df['hour_of_day'] = df['timestamp'].dt.hour
df['day_of_week'] = df['timestamp'].dt.dayofweek
df['month'] = df['timestamp'].dt.month

df = df.drop('timestamp', axis=1)

print("Time features created.")
df.head()

Time features created.


Unnamed: 0,temperature,humidity,is_weekend,is_holiday,consumer_type,price_signal,historical_avg_demand,voltage_level,grid_frequency,energy_source_mix,demand_category,hour_of_day,day_of_week,month
0,0.526922,0.265203,0,1,residential,0.469818,0.080994,0.538437,0.228592,0.44054,Medium,0,0,1
1,0.437412,0.539677,0,1,residential,0.431329,0.080842,0.412133,0.478937,0.312881,Medium,1,0,1
2,0.548204,0.632031,0,1,commercial,0.712748,0.43479,0.784449,0.211724,0.127635,High,2,0,1
3,0.671595,0.254353,0,1,commercial,0.708677,0.641761,0.596124,0.483553,0.468987,High,3,0,1
4,0.423895,0.135163,0,1,residential,0.537836,0.585632,0.535468,0.396965,0.121962,Medium,4,0,1


In [10]:
ohe = OneHotEncoder(sparse_output=False, drop='first', dtype=int)
consumer_features = ohe.fit_transform(df[['consumer_type']])

consumer_df = pd.DataFrame(
    consumer_features, 
    columns=ohe.get_feature_names_out(['consumer_type']), 
    index=df.index
)

df = pd.concat([df, consumer_df], axis=1)

df = df.drop('consumer_type', axis=1)

print("One-hot encoding complete.")
df.head()

One-hot encoding complete.


Unnamed: 0,temperature,humidity,is_weekend,is_holiday,price_signal,historical_avg_demand,voltage_level,grid_frequency,energy_source_mix,demand_category,hour_of_day,day_of_week,month,consumer_type_industrial,consumer_type_residential
0,0.526922,0.265203,0,1,0.469818,0.080994,0.538437,0.228592,0.44054,Medium,0,0,1,0,1
1,0.437412,0.539677,0,1,0.431329,0.080842,0.412133,0.478937,0.312881,Medium,1,0,1,0,1
2,0.548204,0.632031,0,1,0.712748,0.43479,0.784449,0.211724,0.127635,High,2,0,1,0,0
3,0.671595,0.254353,0,1,0.708677,0.641761,0.596124,0.483553,0.468987,High,3,0,1,0,0
4,0.423895,0.135163,0,1,0.537836,0.585632,0.535468,0.396965,0.121962,Medium,4,0,1,0,1


In [11]:
n = len(df)

train_end = int(n * 0.70)
val_end = int(n * (0.70 + 0.15))

train_df = df.iloc[:train_end]
val_df = df.iloc[train_end:val_end]
test_df = df.iloc[val_end:]

print(f"Data split chronologically:")
print(f"  Training set shape:   {train_df.shape}")
print(f"  Validation set shape: {val_df.shape}")
print(f"  Test set shape:       {test_df.shape}")

Data split chronologically:
  Training set shape:   (503, 15)
  Validation set shape: (109, 15)
  Test set shape:       (108, 15)


In [20]:
TARGET_COL = 'historical_avg_demand'

feature_cols = [col for col in df.columns if col not in [TARGET_COL, 'demand_category']]

feature_scaler = MinMaxScaler(feature_range=(0, 1))

# FIT the scaler ONLY on the training features
feature_scaler.fit(train_df[feature_cols])

# TRANSFORM all sets
train_features_scaled = feature_scaler.transform(train_df[feature_cols])
val_features_scaled = feature_scaler.transform(val_df[feature_cols])
test_features_scaled = feature_scaler.transform(test_df[feature_cols])

target_scaler = MinMaxScaler(feature_range=(0, 1))

# FIT the scaler ONLY on the training target
target_scaler.fit(train_df[[TARGET_COL]])

# TRANSFORM all sets
train_target_scaled = target_scaler.transform(train_df[[TARGET_COL]])
val_target_scaled = target_scaler.transform(val_df[[TARGET_COL]])
test_target_scaled = target_scaler.transform(test_df[[TARGET_COL]])

joblib.dump(feature_scaler, '../models/feature_scaler.joblib')
joblib.dump(target_scaler, '../models/target_scaler.joblib')

print("Features and target scaled. Scalers saved to 'models/' folder.")


Features and target scaled. Scalers saved to 'models/' folder.


In [18]:
WINDOW_SIZE = 24 

print(f"Applying {WINDOW_SIZE}-hour sliding window...")

X_train, y_train = create_window(
    train_features_scaled, 
    train_target_scaled, 
    WINDOW_SIZE
)

X_val, y_val = create_window(
    val_features_scaled, 
    val_target_scaled, 
    WINDOW_SIZE
)

X_test, y_test = create_window(
    test_features_scaled, 
    test_target_scaled, 
    WINDOW_SIZE
)

print("Sliding window transformation complete.")
print(f"X_train shape: {X_train.shape} | y_train shape: {y_train.shape}")
print(f"X_val shape:   {X_val.shape}   | y_val shape:   {y_val.shape}")
print(f"X_test shape:  {X_test.shape}  | y_test shape:  {y_test.shape}")

Applying 24-hour sliding window...
Sliding window transformation complete.
X_train shape: (479, 24, 13) | y_train shape: (479, 1)
X_val shape:   (85, 24, 13)   | y_val shape:   (85, 1)
X_test shape:  (84, 24, 13)  | y_test shape:  (84, 1)


In [21]:
print("Saving final .npy arrays to 'data/processed/'...")

np.save('../data/processed/X_train.npy', X_train)
np.save('../data/processed/y_train.npy', y_train)

np.save('../data/processed/X_val.npy', X_val)
np.save('../data/processed/y_val.npy', y_val)

np.save('../data/processed/X_test.npy', X_test)
np.save('../data/processed/y_test.npy', y_test)

print("All data is preprocessed and saved.")

Saving final .npy arrays to 'data/processed/'...
All data is preprocessed and saved.
