## Importing Libraries and Loading the Data

In [1]:
from pathlib import Path

import numpy as np
import pandas as pd

In [2]:
# Read the cleaned dataset. The "timestamp" column becomes the index, and
# parse_dates=True asks pandas to parse that index as datetimes.
df = pd.read_csv(
    "../data/processed/cleaned_energy_data.csv",
    index_col="timestamp",
    parse_dates=True,
)

In [3]:
# Check shape: prints the (rows, columns) tuple of the loaded frame
print(f"Data shape: {df.shape}")

Data shape: (26304, 7)


In [4]:
# Explore the data: index range, column names, non-null counts, dtypes and
# memory usage. DataFrame.info() writes the report to stdout itself and
# returns None, so it must not be wrapped in print() (that would append a
# stray "None" line to the output).
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 26304 entries, 2022-01-01 00:00:00 to 2024-12-31 23:00:00
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   energy_consumption  26304 non-null  float64
 1   temperature_C       26304 non-null  float64
 2   humidity_pct        26304 non-null  float64
 3   hour                26304 non-null  int64  
 4   dayofweek           26304 non-null  int64  
 5   is_weekend          26304 non-null  int64  
 6   is_holiday          26304 non-null  int64  
dtypes: float64(3), int64(4)
memory usage: 1.6 MB
None


In [5]:
# Check the columns in the dataset.
# The header is printed first; the bare `df.columns` on the last line is
# rendered by the notebook as the cell's output value.
print("Columns in the dataset: \n")
df.columns

Columns in the dataset: 



Index(['energy_consumption', 'temperature_C', 'humidity_pct', 'hour',
       'dayofweek', 'is_weekend', 'is_holiday'],
      dtype='object')

In [6]:
# Check the data types of each column (shown as the cell's output value)
df.dtypes

energy_consumption    float64
temperature_C         float64
humidity_pct          float64
hour                    int64
dayofweek               int64
is_weekend              int64
is_holiday              int64
dtype: object

In [7]:
# Check the summary statistics: count, mean, std, min, quartiles and max
# for each numeric column, rendered as a rich table via display().
display(df.describe())

Unnamed: 0,energy_consumption,temperature_C,humidity_pct,hour,dayofweek,is_weekend,is_holiday
count,26304.0,26304.0,26304.0,26304.0,26304.0,26304.0,26304.0
mean,298.441932,18.002317,64.986499,11.5,3.0,0.286496,0.021898
std,64.172603,7.528693,6.284307,6.922318,2.002318,0.452133,0.146353
min,98.530011,0.25,41.18,0.0,0.0,0.0,0.0
25%,249.643739,11.45,60.69,5.75,1.0,0.0,0.0
50%,297.471015,17.97,65.02,11.5,3.0,0.0,0.0
75%,348.869403,24.57,69.25,17.25,5.0,1.0,0.0
max,505.477664,34.49,90.03,23.0,6.0,1.0,1.0


In [8]:
# Check the first few rows of the dataset (head() defaults to 5 rows) to
# sanity-check the values and the timestamp index.
df.head()

Unnamed: 0_level_0,energy_consumption,temperature_C,humidity_pct,hour,dayofweek,is_weekend,is_holiday
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2022-01-01 00:00:00,208.515343,16.08,66.08,0,5,1,1
2022-01-01 01:00:00,178.759515,14.51,70.85,1,5,1,1
2022-01-01 02:00:00,172.203652,14.96,64.12,2,5,1,1
2022-01-01 03:00:00,193.027518,15.5,70.87,3,5,1,1
2022-01-01 04:00:00,182.605914,12.09,72.43,4,5,1,1


## Data Cleaning

In [9]:
# Check missing values: per-column count of null entries in the loaded data.
# sep="\n" puts the Series on its own line without the leading space that
# print's default " " separator would insert after a trailing "\n", which
# misaligned the first row of the table.
print("Missing values in each column:", df.isnull().sum(), sep="\n")

Missing values in each column:
 energy_consumption    0
temperature_C         0
humidity_pct          0
hour                  0
dayofweek             0
is_weekend            0
is_holiday            0
dtype: int64


## Feature Engineering

In [10]:
# Calendar fields derived from the DatetimeIndex:
# month is 1-12, dayofyear is 1-365/366.
ts_index = df.index
df["month"] = ts_index.month
df["dayofyear"] = ts_index.dayofyear

In [11]:
# Cyclical encoding of time features: each periodic field is mapped onto the
# unit circle (sin/cos pair) so the end of a cycle sits next to its start
# (e.g. hour 23 next to hour 0).
# Each tuple is (output prefix, source column, cycle length).
for prefix, column, period in (
    ("hour", "hour", 24),
    ("dow", "dayofweek", 7),
    ("month", "month", 12),
):
    angle = 2 * np.pi * df[column] / period
    df[f"{prefix}_sin"] = np.sin(angle)
    df[f"{prefix}_cos"] = np.cos(angle)

In [12]:
# Weather deltas: change relative to the previous row for each weather
# signal. The first row has no predecessor, so it becomes NaN.
for source_col, delta_col in (
    ("temperature_C", "temp_change"),
    ("humidity_pct", "humidity_change"),
):
    df[delta_col] = df[source_col].diff()

In [13]:
# Lagged copies of the consumption series: lag_k holds the value observed
# k rows earlier, so the first k rows of each lag column are NaN.
lags = [1, 6, 12, 24, 48, 72]
energy = df["energy_consumption"]
for steps_back in lags:
    df[f"lag_{steps_back}"] = energy.shift(periods=steps_back)

In [14]:
# Trailing rolling statistics of consumption. Each window ends at the current
# row, so the first (window - 1) rows of every column are NaN.
energy = df["energy_consumption"]
windows = {"6h": 6, "12h": 12, "24h": 24, "7d": 24 * 7}  # label -> rows

# Rolling means for every window size
for label, size in windows.items():
    df[f"rolling_{label}"] = energy.rolling(size).mean()

# Rolling standard deviations only for the daily and weekly windows
for label in ("24h", "7d"):
    df[f"rolling_{label}_std"] = energy.rolling(windows[label]).std()

In [15]:
# Check missing values after feature engineering: the diff, lag and rolling
# columns are expected to show NaNs in their leading rows.
# sep="\n" puts the Series on its own line without the leading space that
# print's default " " separator would insert after a trailing "\n", which
# misaligned the first row of the table.
print("Missing values in each column:", df.isnull().sum(), sep="\n")

Missing values in each column:
 energy_consumption      0
temperature_C           0
humidity_pct            0
hour                    0
dayofweek               0
is_weekend              0
is_holiday              0
month                   0
dayofyear               0
hour_sin                0
hour_cos                0
dow_sin                 0
dow_cos                 0
month_sin               0
month_cos               0
temp_change             1
humidity_change         1
lag_1                   1
lag_6                   6
lag_12                 12
lag_24                 24
lag_48                 48
lag_72                 72
rolling_6h              5
rolling_12h            11
rolling_24h            23
rolling_7d            167
rolling_24h_std        23
rolling_7d_std        167
dtype: int64


In [16]:
# Shape after feature engineering: (rows, columns) of df with the engineered
# columns added; no rows have been removed from df at this point.
print(f"Shape after feature engineering: {df.shape}")

Shape after feature engineering: (26304, 29)


In [17]:
# Forecast target: the consumption value of the NEXT row (one step ahead).
# shift(-1) pulls each following value up by one position, so the final row
# has no successor and its target is NaN.
df["target"] = df["energy_consumption"].shift(periods=-1)


In [18]:
# Keep only fully populated rows: drop any row containing a NaN (leading rows
# without diff/lag/rolling history, plus the last row, which has no target).
# .copy() detaches the result from df.
df_fe = df.dropna(axis=0, how="any").copy()
print(f"Final Feature Engineered Shape: {df_fe.shape}")

Final Feature Engineered Shape: (26136, 30)


In [19]:
# Save the engineered dataset: every column of df_fe (NaN rows already
# dropped) is written out; to_csv's default index=True keeps the timestamp
# index as the first CSV column.
df_fe.to_csv("../data/processed/feature_engineered_energy_data.csv")

In [20]:
# ------------------------------------------------------
# 9. SEPARATE DATA FOR EACH MODEL TYPE
# ------------------------------------------------------

In [21]:
# -------- A) LSTM FEATURES (minimal) --------
# Current consumption, weather, and the hour / day-of-week cyclical encodings,
# plus the one-step-ahead target. No lag or rolling columns are exported here.
LSTM_FEATURES = [
    "energy_consumption",
    "temperature_C",
    "humidity_pct",
    "hour_sin", "hour_cos",
    "dow_sin", "dow_cos",
    "target"
]
df_lstm = df_fe[LSTM_FEATURES]

# DataFrame.to_csv does not create missing parent directories. This is the
# first write into ../data/features, so make sure the folder exists to keep
# the notebook runnable on a fresh checkout.
Path("../data/features").mkdir(parents=True, exist_ok=True)
df_lstm.to_csv("../data/features/features_LSTM.csv")

In [22]:
# -------- B) XGBOOST FEATURES (full) --------
# The column list is assembled from named groups; the resulting order is
# calendar, cyclical, weather, lags, rolling stats, then the target.
# NOTE(review): "is_holiday" and the current-hour "energy_consumption" are not
# exported here even though this set is labelled "full" - confirm intentional.
xgb_calendar = ["hour", "dayofweek", "is_weekend", "month", "dayofyear"]
xgb_cyclical = ["hour_sin", "hour_cos", "dow_sin", "dow_cos", "month_sin", "month_cos"]
xgb_weather = ["temperature_C", "humidity_pct", "temp_change", "humidity_change"]
xgb_lags = ["lag_1", "lag_6", "lag_12", "lag_24", "lag_48", "lag_72"]
xgb_rolling = [
    "rolling_6h", "rolling_12h", "rolling_24h", "rolling_7d",
    "rolling_24h_std", "rolling_7d_std",
]

XGB_FEATURES = [
    *xgb_calendar,
    *xgb_cyclical,
    *xgb_weather,
    *xgb_lags,
    *xgb_rolling,
    "target",
]

# Select the columns in list order and write them with the timestamp index
df_xgb = df_fe.loc[:, XGB_FEATURES]
df_xgb.to_csv("../data/features/features_XGBOOST.csv")

In [23]:
# -------- C) MLP FEATURES (medium) --------
# Cyclical encodings, weather, short lags (up to 24 rows) and rolling means
# (up to 24 rows), followed by the target. Raw calendar integers, humidity
# change, long lags and rolling std columns are left out of this set.
# NOTE(review): the current-hour "energy_consumption" and "is_holiday" are not
# exported here - confirm intentional.
mlp_cyclical = ["hour_sin", "hour_cos", "dow_sin", "dow_cos", "month_sin", "month_cos"]
mlp_weather = ["temperature_C", "humidity_pct", "temp_change"]
mlp_lags = ["lag_1", "lag_6", "lag_12", "lag_24"]
mlp_rolling = ["rolling_6h", "rolling_12h", "rolling_24h"]

MLP_FEATURES = [
    *mlp_cyclical,
    *mlp_weather,
    *mlp_lags,
    *mlp_rolling,
    "target",
]

# Select the columns in list order and write them with the timestamp index
df_mlp = df_fe.loc[:, MLP_FEATURES]
df_mlp.to_csv("../data/features/features_MLP.csv")

In [24]:
# ------------------------------------------------------
# 10. Print output confirmation
# ------------------------------------------------------
# The file names are listed by hand; they are not derived from the writes.
saved_files = ("features_LSTM.csv", "features_XGBOOST.csv", "features_MLP.csv")

print("\nSaved:")
for file_name in saved_files:
    print(f" - {file_name}")
print("\nFeature Engineering completed successfully!")


Saved:
 - features_LSTM.csv
 - features_XGBOOST.csv
 - features_MLP.csv

Feature Engineering completed successfully!
