# In [1]:
# train_energy_model_no_holidays.py

# 1) If you don't have it already, install:
#    pip install pandas numpy scikit-learn joblib

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import joblib

# 2) Read the raw dataset (point this at wherever your CSV lives)
df = pd.read_csv('smart_home_energy_consumption_large.csv')

# 3) Feature engineering: fuse Date and Time into one timestamp column and
#    derive the calendar features from it
df['Datetime'] = pd.to_datetime(df['Date'] + ' ' + df['Time'])
stamp = df['Datetime'].dt
df['hour'] = stamp.hour
df['day_of_week'] = stamp.dayofweek
df['month'] = stamp.month
# pandas numbers Monday as 0, so 5 and 6 are Saturday and Sunday
df['is_weekend'] = (df['day_of_week'] >= 5).astype(int)

# 3a) Cyclical encodings: map hour (0-23) and month (shifted to 0-11) onto
#     the unit circle so the wrap-around points sit next to each other
hour_angle = 2 * np.pi * df['hour'] / 24
month_angle = 2 * np.pi * (df['month'] - 1) / 12
df['hour_sin'] = np.sin(hour_angle)
df['hour_cos'] = np.cos(hour_angle)
df['month_sin'] = np.sin(month_angle)
df['month_cos'] = np.cos(month_angle)

# 4) Remove the raw/helper columns, skipping any that are not in the frame
raw_columns = ('Home ID', 'Date', 'Time', 'Datetime')
to_drop = [name for name in raw_columns if name in df.columns]
# Dropping an empty list leaves the columns unchanged, so no guard is needed
df = df.drop(columns=to_drop)

# 5) Separate the target y from the feature matrix X
target_column = 'Energy Consumption (kWh)'
y = df[target_column]
X = df.drop(columns=[target_column])

# 6) Declare which columns get standardised and which get one-hot encoded
numeric_features = [
    'Outdoor Temperature (°C)',
    'Household Size',
    'hour',
    'day_of_week',
    'month',
    'is_weekend',
    'hour_sin',
    'hour_cos',
    'month_sin',
    'month_cos',
]
categorical_features = [
    'Appliance Type',
    'Season',
]

# Numeric columns are scaled; categorical columns are one-hot encoded, with
# categories not seen during fit ignored rather than raising at predict time
numeric_step = ('num', StandardScaler(), numeric_features)
categorical_step = (
    'cat',
    OneHotEncoder(handle_unknown='ignore'),
    categorical_features,
)
preprocessor = ColumnTransformer(
    transformers=[numeric_step, categorical_step],
)

# 7) Chain preprocessing and the linear regressor into a single estimator
pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('regressor', LinearRegression()),
    ],
)

# 8) Hold out 20% of the rows for testing (fixed seed for a repeatable split)
splits = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = splits

# 9) Fit preprocessing + regressor on the training portion only
pipeline.fit(X_train, y_train)

# 10) Score the held-out portion
y_pred = pipeline.predict(X_test)
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # square root of the MSE, reported below in kWh

print(f"MSE:  {mse:.4f}")
print(f"RMSE: {rmse:.4f} kWh")
print(f"R²:   {r2:.4f}")

# 11) Persist the fitted pipeline (preprocessing included) to disk
joblib.dump(pipeline, 'energy_predictor.pkl')
print("Saved model to energy_predictor.pkl")

# 12) Helper to predict on new data
def predict_energy(input_df, model=None):
    """
    Predict energy consumption for new rows.

    Rebuilds the engineered time features the training script creates
    (hour, day_of_week, month, is_weekend and the sin/cos encodings of hour
    and month), removes the raw 'Home ID'/'Date'/'Time'/'Datetime' columns
    when present, and hands the resulting frame to ``model.predict``.

    input_df must include at least:
      - 'Date' (YYYY-MM-DD string)
      - 'Time' (HH:MM:SS string)
      - all other feature columns:
         'Outdoor Temperature (°C)', 'Household Size',
         'Appliance Type', 'Season'
    input_df is not modified; the features are built on a copy.

    model is optional: any fitted object exposing ``predict(DataFrame)``,
    e.g. the result of joblib.load('energy_predictor.pkl'). When it is
    omitted, the module-level ``pipeline`` is used, so existing calls of
    the form predict_energy(df) behave as before.

    Returns whatever ``model.predict`` returns for the engineered frame.

    Raises KeyError if 'Date' or 'Time' is missing from input_df.
    """
    # Fail early with a readable message; the exception type matches what a
    # plain column lookup on a missing column would raise.
    missing = [name for name in ('Date', 'Time') if name not in input_df.columns]
    if missing:
        raise KeyError(f"input_df is missing required column(s): {missing}")

    # Only fall back to the global when no model was supplied, so the
    # function also works in a process that merely loaded a saved model.
    if model is None:
        model = pipeline

    df_new = input_df.copy()
    df_new['Datetime'] = pd.to_datetime(df_new['Date'] + ' ' + df_new['Time'])
    stamp = df_new['Datetime'].dt
    df_new['hour'] = stamp.hour
    df_new['day_of_week'] = stamp.dayofweek
    df_new['month'] = stamp.month
    # pandas numbers Monday as 0, so 5 and 6 are Saturday and Sunday
    df_new['is_weekend'] = df_new['day_of_week'].isin([5, 6]).astype(int)

    # Cyclical encodings, computed exactly as in the training section
    hour_angle = 2 * np.pi * df_new['hour'] / 24
    month_angle = 2 * np.pi * (df_new['month'] - 1) / 12
    df_new['hour_sin'] = np.sin(hour_angle)
    df_new['hour_cos'] = np.cos(hour_angle)
    df_new['month_sin'] = np.sin(month_angle)
    df_new['month_cos'] = np.cos(month_angle)

    # errors='ignore' skips any of these columns that are not present
    # (e.g. 'Home ID' is optional in the input)
    df_new = df_new.drop(
        columns=['Home ID', 'Date', 'Time', 'Datetime'],
        errors='ignore',
    )

    return model.predict(df_new)

# Example usage: a one-row frame (one list entry per column) with the raw
# columns predict_energy expects
new_data = pd.DataFrame({
    'Date': ['2025-04-10'],
    'Time': ['15:30:00'],
    'Outdoor Temperature (°C)': [22.5],
    'Household Size': [3],
    'Appliance Type': ['Refrigerator'],
    'Season': ['Spring'],
})
predicted_kwh = predict_energy(new_data)
print(predicted_kwh)


# Output of the run above:
# MSE:  0.3410
# RMSE: 0.5840 kWh
# R²:   0.7575
# Saved model to energy_predictor.pkl
# [1.50286041]
