In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Echo the full path of every file found under the Kaggle input directory.
input_file_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Basic Information

In [None]:
import pandas as pd

# Define the file paths
train_path = '/kaggle/input/energy-consumption-dataset-linear-regression/train_energy_data.csv'
test_path = '/kaggle/input/energy-consumption-dataset-linear-regression/test_energy_data.csv'

# Load the datasets. Only the two reads sit inside the try body: they are the
# statements that raise FileNotFoundError when the dataset is not attached.
try:
    train_df = pd.read_csv(train_path)
    test_df = pd.read_csv(test_path)
except FileNotFoundError as e:
    print(f"Error: {e}. Please ensure the dataset is added to your Kaggle notebook.")
else:
    # 1. Basic Information (Columns, Data Types, Null Values)
    # DataFrame.info() prints its report itself and returns None, so it is
    # called directly; wrapping it in print() added a stray "None" line.
    print("--- Train Dataset Info ---")
    train_df.info()

    print("\n--- Test Dataset Info ---")
    test_df.info()

    # 2. First 5 Rows (Quick Preview)
    print("\n--- Train Dataset Preview (Head) ---")
    display(train_df.head())

    # 3. Statistical Summary (Mean, Std, Min, Max, etc.)
    print("\n--- Train Dataset Statistics ---")
    display(train_df.describe())

    # 4. Check for Missing Values
    print("\n--- Missing Values in Train Set ---")
    print(train_df.isnull().sum())

# Statistical Distribution Analysis

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Numerical features plus the target; num_cols is reused by later cells.
num_cols = ['Square Footage', 'Number of Occupants', 'Appliances Used', 'Average Temperature', 'Energy Consumption']
numeric_frame = train_df[num_cols]

# One histogram panel per numerical column.
numeric_frame.hist(bins=20, figsize=(15, 10))
plt.gcf().suptitle("Feature Distributions")
plt.show()

# Side-by-side boxplots of the same columns to spot outliers.
outlier_fig, outlier_ax = plt.subplots(figsize=(12, 6))
sns.boxplot(data=numeric_frame, ax=outlier_ax)
outlier_ax.set_title("Outlier Detection")
plt.show()

# Categorical Impact (Feature vs. Target)

In [None]:
# Target distribution per category: one panel for each categorical feature.
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

for panel, category_col in zip(axes, ('Building Type', 'Day of Week')):
    sns.boxplot(x=category_col, y='Energy Consumption', data=train_df, ax=panel)
    panel.set_title(f'Energy Consumption by {category_col}')

# plt.xticks acts on the current axes, i.e. the right-hand (Day of Week) panel.
plt.xticks(rotation=45)
plt.show()

# Correlation & Multicollinearity

In [None]:
# Pairwise correlations between the numerical columns, annotated to 2 decimals.
correlation_matrix = train_df[num_cols].corr()

corr_fig, corr_ax = plt.subplots(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f', ax=corr_ax)
corr_ax.set_title("Correlation Matrix")
plt.show()

# Linearity Check

In [None]:
# Regression-fitted scatter plots of three features against the target.
linearity_features = ['Square Footage', 'Average Temperature', 'Appliances Used']
sns.pairplot(
    train_df,
    x_vars=linearity_features,
    y_vars='Energy Consumption',
    height=5,
    aspect=0.8,
    kind='reg',
)
plt.show()

# Feature Engineering & Preprocessing

In [None]:
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# Target column
y = train_df['Energy Consumption']

# Object columns that are cast to the 'category' dtype for LightGBM
cat_features = ['Building Type', 'Day of Week']

# Feature matrix: everything except the target, categoricals cast in one pass
X = (
    train_df
    .drop(columns='Energy Consumption')
    .astype({name: 'category' for name in cat_features})
)

# Hold out 20% of the rows for validation (fixed seed for a repeatable split)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Model Initialization and Training

In [None]:
import lightgbm as lgb

# 1. Regressor configuration
# verbose=-1 suppresses the internal [Info] and [Warning] messages
lgbm_settings = dict(
    n_estimators=1000,
    learning_rate=0.05,
    num_leaves=31,
    random_state=42,
    n_jobs=-1,
    verbose=-1,
)
lgbm_model = lgb.LGBMRegressor(**lgbm_settings)

# 2. Fit with validation monitoring
# early_stopping halts after 50 rounds without improvement on the eval set;
# log_evaluation(period=10) prints the metric every 10 rounds to keep the log short.
training_callbacks = [
    lgb.early_stopping(stopping_rounds=50),
    lgb.log_evaluation(period=10),
]
lgbm_model.fit(
    X_train,
    y_train,
    eval_set=[(X_val, y_val)],
    eval_metric='rmse',
    callbacks=training_callbacks,
)

# Model Evaluation

In [None]:
# Score the fitted model on the validation split
y_pred = lgbm_model.predict(X_val)

r2 = r2_score(y_val, y_pred)
rmse = np.sqrt(mean_squared_error(y_val, y_pred))

print(
    f"Validation RMSE: {rmse:.4f}",
    f"Validation R2 Score: {r2:.4f}",
    sep="\n",
)

# Feature Importance Visualization

In [None]:
# Plot Feature Importance
# lgb.plot_importance opens a figure of its own unless it is handed an Axes,
# so a preceding plt.figure(figsize=...) only leaves an empty figure behind and
# its size never reaches the chart. Create the axes first and pass them in.
importance_fig, importance_ax = plt.subplots(figsize=(10, 6))
lgb.plot_importance(lgbm_model, ax=importance_ax, importance_type='split', max_num_features=10)
importance_ax.set_title("LightGBM Feature Importance")
plt.show()

# Final Prediction on Test Set

In [None]:
# Prepare Test Set
# This cell appends 'Predicted_Energy' to test_df further down. Dropping that
# column here (when present) keeps the cell re-runnable: otherwise a second run
# would pass the previous predictions to the model as an extra feature.
X_test = (
    test_df
    .drop(columns='Energy Consumption')
    .drop(columns='Predicted_Energy', errors='ignore')
)
for col in cat_features:
    X_test[col] = X_test[col].astype('category')

# Predict
test_predictions = lgbm_model.predict(X_test)

# Add to test_df for viewing
test_df['Predicted_Energy'] = test_predictions
print(test_df[['Building Type', 'Energy Consumption', 'Predicted_Energy']].head())

# Randomized Search

In [None]:
from sklearn.model_selection import RandomizedSearchCV
import lightgbm as lgb

# Define the parameter distribution
param_dist = {
    'n_estimators': [500, 1000],
    'learning_rate': [0.01, 0.05, 0.1],
    'num_leaves': [20, 31, 50, 70],
    'max_depth': [-1, 10, 20],
    'min_child_samples': [10, 20, 30],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bytree': [0.8, 0.9, 1.0]
}

# Initialize the base model
# LightGBM only applies row subsampling ('subsample', i.e. bagging_fraction)
# when the bagging frequency is non-zero. With the default subsample_freq=0 the
# three 'subsample' candidates above would all train identical models, so
# bagging is enabled on every iteration here.
lgbm = lgb.LGBMRegressor(random_state=42, verbose=-1, subsample_freq=1)

# Initialize RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=lgbm,
    param_distributions=param_dist,
    n_iter=50,             # Number of parameter settings sampled
    scoring='neg_mean_squared_error',
    cv=3,                  # 3-fold cross-validation
    verbose=1,
    random_state=42,
    n_jobs=-1
)

# Fit the search
random_search.fit(X_train, y_train)

print(f"Best Parameters (Random Search): {random_search.best_params_}")

# Optuna

In [None]:
import optuna
from sklearn.metrics import mean_squared_error

def objective(trial):
    """Optuna objective: train one LightGBM regressor and return its validation RMSE.

    Samples a hyper-parameter set from ``trial``, fits on the notebook-level
    ``X_train`` / ``y_train`` and scores on ``X_val`` / ``y_val``.

    Returns:
        float: root mean squared error on the validation split (lower is better).
    """
    # Define the search space
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 500, 2000),
        'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.1, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 20, 150),
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        'lambda_l1': trial.suggest_float('lambda_l1', 1e-8, 10.0, log=True),
        'lambda_l2': trial.suggest_float('lambda_l2', 1e-8, 10.0, log=True),
        'verbose': -1
    }

    # Train model with current trial parameters
    model = lgb.LGBMRegressor(**params)
    model.fit(X_train, y_train)

    # Evaluate on validation set.
    # The RMSE is taken as sqrt(MSE), the same way the evaluation cell does it:
    # the `squared=False` keyword of mean_squared_error is deprecated and has
    # been removed from recent scikit-learn releases, where it raises TypeError.
    preds = model.predict(X_val)
    rmse = float(np.sqrt(mean_squared_error(y_val, preds)))
    return rmse

# Search for the parameter set with the lowest validation RMSE (50 trials).
study = optuna.create_study(direction='minimize')
study.optimize(func=objective, n_trials=50)

print(
    f"Best Trial RMSE: {study.best_value}",
    f"Best Parameters (Optuna): {study.best_params}",
    sep="\n",
)

# Final Model & Reproducibility

In [None]:
import lightgbm as lgb
import pandas as pd
import numpy as np
import joblib # For saving the model

# Fixed seed so the final fit is repeatable
SEED = 42

# Hyper-parameters copied from the best Optuna trial, plus the seed and a
# verbosity setting that silences LightGBM's log output.
best_params = dict(
    n_estimators=1538,
    learning_rate=0.03402359183605272,
    num_leaves=103,
    max_depth=5,
    min_child_samples=85,
    lambda_l1=0.001852547182792142,
    lambda_l2=0.0001965599894239328,
    random_state=SEED,
    verbose=-1,
)

# Preprocessing Function (consistent across train/test)
def preprocess_data(df):
    """Return a cleaned copy of ``df`` ready for the final model.

    Spaces in column names are replaced with underscores, and the
    'Building_Type' and 'Day_of_Week' columns are cast to the pandas
    'category' dtype. The input frame is left untouched.
    """
    underscored = df.rename(columns=lambda label: label.replace(' ', '_'))
    category_dtypes = {'Building_Type': 'category', 'Day_of_Week': 'category'}
    return underscored.astype(category_dtypes)

# Build the final training matrices from the full training frame
final_train = preprocess_data(train_df)
y_final = final_train['Energy_Consumption']
X_final = final_train.drop(columns='Energy_Consumption')

# Fit the final model with the tuned parameters (fit returns the estimator)
final_model = lgb.LGBMRegressor(**best_params).fit(X_final, y_final)

# Persist the fitted estimator for demo/deployment
joblib.dump(final_model, 'energy_lgbm_model.pkl')
print("Model finalized and saved.")

# Advanced Visualizations for Presentation

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# 1. Actual vs Predicted (held-out test set)
# final_model was fitted on every row of train_df, and X_val is a subset of
# those rows, so scoring on X_val would compare the model against data it was
# trained on. The separate test file is used instead. 'Predicted_Energy' is
# dropped first because an earlier cell may have appended it to test_df.
holdout = preprocess_data(test_df.drop(columns='Predicted_Energy', errors='ignore'))
y_holdout = holdout['Energy_Consumption']
holdout_preds = final_model.predict(holdout.drop(columns='Energy_Consumption'))

scatter_fig, scatter_ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(x=y_holdout, y=holdout_preds, alpha=0.5, ax=scatter_ax)
# Dashed diagonal = perfect prediction
scatter_ax.plot([y_holdout.min(), y_holdout.max()], [y_holdout.min(), y_holdout.max()], '--r', lw=2)
scatter_ax.set_title('Actual vs. Predicted Energy Consumption')
scatter_ax.set_xlabel('Actual Values')
scatter_ax.set_ylabel('Predictions')
plt.show()

# 2. Feature Importance (SHAP values are best for presentations)
# If SHAP is not installed, use plot_importance.
# The axes are created up front and passed in: without ax=, plot_importance
# opens its own figure and a preceding plt.figure(...) stays empty.
gain_fig, gain_ax = plt.subplots(figsize=(10, 8))
lgb.plot_importance(final_model, ax=gain_ax, importance_type='gain', precision=0)
gain_ax.set_title('Key Drivers of Energy Consumption')
plt.show()

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error

def load_and_clean_data(filepath):
    """Read a CSV with a 'Datetime' column and return a gap-free hourly frame.

    Steps: parse 'Datetime', sort by it, use it as the index, keep the first
    row for each duplicated timestamp, resample to hourly means and
    forward-fill hours that have no observation.

    Args:
        filepath: path (or any source accepted by ``pd.read_csv``) of the CSV.
            Replace 'Datetime' below if the timestamp column is named differently.

    Returns:
        pandas.DataFrame indexed by hourly timestamps.
    """
    # 1. Ingestion
    df = pd.read_csv(filepath)

    # 2. Cleaning: convert to datetime and sort.
    # A stable sort keeps rows with equal timestamps in file order, so that
    # keep='first' below always retains the row that appeared first in the file.
    df['Datetime'] = pd.to_datetime(df['Datetime'])
    df = df.sort_values('Datetime', kind='mergesort').set_index('Datetime')

    # Handle duplicates or missing values
    df = df[~df.index.duplicated(keep='first')]
    # Lowercase 'h' is the current hourly alias; uppercase 'H' is deprecated
    # in recent pandas releases.
    df = df.resample('h').mean().ffill()  # Ensure hourly continuity
    return df

def feature_engineering(df):
    """Add calendar features taken from the frame's datetime index.

    Writes 'hour', 'day_of_week', 'month' and 'is_weekend' into ``df`` itself
    (the input is modified in place) and returns that same object.
    'is_weekend' is 1 where day_of_week is 5 or 6, otherwise 0.
    """
    stamps = df.index
    calendar_parts = (
        ('hour', stamps.hour),
        ('day_of_week', stamps.dayofweek),
        ('month', stamps.month),
    )
    for column, values in calendar_parts:
        df[column] = values

    weekend_mask = df['day_of_week'].isin((5, 6))
    df['is_weekend'] = weekend_mask.astype(int)
    return df

def establish_baseline(df, target_col, horizon=24):
    """Score a persistence baseline: the prediction is the value ``horizon`` rows earlier.

    Adds a 'baseline_pred' column to ``df`` (the input frame is modified in
    place), prints MAE and RMSE of that baseline and returns the rows on which
    it could be evaluated.

    Args:
        df: frame containing ``target_col``. With the default horizon the
            "same hour yesterday" reading assumes one row per hour.
        target_col: name of the column to predict.
        horizon: number of rows to look back; defaults to 24.

    Returns:
        pandas.DataFrame: rows of ``df`` where both the target and the
        baseline prediction are present.

    Raises:
        ValueError: if no row has both a target and a baseline value.
    """
    # 4. Baseline: shift the target forward by `horizon` rows
    df['baseline_pred'] = df[target_col].shift(horizon)

    # Drop only the rows that lack a target or a baseline value. A bare
    # dropna() would also discard rows with NaNs in unrelated columns.
    valid_data = df.dropna(subset=[target_col, 'baseline_pred'])
    if valid_data.empty:
        raise ValueError(
            f"No rows left to evaluate the baseline for '{target_col}' "
            f"with horizon={horizon}."
        )

    # MAE / RMSE computed directly from the residuals
    residuals = valid_data[target_col] - valid_data['baseline_pred']
    mae = residuals.abs().mean()
    rmse = np.sqrt((residuals ** 2).mean())

    print("--- Baseline Performance (Persistence Model) ---")
    print(f"MAE: {mae:.2f}")
    print(f"RMSE: {rmse:.2f}")
    return valid_data

# Usage
# df = load_and_clean_data('energy_consumption.csv')
# df = feature_engineering(df)
# df_final = establish_baseline(df, 'MW_Consumption')

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

def prepare_energy_data(file_path, window_size=24):
    """Turn a timestamped CSV into sliding-window arrays for sequence models.

    The CSV is read with 'Datetime' as a parsed index and sorted by it;
    'hour', 'day_of_week' and 'month' columns are appended; every column is
    min-max scaled; then each sample is the ``window_size`` scaled rows that
    precede a row, and its label is that row's first scaled column.

    Returns:
        tuple: (X, y, scaler) where X stacks the windows, y holds the labels
        and scaler is the fitted MinMaxScaler.
    """
    # 1. Load and order chronologically
    frame = pd.read_csv(file_path, parse_dates=['Datetime'], index_col='Datetime').sort_index()

    # 2. Time-based features from the index
    stamps = frame.index
    frame['hour'] = stamps.hour
    frame['day_of_week'] = stamps.dayofweek
    frame['month'] = stamps.month

    # 3. Scale every column
    scaler = MinMaxScaler()
    scaled = scaler.fit_transform(frame)

    # 4. Windows: rows [end - window_size, end) are the input, row `end` the label
    window_ends = range(window_size, len(scaled))
    windows = [scaled[end - window_size:end] for end in window_ends]
    targets = [scaled[end, 0] for end in window_ends]  # first column is the prediction target

    return np.array(windows), np.array(targets), scaler

# Example Usage:
# X_train, y_train, scaler = prepare_energy_data('pjm_energy_data.csv')
# print(f"Input Shape (Samples, Time Steps, Features): {X_train.shape}")

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, mean_squared_error

# 1. Load the dataset
# Adjust the path if your file name is different (e.g., 'train_energy_data.csv')
file_path = '/kaggle/input/energy-consumption-dataset-linear-regression/Energy_consumption.csv'
try:
    df = pd.read_csv(file_path)
except FileNotFoundError:
    print("File not found. Please check the dataset path.")
    # Demonstration fallback: random rows with the same columns as the dataset
    n_dummy_rows = 1000
    df = pd.DataFrame({
        'Building Type': np.random.choice(['Residential', 'Commercial', 'Industrial'], n_dummy_rows),
        'Square Footage': np.random.randint(1000, 50000, n_dummy_rows),
        'Number of Occupants': np.random.randint(1, 100, n_dummy_rows),
        'Appliances Used': np.random.randint(1, 50, n_dummy_rows),
        'Average Temperature': np.random.uniform(10, 35, n_dummy_rows),
        'Day of Week': np.random.choice(['Weekday', 'Weekend'], n_dummy_rows),
        'Energy Consumption': np.random.uniform(1000, 7000, n_dummy_rows),
    })

# 2. Preprocessing
# Encode each categorical column with its own LabelEncoder. Refitting a single
# encoder for the second column would overwrite the first column's class
# mapping, so its codes could no longer be translated back to labels.
label_encoders = {}
for categorical_col in ('Building Type', 'Day of Week'):
    label_encoders[categorical_col] = LabelEncoder()
    df[categorical_col] = label_encoders[categorical_col].fit_transform(df[categorical_col])

X = df.drop('Energy Consumption', axis=1)
y = df['Energy Consumption']

# Splitting data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scaling features: statistics come from the training split only and are then
# applied to the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 3. Model Training & Stability (Loss vs Iterations)
# Using SGDRegressor to capture the loss curve over iterations.
# tol=None disables the convergence-based stopping criterion. The previous
# value, -np.infty, fails twice over: the np.infty alias was removed in
# NumPy 2.0, and scikit-learn's parameter validation rejects a negative tol.
model = SGDRegressor(max_iter=1, tol=None, warm_start=True, random_state=42)
train_loss = []
test_loss = []

epochs = 100
for epoch in range(epochs):
    # Each partial_fit call performs one pass over the training data
    model.partial_fit(X_train_scaled, y_train)

    y_train_pred = model.predict(X_train_scaled)
    y_test_pred = model.predict(X_test_scaled)

    train_loss.append(mean_squared_error(y_train, y_train_pred))
    test_loss.append(mean_squared_error(y_test, y_test_pred))

# Train vs held-out MSE per epoch, saved to disk and shown inline
stability_fig, stability_ax = plt.subplots(figsize=(10, 5))
stability_ax.plot(train_loss, label='Training Loss (MSE)')
stability_ax.plot(test_loss, label='Validation Loss (MSE)')
stability_ax.set_title('Training Stability (Loss vs Epochs)')
stability_ax.set_xlabel('Epochs')
stability_ax.set_ylabel('Mean Squared Error')
stability_ax.legend()
stability_ax.grid(True)
stability_fig.savefig('training_stability.png')
plt.show()

# 4. Confusion Matrix (Error Analysis)
# Since regression is continuous, we bin values into categories: Low, Medium, High
def bin_energy(values):
    """Map continuous values to bin codes 0, 1 or 2 (Low / Medium / High).

    The two cut points are the 0.33 and 0.66 quantiles of the notebook-level
    target series ``y``; each value is coded by the interval it falls in.
    """
    inner_edges = np.quantile(y, [0.33, 0.66])
    return np.digitize(values, inner_edges)

y_test_binned = bin_energy(y_test)
y_pred_binned = bin_energy(y_test_pred)

labels = ['Low', 'Medium', 'High']
# labels=[0, 1, 2] pins the matrix to 3x3 in Low/Medium/High order. Without it,
# a bin that occurs in neither the actual nor the predicted codes is dropped
# and the matrix no longer lines up with the three tick labels.
cm = confusion_matrix(y_test_binned, y_pred_binned, labels=[0, 1, 2])

plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=labels, yticklabels=labels)
plt.title('Confusion Matrix (Error Analysis via Binning)')
plt.xlabel('Predicted Category')
plt.ylabel('Actual Category')
plt.savefig('confusion_matrix.png')
plt.show()

print("Analysis Complete. Images saved as 'training_stability.png' and 'confusion_matrix.png'.")

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import r2_score, confusion_matrix, ConfusionMatrixDisplay

# 1. Read the train and test CSVs
train_path = '/kaggle/input/energy-consumption-dataset-linear-regression/train_energy_data.csv'
test_path = '/kaggle/input/energy-consumption-dataset-linear-regression/test_energy_data.csv'
train_df, test_df = pd.read_csv(train_path), pd.read_csv(test_path)

# 2. Split each frame into features and the 'Energy Consumption' target
X_train, y_train = train_df.drop(columns='Energy Consumption'), train_df['Energy Consumption']
X_test, y_test = test_df.drop(columns='Energy Consumption'), test_df['Energy Consumption']

categorical_cols = ['Building Type', 'Day of Week']
numerical_cols = ['Square Footage', 'Number of Occupants', 'Appliances Used', 'Average Temperature']

# Standardise the numerical columns and one-hot encode the categorical ones;
# categories unseen during fitting are ignored at transform time.
numeric_step = ('num', StandardScaler(), numerical_cols)
categorical_step = ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])

# Fit on the training features only, then apply the same transform to the test set
X_train_proc = preprocessor.fit_transform(X_train)
X_test_proc = preprocessor.transform(X_test)

# 3. Training Stability (Accuracy vs Loss)
# Using SGDRegressor to record metrics per "epoch".
# random_state is fixed (as in the earlier SGD cell) so the curves and the
# confusion matrix derived from this model are identical on every run.
model = SGDRegressor(max_iter=1, tol=None, warm_start=True, learning_rate='constant',
                     eta0=0.01, random_state=42)
epochs = 100
loss_history = []
accuracy_history = []

for epoch in range(epochs):
    # One pass over the training data per call
    model.partial_fit(X_train_proc, y_train)
    y_pred_train = model.predict(X_train_proc)

    # Track Mean Squared Error (Loss) and R^2 Score (Accuracy Proxy)
    mse = np.mean((y_train - y_pred_train) ** 2)
    r2 = r2_score(y_train, y_pred_train)

    loss_history.append(mse)
    accuracy_history.append(max(0, r2))  # Ensure R2 doesn't go below 0 for plotting

# Loss (left axis, red) and clamped R^2 (right axis, blue) over the epochs
epoch_axis = range(epochs)
fig, ax1 = plt.subplots(figsize=(10, 6))

ax1.plot(epoch_axis, loss_history, color='tab:red', label='Training Loss')
ax1.set_xlabel('Epochs')
ax1.set_ylabel('Loss (MSE)', color='tab:red')
ax1.tick_params(axis='y', labelcolor='tab:red')

# Second y-axis sharing the same x-axis
ax2 = ax1.twinx()
ax2.plot(epoch_axis, accuracy_history, color='tab:blue', label='R^2 Score')
ax2.set_ylabel('R^2 Score (Accuracy)', color='tab:blue')
ax2.tick_params(axis='y', labelcolor='tab:blue')

plt.title('Training Stability: Accuracy ($R^2$) vs Loss (MSE)')
fig.tight_layout()
plt.show()

# 4. Confusion Matrix (Error Analysis via Binning)
# Predictions of the trained model on the preprocessed test features
y_pred_test = model.predict(X_test_proc)

# Convert continuous values into 3 categories: Low, Medium, High
def bin_energy(values, bins):
    """Return, for each value, the index of the interval of ``bins`` it falls in.

    Thin wrapper around ``np.digitize`` with its default left-closed
    intervals: a value equal to an edge is placed in the bin above that edge.
    """
    bin_codes = np.digitize(values, bins, right=False)
    return bin_codes

# Define bins based on the distribution of training data
bins = np.percentile(y_train, [33, 66])
cat_labels = ['Low', 'Medium', 'High']

y_test_binned = bin_energy(y_test, bins)
y_pred_binned = bin_energy(y_pred_test, bins)

# labels=[0, 1, 2] keeps the matrix 3x3 and in Low/Medium/High order even when
# one of the bins occurs in neither the actual nor the predicted codes;
# otherwise the matrix shrinks and no longer matches the three tick labels.
cm = confusion_matrix(y_test_binned, y_pred_binned, labels=[0, 1, 2])

plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=cat_labels, yticklabels=cat_labels)
plt.title('Error Analysis: Confusion Matrix (Binned Consumption)')
plt.xlabel('Predicted Consumption Category')
plt.ylabel('Actual Consumption Category')
plt.show()

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.metrics import confusion_matrix, classification_report

# 1. Read the train and test CSVs
data_dir = '/kaggle/input/energy-consumption-dataset-linear-regression'
train_df = pd.read_csv(data_dir + '/train_energy_data.csv')
test_df = pd.read_csv(data_dir + '/test_energy_data.csv')

# 2. Keep only the numeric columns as features; the target is split off
y_train = train_df['Energy Consumption']
y_test = test_df['Energy Consumption']
X_train = train_df.select_dtypes(include=[np.number]).drop(columns=['Energy Consumption'])
X_test = test_df.select_dtypes(include=[np.number]).drop(columns=['Energy Consumption'])

# 3. Fit ordinary least squares and predict the test targets
model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# 4. DISCRETIZATION (The "Error Analysis" Trick)
# We define 3 categories: Low, Medium, High based on the distribution of energy
def categorize_energy(value, bins):
    """Label ``value`` as 'Low', 'Medium' or 'High' using two cut points.

    'Low' if value <= bins[0], 'Medium' if value <= bins[1], else 'High'.
    """
    return 'Low' if value <= bins[0] else ('Medium' if value <= bins[1] else 'High')

# Use percentiles to create fair bins (33rd and 66th percentile)
bins = np.percentile(y_train, [33, 66])

y_test_cat = [categorize_energy(v, bins) for v in y_test]
y_pred_cat = [categorize_energy(v, bins) for v in y_pred]
labels = ['Low', 'Medium', 'High']

# 5. Generate and Plot Confusion Matrix
cm = confusion_matrix(y_test_cat, y_pred_cat, labels=labels)

plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Reds',
            xticklabels=labels, yticklabels=labels)

plt.xlabel('Predicted Energy Usage')
plt.ylabel('Actual Energy Usage')
plt.title('Error Analysis: Confusion Matrix (Binned Regression Results)')
plt.show()

# 6. Print Classification Report for deeper insights
# labels= must accompany target_names. Without it the report orders the
# classes by sorted value (High, Low, Medium) and assigns target_names by
# position, so every row would carry the wrong name.
print(classification_report(y_test_cat, y_pred_cat, labels=labels, target_names=labels))

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, r2_score

# 1. Load the dataset
# Assuming paths: /kaggle/input/energy-consumption-dataset-linear-regression/train_energy_data.csv
try:
    train_df = pd.read_csv('/kaggle/input/energy-consumption-dataset-linear-regression/train_energy_data.csv')
    test_df = pd.read_csv('/kaggle/input/energy-consumption-dataset-linear-regression/test_energy_data.csv')
except FileNotFoundError:
    print("Files not found. Please ensure the paths are correct.")
    # Re-raise instead of calling exit(): inside a notebook exit() asks the
    # kernel to shut down, which discards all state. Re-raising stops this
    # cell and shows which path is missing.
    raise

# 2. Preprocessing
# Target: 'Energy Consumption'
# Features based on dataset inspection: Building Type, Square Footage, Number of Occupants,
# Appliances Used, Average Temperature, Day of Week
X = train_df.drop('Energy Consumption', axis=1)
y = train_df['Energy Consumption']

categorical_cols = ['Building Type', 'Day of Week']
numerical_cols = ['Square Footage', 'Number of Occupants', 'Appliances Used', 'Average Temperature']

# handle_unknown='ignore' matches the equivalent preprocessing cell earlier in
# the notebook: a category that appears only in the test file is encoded as
# all zeros instead of raising during transform.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
    ])

# Fit on the training features only, then reuse the fitted transform on the test set
X_processed = preprocessor.fit_transform(X)
X_test_processed = preprocessor.transform(test_df.drop('Energy Consumption', axis=1))
y_test = test_df['Energy Consumption']

# 3. Training Stability (Loss vs. Epochs)
# random_state is fixed (as in the first SGD cell of this notebook) so the
# curves and the confusion matrix built from this model are repeatable.
model = SGDRegressor(max_iter=1, tol=None, warm_start=True, learning_rate='constant',
                     eta0=0.01, random_state=42)
epochs = 100
losses = []
accuracies = []  # Using R^2 as proxy

for epoch in range(epochs):
    # One pass over the training data per call
    model.partial_fit(X_processed, y)
    y_pred_train = model.predict(X_processed)

    # Calculate Loss (MSE)
    mse = np.mean((y - y_pred_train) ** 2)
    losses.append(mse)

    # Calculate R^2 (Accuracy proxy)
    r2 = r2_score(y, y_pred_train)
    accuracies.append(max(0, r2))  # Clamp to 0 for visualization

# Two side-by-side panels: training loss (left) and clamped R^2 (right)
stability_fig, (loss_ax, r2_ax) = plt.subplots(1, 2, figsize=(12, 5))

loss_ax.plot(range(epochs), losses, color='red', label='Training Loss (MSE)')
loss_ax.set_title('Training Stability: Loss over Epochs')
loss_ax.set_xlabel('Epochs')
loss_ax.set_ylabel('Loss')
loss_ax.legend()

r2_ax.plot(range(epochs), accuracies, color='blue', label='R^2 Score ("Accuracy")')
r2_ax.set_title('Training Stability: R^2 Score over Epochs')
r2_ax.set_xlabel('Epochs')
r2_ax.set_ylabel('R^2 Score')
r2_ax.legend()

stability_fig.tight_layout()
stability_fig.savefig('stability_plots.png')

# 4. Confusion Matrix (Error Analysis via Binning)
# Test-set predictions of the trained model
y_pred_test = model.predict(X_test_processed)

# Cut points for the Low / Medium / High categories: 33rd and 66th
# percentiles of the training target
bins = np.percentile(y, [33, 66])
def categorize(val):
    """Label ``val`` as 'Low', 'Medium' or 'High' against the notebook-level ``bins``.

    'Low' if val <= bins[0], 'Medium' if val <= bins[1], otherwise 'High'.
    """
    return 'Low' if val <= bins[0] else ('Medium' if val <= bins[1] else 'High')

# Category labels, in the order used for the matrix rows and columns
labels = ['Low', 'Medium', 'High']
y_test_cat = list(map(categorize, y_test))
y_pred_cat = list(map(categorize, y_pred_test))

cm = confusion_matrix(y_test_cat, y_pred_cat, labels=labels)

# Annotated heatmap of actual (rows) vs predicted (columns) categories
matrix_fig, matrix_ax = plt.subplots(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=labels, yticklabels=labels, ax=matrix_ax)
matrix_ax.set_title('Error Analysis: Confusion Matrix (Binned Energy Consumption)')
matrix_ax.set_xlabel('Predicted Category')
matrix_ax.set_ylabel('Actual Category')
matrix_fig.savefig('confusion_matrix.png')

print("Plots saved: stability_plots.png and confusion_matrix.png")