# Forecasting

In [None]:
! pip install pandas openpyxl numpy scikit-learn prophet matplotlib seaborn jupyter ipywidgets

In [None]:
# Enable the autoreload extension in mode 2 so that edits to imported modules
# are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2


In [None]:
import pandas as pd
import numpy as np
from prophet import Prophet
from sklearn.metrics import mean_absolute_error, root_mean_squared_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta

In [None]:
from utils import forecasting

In [None]:
import os 

# Make sure the directory that receives saved figures exists.
# exist_ok=True makes the cell safe to re-run and avoids the check-then-create race.
# Raises FileExistsError if 'output' exists but is not a directory.
os.makedirs('output', exist_ok=True)

# Initial Data Extraction (Only run once)

In [None]:
# dfs = pd.read_excel("data/SES_2024.xlsx.coredownload.xlsx", sheet_name="T3.5",header=[3, 4])


In [None]:
# df_region = dfs.iloc[:49, :]
# df_region

In [None]:
# region_index = df_region[('Overall', 'Unnamed: 0_level_1')].to_list()

In [None]:
# df = df_region.loc[:, df_region.columns.get_level_values(1).astype(str).str.isnumeric()]
# df.index = region_index

# df.columns = pd.to_datetime([f"{year}-{month}-01" for year, month in df.columns])
# df.to_csv('data/region_monthly_electricity_consumption.csv', index=True)


# Data Preparation for Forecasting 

In [None]:
# Load the monthly consumption table; the first CSV column becomes the row index.
df = pd.read_csv('data/region_monthly_electricity_consumption.csv', index_col=0)

# Preview the first rows of the loaded table.
df.head()

In [None]:
# Show the row labels of the loaded table.
df.index

In [None]:
# Build the frames used by the rest of the notebook from the raw table.
# NOTE(review): later cells read 'date', 'region', 'region_category' and
# 'consumption' from df_long — confirm that schema in utils/forecasting.py.
# df_stats is not referenced by any other cell visible in this notebook section.
df_long, df_stats = forecasting.prepare_electricity_data(df=df)
df_long

In [None]:
# Seasonality summary produced by the project helper.
seasonality_df = forecasting.analyze_seasonality(df_long=df_long)

# Tally how often each value appears in the peak-month and low-month columns.
peak_counts = seasonality_df['peak_month'].value_counts()
low_counts = seasonality_df['low_month'].value_counts()

# Stack both tallies into one long table: all 'Peak' rows first, then all 'Low' rows.
months_df = pd.DataFrame(
    {
        'Month': peak_counts.index.append(low_counts.index),
        'Type': ['Peak'] * len(peak_counts) + ['Low'] * len(low_counts),
        'Count': pd.concat([peak_counts, low_counts]).to_numpy(),
    }
)

# Grouped bar chart: one bar per (month, type) pair.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(
    data=months_df,
    x='Month',
    y='Count',
    hue='Type',
    palette='pastel',
    ax=ax,
)
ax.set_title('Frequency of Peak and Low Months Across Regions')
ax.set_ylabel('Count')
ax.set_xlabel('Month')
ax.tick_params(axis='x', labelrotation=45)
ax.legend(title='Month Type')

# Save before showing the figure.
fig.savefig('output/peak_low_months_frequency.png')
plt.show()

In [None]:
# Detect anomalous observations with the project helper, using a z-score threshold of 3.
anomalies = forecasting.identify_anomalies(df_long=df_long, z_score_threshold=3)

# Count anomalies per region and keep the ten regions with the most.
# value_counts() already sorts in descending order and returns a flat index of
# region names. The previous groupby('region')['region'].value_counts() was
# redundant and, depending on the pandas version, produced a two-level
# (region, region) index whose entries are tuples rather than region names.
top_10_anomalous_regions = anomalies['region'].value_counts().head(10)
top_10_anomalous_regions

In [None]:
# Plot the anomalies for the regions selected above (passed via the index of the
# counts Series); save=True asks the helper to also save its output.
# NOTE(review): where and under what name the output is saved is decided inside the helper — confirm there.
forecasting.plot_anomalies(df_long=df_long, anomalies=anomalies, regions_to_plot=top_10_anomalous_regions.index, save=True)

Many of the anomalies fall within the COVID-19 period, when electricity consumption was high due to the lockdown. The Tengah anomalies are due to the region being a new BTO estate.

In [None]:
# Report the date range covered by the data.
# Series.min()/max() are vectorised and skip missing values, whereas the built-in
# min()/max() iterate element by element and give unreliable results when NaT/NaN is present.
print(f"Earliest Date: {df_long['date'].min()}")
print(f"Latest Date: {df_long['date'].max()}")

In [None]:
print("=== Top 10 Regions with Highest Electricity Consumption ===\n")
# Mean consumption per (region_category, region) pair, largest first, top ten rows.
(
    df_long
    .groupby(['region_category', 'region'])['consumption']
    .mean()
    .sort_values(ascending=False)
    .head(10)
)


In [None]:
# Run the project's training helper with region_category='Subzone'.
results = forecasting.train_forecasting_model(
    df_long=df_long,
    region_category='Subzone',
)

# Collect the metrics table returned by the helper and print it as plain text.
metrics_df = forecasting.print_metrics(results)
print("\nModel Performance Metrics:")
print(metrics_df)

# Plot the forecast for every region listed in the metrics table; save=True is
# passed through to the plotting helper.
regions_to_plot = metrics_df['Region']
fig = forecasting.plot_forecasts(
    results=results,
    regions_to_plot=regions_to_plot,
    save=True,
)
plt.show()

# First six rows of the 'Overall' forecast, restricted to the prediction columns.
# NOTE(review): this relies on results containing an 'Overall' key even though
# the model was trained with region_category='Subzone', and on the first six
# rows of 'forecast' being future months rather than fitted history — confirm
# both in utils/forecasting.py.
overall_forecast = (
    results['Overall']['forecast']
    .head(6)
    .loc[:, ['ds', 'yhat', 'yhat_lower', 'yhat_upper']]
)
print("\nForecast for Overall consumption (next 6 months):")
print(overall_forecast.round(2))


In [None]:
# Inspect the stored result entry for 'Clementi'.
# NOTE(review): presumably fails with a KeyError if 'Clementi' was not among the trained regions — verify.
results['Clementi']

In [None]:
# Display the unrounded 'Overall' forecast table built in the previous forecasting cell.
overall_forecast

In [None]:
# Summary statistics of the RMSE column across all rows of the metrics table.
metrics_df[['RMSE']].describe()