In [3]:
# Imports


import requests
import zipfile
import io
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
import pytz
import numpy as np
import math
import warnings
import seaborn as sns
import os
import shutil
import datetime

# Suppress specific warnings (in this case, FutureWarnings)
warnings.simplefilter(action='ignore', category=FutureWarning)

from sktime.clustering.k_medoids import TimeSeriesKMedoids
from scipy.fftpack import fft, fftfreq
from sklearn.cluster import KMeans
from sklearn.linear_model import LinearRegression
from sklearn.metrics import silhouette_score, davies_bouldin_score
from tslearn.clustering import KShape
from scipy.signal import welch
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
from sklearn.cluster import DBSCAN
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.neighbors import NearestNeighbors
from sklearn.decomposition import PCA

from scipy.stats import boxcox
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_percentage_error

from tqdm.notebook import tqdm
import pandas as pd


In [11]:
# Load the electricity usage data.
#
# For every cluster the training, validation and test parquet files are joined
# column-wise and transposed, so the original column labels end up as the row
# index. That index is parsed as datetimes, named 'date' and sorted.
# NOTE(review): presumably each remaining column is one customer's series -
# confirm against the dataset layout.
NUM_CLUSTERS = 4

cluster_dfs = []

for i in range(1, NUM_CLUSTERS + 1):
    # axis=1 stitches the three splits side by side; .T then turns the
    # combined column labels into the row index.
    df = pd.concat(
        [
            pd.read_parquet(f'../dataset/cluster_{i}/{split}.parquet')
            for split in ('training', 'validation', 'test')
        ],
        axis=1,
    ).T
    # Parse the labels *before* sorting: sorting the raw labels would order
    # string labels lexicographically, which is only chronological when they
    # happen to be zero-padded ISO dates.
    df.index = pd.to_datetime(df.index)
    df.index.name = 'date'
    df = df.sort_index()
    cluster_dfs.append(df)

In [13]:
# Read the explanatory variables and put them on a daily DatetimeIndex.
explanatory_variables_df = pd.read_parquet(
    '../dataset/combined_explanatory_variables/explanatory_variables.parquet'
)

# Parse the index labels as datetimes so the frame can be sorted and
# re-sampled by date.
explanatory_variables_df.index = pd.to_datetime(explanatory_variables_df.index)

# Order the rows by index, then conform to daily frequency; asfreq('D') adds
# a NaN-filled row for any calendar day missing from the file.
explanatory_variables_df = explanatory_variables_df.sort_index()
explanatory_variables_df = explanatory_variables_df.asfreq('D')

# Preview the first rows.
explanatory_variables_df.head()

Unnamed: 0_level_0,heating_degree_days,cooling_degree_days,precip,precipprob,is_holiday,sunlight_length_hours,is_weekend
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2012-01-01,6.578559,0.0,0.0,0.0,0,10.0,1
2012-01-02,6.659918,0.0,0.111,1.0,0,10.0,0
2012-01-03,7.977833,0.0,0.0,0.0,0,10.0,0
2012-01-04,6.335269,0.0,0.0,0.0,0,10.0,0
2012-01-05,7.437771,0.0,0.0,0.0,0,10.0,0


In [15]:
# Reshape each cluster frame from wide (one column per customer) to long
# (one row per date/customer pair) and tag it with its 1-based cluster number.
melted_dfs = []
for i, df in enumerate(cluster_dfs, start=1):
    melted_df = df.reset_index().melt(
        id_vars=['date'],
        var_name='customer',
        value_name='electricity_usage',
    )
    melted_df['cluster'] = i
    melted_dfs.append(melted_df)

# Stack all clusters row-wise into a single long table.
combined_melted_df = pd.concat(melted_dfs)

# Attach the explanatory variables by date; the inner join keeps only dates
# present on both sides. The result is indexed by (date, customer, cluster).
customer_level_dataset_df = (
    combined_melted_df
    .merge(explanatory_variables_df, on='date', how='inner')
    .set_index(['date', 'customer', 'cluster'])
)
customer_level_dataset_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,electricity_usage,heating_degree_days,cooling_degree_days,precip,precipprob,is_holiday,sunlight_length_hours,is_weekend
date,customer,cluster,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2012-01-02,MT_091,1,3141.260841,6.659918,0.0,0.111,1.0,0,10.0,0
2012-01-03,MT_091,1,3424.616411,7.977833,0.0,0.0,0.0,0,10.0,0
2012-01-04,MT_091,1,3465.81054,6.335269,0.0,0.0,0.0,0,10.0,0
2012-01-05,MT_091,1,3578.052035,7.437771,0.0,0.0,0.0,0,10.0,0
2012-01-06,MT_091,1,3697.96531,8.166583,0.0,0.0,0.0,0,10.0,0


In [17]:
cluster_num = 1