<a href="https://colab.research.google.com/github/fabriziobasso/Colab_backup/blob/main/03_Handling_Missing_Data_(Long_Gaps)_v0.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# from google.colab import auth
# auth.authenticate_user()

In [None]:
%%capture
# Clone the repository
!git clone https://github.com/PacktPublishing/Modern-Time-Series-Forecasting-with-Python-2E.git

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import os
import plotly.io as pio
pio.templates.default = "plotly_white"
import pandas as pd
from pathlib import Path
from tqdm.auto import tqdm
import missingno as msno
from itertools import cycle
from sklearn.metrics import mean_absolute_error
from IPython.display import display, HTML
# %load_ext autoreload
# %autoreload 2
np.random.seed()
tqdm.pandas()

# Navigate to the repository's root directory
%cd Modern-Time-Series-Forecasting-with-Python-2E

from src.utils.data_utils import compact_to_expanded
from src.imputation.interpolation import SeasonalInterpolation

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
%cd

In [None]:
# Folder holding the preprocessed London Smart Meters parquet files that this notebook reads.
preprocessed = Path(
    "/content/drive/MyDrive/Books/Modern Time Series Forecasting/Data/data/london_smart_meters/data/london_smart_meters/preprocessed"
)
# Make sure the figure output folder exists (no error when it is already there).
os.makedirs(
    "/content/drive/MyDrive/Books/Modern Time Series Forecasting/Chapter II/imgs/chapter_2",
    exist_ok=True,
)

In [None]:
assert preprocessed.is_dir(), "You have to run 02 - Preprocessing London Smart Meter Dataset.ipynb in Chapter02 before running this notebook"

In [None]:
def format_plot(fig, legends = None, font_size=15, title_font_size=20):
    """Apply the notebook's common layout to a Plotly figure and return it.

    Args:
        fig: Figure to format; it is updated in place via ``for_each_trace``
            and ``update_layout``.
        legends: Optional list of trace names. Names are assigned to traces in
            order and cycled if there are more traces than names.
        font_size: Font size for the legend, axis titles and tick labels.
        title_font_size: Font size for the figure title.

    Returns:
        The same ``fig`` object, for chaining.
    """
    if legends:
        names = cycle(legends)
        fig.for_each_trace(lambda t: t.update(name=next(names)))
    # ``titlefont`` is a deprecated alias that newer Plotly releases reject;
    # the supported spelling nests the font under ``title``.
    fig.update_layout(
        autosize=False,
        width=900,
        height=500,
        title=dict(
            x=0.5,
            xanchor="center",
            yanchor="top",
            font=dict(size=title_font_size),
        ),
        legend_title=None,
        legend=dict(
            font=dict(size=font_size),
            orientation="h",
            yanchor="bottom",
            y=0.98,
            xanchor="right",
            x=1,
        ),
        yaxis=dict(
            title=dict(text="Value", font=dict(size=font_size)),
            tickfont=dict(size=font_size),
        ),
        xaxis=dict(
            title=dict(text="Day", font=dict(size=font_size)),
            tickfont=dict(size=font_size),
        ),
    )
    return fig

### Reading Blocks 0-7

In [None]:
# Load the merged blocks 0-7; when the parquet file is absent, show a warning banner
# instead of a traceback (block_df is then left undefined for the cells below).
try:
    block_df = pd.read_parquet(preprocessed / "london_smart_meters_merged_block_0-7.parquet")
except FileNotFoundError:
    display(HTML("""
    <div class="alert alert-block alert-warning">
    <b>Warning!</b> File not found. Please make sure you have run 02 - Preprocessing London Smart Meter Dataset.ipynb in Chapter02
    </div>
    """))
else:
    display(block_df.head())

Let's take all the consumers in one block for our exercise. Keep in mind that you can do the same with more blocks, as long as the data fits in your RAM.

In [None]:
#compact_to_expanded??

In [None]:
# Convert the rows of block_7 from the compact form to the expanded form,
# keyed by the meter id (LCLid).
exp_block_df = compact_to_expanded(
    block_df[block_df["file"] == "block_7"],
    timeseries_col="energy_consumption",
    static_cols=["frequency", "series_length", "stdorToU", "Acorn", "Acorn_grouped", "file"],
    time_varying_cols=[
        "holidays", "visibility", "windBearing", "temperature", "dewPoint",
        "pressure", "apparentTemperature", "windSpeed", "precipType", "icon",
        "humidity", "summary",
    ],
    ts_identifier="LCLid",
)

exp_block_df.head()

### Missing Data and Advanced Imputation Techniques

In [None]:
# Pivot to wide form: one row per timestamp, one column per meter (LCLid),
# cell values are energy_consumption. Missing readings show up as NaN.
plot_df = pd.pivot_table(exp_block_df, index="timestamp", columns="LCLid", values="energy_consumption")
# Nullity matrix of the wide frame. Because the index is a datetime index, `freq`
# controls how the time axis is labelled (here: monthly).
# NOTE(review): recent pandas versions deprecate the "M" alias in favour of "ME" —
# confirm which alias the installed pandas/missingno pair expects.
msno.matrix(plot_df, freq="M", fontsize=10)
#plt.savefig('/content/drive/MyDrive/Books/Modern Time Series Forecasting/Chapter II/imgs/chapter_2/imgs/missing_no_full.png', bbox_inches='tight')
plt.show()

In [None]:
# Taking a single time series from the block
ts_df = exp_block_df[exp_block_df.LCLid=="MAC000193"].set_index("timestamp")

We know that there is some missing data between 2012-09-30 and 2012-10-31. But instead of picking a window where data is actually missing, let's pick a window where data is present, artificially remove it, and then look at techniques to fill it. This way, we have a ground truth against which to measure how well a particular method works.

In [None]:
msno.matrix(ts_df["2012-10-13": "2012-10-24"], freq="D", fontsize=10)
#plt.savefig('imgs/chapter_2/missing_no_mac000193.png', bbox_inches='tight')
plt.show()

We see that there are two days of missing information, and normally we would proceed to fill in the gaps using some of the techniques we will be reviewing. But to see how effective the different techniques are, we will artificially create a missing section and see how well we are able to impute it.

In [None]:
# The dates between which we are nulling out the time series.
# This slice is used with .loc on the datetime index in the following cells.
window = slice("2012-10-07", "2012-10-08")
# Copy of the target column; the values inside `window` are set to NaN in the next cell,
# while `energy_consumption` keeps the ground truth used for scoring.
ts_df['energy_consumption_missing'] = ts_df.energy_consumption
# Display the frame to confirm the new column was added.
ts_df

In [None]:
# Artificially blank out the chosen window in the copy of the target column
ts_df.loc[window, "energy_consumption_missing"] = np.nan
# Creating a new window with a few days added on either side for visualization purposes
# (2 days before the gap starts, 3 days after the gap's end date)
vis_window = slice(pd.to_datetime(window.start) - pd.Timedelta(days=2), pd.to_datetime(window.stop) + pd.Timedelta(days=3))

In [None]:
fig = px.line(ts_df[vis_window], y="energy_consumption_missing", title=f"MAC000193 Energy Consumption between {vis_window.start.strftime('%Y-%m-%d')} and {vis_window.stop.strftime('%Y-%m-%d')}")
fig = format_plot(fig)
#fig.write_image("imgs/chapter_2/mac000193_missing.png")
fig.show()

### Missing Data Imputation

In [None]:
null_mask = ts_df.energy_consumption_missing.isnull()
null_mask

### Repeating Patterns

#### - **Last Day**

In [None]:
# Exploratory preview: the same two columns are (re)built in the next cell before imputing.
#Shifting 48 steps to get previous day
ts_df["prev_day"] = ts_df['energy_consumption'].shift(48)
# Start the imputed column as a copy of the series that contains the gap
ts_df['prev_day_imputed'] =  ts_df['energy_consumption_missing']
# Show only the rows that need imputing (null_mask marks NaNs in energy_consumption_missing)
ts_df.loc[null_mask,:]
#ts_df.head(50)

In [None]:
#Shifting 48 steps to get previous day
# NOTE(review): the shift is taken from `energy_consumption` (ground truth), not from
# `energy_consumption_missing`. Assuming a regular half-hourly index, the second day of
# the 2-day window is therefore filled with true values of the first day, which are
# supposed to be missing — the reported MAE is presumably optimistic. Confirm intent.
ts_df["prev_day"] = ts_df['energy_consumption'].shift(48)
#Using the shifted column to fill missing
ts_df['prev_day_imputed'] =  ts_df['energy_consumption_missing']
ts_df.loc[null_mask,"prev_day_imputed"] = ts_df.loc[null_mask,"prev_day"]
# Score the imputed values against the ground truth inside the blanked window
mae = mean_absolute_error(ts_df.loc[window, "prev_day_imputed"], ts_df.loc[window, "energy_consumption"])
print(f"MAE for Last Day Imputation: {mae}")

In [None]:
plot_df = pd.melt(ts_df.loc[window,['energy_consumption','prev_day_imputed']].reset_index(), id_vars='timestamp', var_name="series")
fig = px.line(plot_df, x="timestamp", y=["value"], line_dash="series", title=f"Imputing with Previous Day | MAE={mae:.3f}")
fig = format_plot(fig, ['Original', 'Previous Day Imputed'])
#fig.write_image("imgs/chapter_2/previous_day_imputation.png")
fig

#### - **Hourly Profile**

In [None]:
#Create a column with the Hour from timestamp
ts_df["hour"] = ts_df.index.hour
#Calculate hourly average consumption.
#The profile is built from the series that contains the artificial gap, so the ground
#truth we score against below is not used to construct the imputation (no leakage).
#mean() skips NaN, so the blanked window simply does not contribute.
hourly_profile = (
    ts_df.groupby(['hour'])['energy_consumption_missing']
    .mean()
    .reset_index()
    .rename(columns={"energy_consumption_missing": "hourly_profile"})
)
#Look the profile up by hour instead of merging: this keeps the datetime index and
#makes the cell safe to re-run (a repeated merge would create *_x/*_y columns).
ts_df["hourly_profile"] = ts_df["hour"].map(hourly_profile.set_index("hour")["hourly_profile"])

#Using the hourly profile to fill missing
ts_df['hourly_profile_imputed'] = ts_df['energy_consumption_missing']
ts_df.loc[null_mask,"hourly_profile_imputed"] = ts_df.loc[null_mask,"hourly_profile"]
#Score the imputed values against the ground truth inside the blanked window
mae = mean_absolute_error(ts_df.loc[window, "hourly_profile_imputed"], ts_df.loc[window, "energy_consumption"])
print(f"MAE for Hourly Profile Imputation: {mae}")

In [None]:
plot_df = pd.melt(ts_df.loc[window,['energy_consumption','hourly_profile_imputed']].reset_index(), id_vars='timestamp', var_name="series")
fig = px.line(plot_df, x="timestamp", y=["value"], line_dash="series", title=f"Imputing with Hourly Profile | MAE={mae:.3f}")
fig = format_plot(fig, ['Original', 'Hourly Profile Imputed'])
#fig.write_image("imgs/chapter_2/hourly_profile_imputation.png")
fig

#### - **Weekday-Hourly Profile**

In [None]:
#Create a column with the weekday from timestamp (the "hour" column comes from the hourly-profile cell)
ts_df["weekday"] = ts_df.index.weekday
#Calculate weekday-hourly average consumption.
#The profile is built from the series that contains the artificial gap, so the ground
#truth we score against below is not used to construct the imputation (no leakage).
profile_groups = ts_df.groupby(['weekday','hour'])['energy_consumption_missing']
day_hourly_profile = (
    profile_groups.mean()
    .reset_index()
    .rename(columns={"energy_consumption_missing": "day_hourly_profile"})
)
#Broadcast each (weekday, hour) mean back onto its rows. Unlike a merge this keeps the
#datetime index and makes the cell safe to re-run.
ts_df["day_hourly_profile"] = profile_groups.transform("mean")

#Using the day-hourly profile to fill missing
ts_df['day_hourly_profile_imputed'] = ts_df['energy_consumption_missing']
ts_df.loc[null_mask,"day_hourly_profile_imputed"] = ts_df.loc[null_mask,"day_hourly_profile"]
#Score the imputed values against the ground truth inside the blanked window
mae = mean_absolute_error(ts_df.loc[window, "day_hourly_profile_imputed"], ts_df.loc[window, "energy_consumption"])
print(f"MAE for Weekday-Hourly Profile Imputation: {mae}")

In [None]:
plot_df = pd.melt(ts_df.loc[window,['energy_consumption','day_hourly_profile_imputed']].reset_index(), id_vars='timestamp', var_name="series")
fig = px.line(plot_df, x="timestamp", y=["value"], line_dash="series", title=f"Imputing with Week Day-Hourly Profile | MAE={mae:.3f}")
fig = format_plot(fig, ['Original', 'Week Day-Hourly Profile Imputed'])
#fig.write_image("imgs/chapter_2/day_hourly_profile_imputation.png")
fig

In [None]:
# Hourly Profiles by week Day
px.line(ts_df.loc["2012-10-15": "2012-10-21",['day_hourly_profile','weekday']].reset_index(), x="timestamp", y="day_hourly_profile", color='weekday', title="Hourly Profiles by Weekday (0-Monday - 6-Sunday)")


#### - **Seasonal Interpolation**

In [None]:
from src.imputation.interpolation import SeasonalInterpolation

In [None]:
# Seasonal interpolation using 48*7 as the seasonal period.
# seasonal period is the period after which you expect the pattern to repeat
# (48 steps are treated as one day elsewhere in this notebook, so 48*7 is one week).
# The input is the gap-containing series reshaped to a single-column 2D array.
# NOTE(review): SeasonalInterpolation is a project class; the meaning of the keyword
# arguments is taken from their names — confirm against src/imputation/interpolation.py.
recovered_matrix_seas_interp_weekday_half_hour = SeasonalInterpolation(seasonal_period=48*7,
                                                     decomposition_strategy="additive",
                                                     interpolation_strategy="spline",
                                                     interpolation_args={"order":3},
                                                     min_value=0).fit_transform(ts_df.energy_consumption_missing.values.reshape(-1,1))

# Store the recovered series and score it against the ground truth inside the blanked window
ts_df['seas_interp_weekday_half_hour_imputed'] = recovered_matrix_seas_interp_weekday_half_hour
mae_weekday_half_hour = mean_absolute_error(ts_df.loc[window, "seas_interp_weekday_half_hour_imputed"], ts_df.loc[window, "energy_consumption"])

In [None]:
# Same seasonal interpolation as the previous cell, but with a seasonal period of 48 steps
# (treated as one day elsewhere in this notebook) instead of one week.
recovered_matrix_seas_interp_half_hour = SeasonalInterpolation(seasonal_period=48,
                                                     decomposition_strategy="additive",
                                                     interpolation_strategy="spline",
                                                     interpolation_args={"order":3},
                                                     min_value=0).fit_transform(ts_df.energy_consumption_missing.values.reshape(-1,1))

# Store the recovered series and score it against the ground truth inside the blanked window
ts_df['seas_interp_half_hour_imputed'] = recovered_matrix_seas_interp_half_hour
mae_half_hour = mean_absolute_error(ts_df.loc[window, "seas_interp_half_hour_imputed"], ts_df.loc[window, "energy_consumption"])

In [None]:
# recovered_matrix_seas_interp_quarter_hour = SeasonalInterpolation(seasonal_period=48*91,
#                                                      decomposition_strategy="additive",
#                                                      interpolation_strategy="spline",
#                                                      interpolation_args={"order":3},
#                                                      min_value=0).fit_transform(ts_df.energy_consumption_missing.values.reshape(-1,1))

# ts_df['seas_interp_quarter_imputed'] = recovered_matrix_seas_interp_quarter_hour
# mae_quarter_hour = mean_absolute_error(ts_df.loc[window, "seas_interp_quarter_imputed"], ts_df.loc[window, "energy_consumption"])

In [None]:
365/4

In [None]:
plot_df = pd.melt(ts_df.loc[window,['energy_consumption', "seas_interp_half_hour_imputed", 'seas_interp_weekday_half_hour_imputed']].reset_index(), id_vars='timestamp', var_name="series")
fig = px.line(plot_df, x="timestamp", y=["value"], line_dash="series", title=f"Imputing with Seasonal Interpolation <br> MAE Half Hourly={mae_half_hour:.3f} | MAE Weekday-Half Hourly={mae_weekday_half_hour:.3f}")
fig = format_plot(fig, ['Original', 'Half Hourly Profile Imputed', 'Week Day-Half Hourly Profile Imputed'], title_font_size=16)
#fig.write_image("imgs/chapter_2/seasonal_interpolation_imputation.png")
fig

### Summary

In [None]:
imputed_columns = [col for col in ts_df.columns if "imputed" in col]
original_col = "energy_consumption"


In [None]:
# Ground-truth values inside the artificially blanked window.
act = ts_df.loc[window, original_col].values
# MAE of every imputed column against the ground truth, keyed by column name.
mae_d = {
    imputed_col: mean_absolute_error(act, ts_df.loc[window, imputed_col].values)
    for imputed_col in imputed_columns
}

In [None]:
mae_d.keys()

In [None]:
# Human-readable label for each imputed column.
imputation_labels = {
    "prev_day_imputed": "Previous Day",
    "hourly_profile_imputed": "Hourly Profile",
    "day_hourly_profile_imputed": "Day-Hourly Profile",
    "seas_interp_weekday_half_hour_imputed": "Seasonal Interpolation Weekday-Half Hourly",
    "seas_interp_half_hour_imputed": "Seasonal Interpolation Half Hourly",
}

# One row per imputation method: column name (later replaced by its label) and its MAE.
res_df = pd.DataFrame.from_dict(mae_d, orient="index").reset_index()
res_df.columns = ["Imputation", "MAE"]
res_df["Imputation"] = res_df["Imputation"].map(imputation_labels)

In [None]:
res_df

In [None]:
plot_df = pd.melt(ts_df.loc[window,['energy_consumption', "seas_interp_half_hour_imputed", 'seas_interp_weekday_half_hour_imputed']].reset_index(), id_vars='timestamp', var_name="series")
fig = px.line(plot_df, x="timestamp", y=["value"], line_dash="series", title=f"Imputing with Seasonal Interpolation <br> MAE Half Hourly={mae_half_hour:.3f} | MAE Weekday-Half Hourly={mae_weekday_half_hour:.3f}")
fig = format_plot(fig, ['Original', 'Half Hourly Profile Imputed', 'Week Day-Half Hourly Profile Imputed'], title_font_size=16)
#fig.write_image("imgs/chapter_2/seasonal_interpolation_imputation.png")
fig

In [None]:
px.line(ts_df.loc[window, imputed_columns+[original_col]].reset_index(), x="timestamp", y=imputed_columns+[original_col])

In [None]:
%cd ../..

## 01 **Visualizing Time Series**

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import os
import plotly.io as pio
pio.templates.default = "plotly_white"
import pandas as pd
from pathlib import Path
from tqdm.auto import tqdm
from IPython.display import display, HTML
# %load_ext autoreload
# %autoreload 2
np.random.seed()
tqdm.pandas()

In [None]:
os.makedirs("/content/drive/MyDrive/Books/Modern Time Series Forecasting/Chapter II/imgs/chapter_3", exist_ok=True)
preprocessed = Path("/content/drive/MyDrive/Books/Modern Time Series Forecasting/Data/data/london_smart_meters/data/london_smart_meters/preprocessed")
assert preprocessed.is_dir(), "You have to run 02 - Preprocessing London Smart Meter Dataset.ipynb in Chapter02 before running this notebook"

### 01-01 Reading Blocks 0-7

In [None]:
from src.utils.data_utils import compact_to_expanded

In [None]:
from itertools import cycle
from src.utils import plotting_utils

def format_plot(fig, legends = None, xlabel="Time", ylabel="Value", figsize=(500,900), font_size=15, title_font_size=20):
    """Apply the notebook's common layout to a Plotly figure and return it.

    Args:
        fig: Figure to format; it is updated in place via ``for_each_trace``
            and ``update_layout``.
        legends: Optional list of trace names. Names are assigned to traces in
            order and cycled if there are more traces than names.
        xlabel: Title of the x axis.
        ylabel: Title of the y axis.
        figsize: ``(height, width)`` of the figure in pixels.
        font_size: Font size for the legend, axis titles and tick labels.
        title_font_size: Font size for the figure title.

    Returns:
        The same ``fig`` object, for chaining.
    """
    if legends:
        names = cycle(legends)
        fig.for_each_trace(lambda t: t.update(name=next(names)))
    height, width = figsize
    # ``titlefont`` is a deprecated alias that newer Plotly releases reject;
    # the supported spelling nests the font under ``title``.
    fig.update_layout(
        autosize=False,
        width=width,
        height=height,
        title=dict(
            x=0.5,
            xanchor="center",
            yanchor="top",
            # Honour the parameter (it was previously ignored in favour of a fixed 20).
            font=dict(size=title_font_size),
        ),
        legend_title=None,
        legend=dict(
            font=dict(size=font_size),
            orientation="h",
            yanchor="bottom",
            y=0.98,
            xanchor="right",
            x=1,
        ),
        yaxis=dict(
            title=dict(text=ylabel, font=dict(size=font_size)),
            tickfont=dict(size=font_size),
        ),
        xaxis=dict(
            title=dict(text=xlabel, font=dict(size=font_size)),
            tickfont=dict(size=font_size),
        ),
    )
    return fig

In [None]:
# Load the merged blocks 0-7; when the parquet file is absent, show a warning banner
# instead of a traceback (block_df is then left undefined for the cells below).
try:
    block_df = pd.read_parquet(preprocessed / "london_smart_meters_merged_block_0-7.parquet")
except FileNotFoundError:
    display(HTML("""
    <div class="alert alert-block alert-warning">
    <b>Warning!</b> File not found. Please make sure you have run 02 - Preprocessing London Smart Meter Dataset.ipynb in Chapter02
    </div>
    """))
else:
    display(block_df.head(2))

Let's take all the consumers in one block for our exercise. Keep in mind that you can do the same with more blocks, as long as the data fits in your RAM.

In [None]:
# Convert the rows of block_7 from the compact form to the expanded form,
# keyed by the meter id (LCLid).
exp_block_df = compact_to_expanded(
    block_df[block_df["file"] == "block_7"],
    timeseries_col="energy_consumption",
    static_cols=["frequency", "series_length", "stdorToU", "Acorn", "Acorn_grouped", "file"],
    time_varying_cols=[
        "holidays", "visibility", "windBearing", "temperature", "dewPoint",
        "pressure", "apparentTemperature", "windSpeed", "precipType", "icon",
        "humidity", "summary",
    ],
    ts_identifier="LCLid",
)

exp_block_df.head()

In [None]:
exp_block_df.LCLid.unique()

#### **MAC000193**

In [None]:
# Taking a single time series from the block
ts_df_MAC000193 = exp_block_df[exp_block_df.LCLid=="MAC000193"].set_index("timestamp")
ts_df_MAC000193.head()

In [None]:
# Calendar features derived from the datetime index (columns are added in the same order
# as before). The two *_name columns are created directly as ordered categoricals so that
# plots sort chronologically instead of alphabetically.
ts_df_MAC000193["weekday_name"] = pd.Categorical(
    ts_df_MAC000193.index.day_name(),
    categories=["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"],
    ordered=True,
)
ts_df_MAC000193["weekday"] = ts_df_MAC000193.index.weekday
ts_df_MAC000193["week"] = ts_df_MAC000193.index.isocalendar().week
ts_df_MAC000193["day"] = ts_df_MAC000193.index.day
ts_df_MAC000193["hour"] = ts_df_MAC000193.index.hour
ts_df_MAC000193["date"] = ts_df_MAC000193.index.date
ts_df_MAC000193["month"] = ts_df_MAC000193.index.month
ts_df_MAC000193["month_name"] = pd.Categorical(
    ts_df_MAC000193.index.month_name(),
    categories=["January", "February", "March", "April", "May", "June", "July", "August", "September", "October", "November", "December"],
    ordered=True,
)
ts_df_MAC000193["year"] = ts_df_MAC000193.index.year

In [None]:
# Preview the frame with the new calendar columns
ts_df_MAC000193.head()

#### **MAC001768**

In [None]:
# Taking a single time series from the block
ts_df_MAC001768 = exp_block_df[exp_block_df.LCLid=="MAC001768"].set_index("timestamp")
ts_df_MAC001768.head()

In [None]:
# Calendar features derived from the datetime index (columns are added in the same order
# as before). The two *_name columns are created directly as ordered categoricals so that
# plots sort chronologically instead of alphabetically.
ts_df_MAC001768["weekday_name"] = pd.Categorical(
    ts_df_MAC001768.index.day_name(),
    categories=["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"],
    ordered=True,
)
ts_df_MAC001768["weekday"] = ts_df_MAC001768.index.weekday
ts_df_MAC001768["week"] = ts_df_MAC001768.index.isocalendar().week
ts_df_MAC001768["day"] = ts_df_MAC001768.index.day
ts_df_MAC001768["hour"] = ts_df_MAC001768.index.hour
ts_df_MAC001768["date"] = ts_df_MAC001768.index.date
ts_df_MAC001768["month"] = ts_df_MAC001768.index.month
ts_df_MAC001768["month_name"] = pd.Categorical(
    ts_df_MAC001768.index.month_name(),
    categories=["January", "February", "March", "April", "May", "June", "July", "August", "September", "October", "November", "December"],
    ordered=True,
)
ts_df_MAC001768["year"] = ts_df_MAC001768.index.year

In [None]:
# Preview the frame with the new calendar columns
ts_df_MAC001768.head()

In [None]:
#Interpolating Missing values
# Both series are passed through SeasonalInterpolation with a seasonal period of 48*7 steps
# (48 steps are treated as one day elsewhere in this notebook, so this is one week).
# Only seasonal_period is passed; all other settings are the class defaults.
# The result overwrites `energy_consumption` in place, so the raw values (with gaps)
# are no longer available from these frames after this cell.
from src.imputation.interpolation import SeasonalInterpolation

ts_df_MAC000193['energy_consumption'] = SeasonalInterpolation(seasonal_period=48*7).fit_transform(ts_df_MAC000193.energy_consumption.values.reshape(-1,1))
ts_df_MAC001768['energy_consumption'] = SeasonalInterpolation(seasonal_period=48*7).fit_transform(ts_df_MAC001768.energy_consumption.values.reshape(-1,1))

## **Line Charts**

### Basic Line Plot

In [None]:
fig = px.line(ts_df_MAC000193, y="energy_consumption", title="Energy Consumption for MAC000193", )
#fig = px.line(ts_df_MAC000193, y="energy_consumption", title="Energy Consumption for MAC000193"
fig = format_plot(fig, ylabel="Energy Consumption")
#fig.write_image("/content/drive/MyDrive/Books/Modern Time Series Forecasting/Chapter II/imgs/chapter_3/line_plot.png")
fig.show()

In [None]:
ts_df_MAC000193["rolling_monthly_avg"] = ts_df_MAC000193["energy_consumption"].rolling(window=48*30).mean()

In [None]:
fig = px.line(ts_df_MAC000193, y="rolling_monthly_avg", title="Rolling Monthly Average Energy Consumption for MAC000193")
#fig = px.line(ts_df_MAC000193, y="energy_consumption", title="Energy Consumption for MAC000193")
fig = format_plot(fig, ylabel="Energy Consumption")
#fig.write_image("imgs/chapter_3/rolling_avg_line_plot.png")
fig.show()

### Rolling Average Plot

In [None]:
ts_df_MAC001768["rolling_monthly_avg"] = ts_df_MAC001768["energy_consumption"].rolling(window=48*30).mean()

In [None]:
# Line plot of the energy consumption series for the second household.
# The title names MAC001768, matching the frame that is plotted.
fig = px.line(ts_df_MAC001768, y="energy_consumption", title="Energy Consumption for MAC001768")
fig = format_plot(fig, ylabel="Energy Consumption")
#fig.write_image("/content/drive/MyDrive/Books/Modern Time Series Forecasting/Chapter II/imgs/chapter_3/line_plot.png")
fig.show()

In [None]:
# Rolling average (window of 48*30 steps) for the second household.
# The title names MAC001768, matching the frame that is plotted.
fig = px.line(ts_df_MAC001768, y="rolling_monthly_avg", title="Rolling Monthly Average Energy Consumption for MAC001768")
fig = format_plot(fig, ylabel="Energy Consumption")
#fig.write_image("imgs/chapter_3/rolling_avg_line_plot.png")
fig.show()

### Multivariate Line Chart

In [None]:
# Energy consumption and temperature for MAC001768 on one chart with a secondary y axis.
fig = plotting_utils.two_line_plot_secondary_axis(
    x=ts_df_MAC001768.index,
    y1=ts_df_MAC001768.energy_consumption,
    y2=ts_df_MAC001768.temperature,
    y1_name="Energy Consumption",
    y2_name="Temperature",
    title="Temperature and Energy Consumption",
    # greyscale=True,
    dash_secondary=False,
)
# Horizontal legend above the plot and larger axis fonts.
# `title_font` replaces the deprecated `titlefont`, which newer Plotly releases reject.
fig.update_layout(
    legend=dict(
        font=dict(size=15),
        orientation="h",
        yanchor="bottom",
        y=0.98,
        xanchor="right",
        x=1,
    ),
    yaxis=dict(
        # title_text=ylabel,
        title_font=dict(size=15),
        tickfont=dict(size=15),
    ),
    xaxis=dict(
        # title_text=xlabel,
        title_font=dict(size=15),
        tickfont=dict(size=15),
    ),
)
#fig.write_image("imgs/chapter_3/temp_vs_energy.png")
fig.show()

In [None]:
# fig = plotting_utils.two_line_plot_secondary_axis(x=ts_df_MAC001768.index,
#                                    y1=ts_df_MAC001768.energy_consumption,
#                                    y2=ts_df_MAC000193.energy_consumption,
#                                    y1_name="Energy Consumption MAC001768",
#                                    y2_name="Energy Consumption MAC000193",
#                                    title="Temperature and Energy Consumption",
#                                    # greyscale=True,
#                                   dash_secondary=False)
# fig.update_layout(legend=dict(
#                 font=dict(size=15),
#                 orientation="h",
#                 yanchor="bottom",
#                 y=0.98,
#                 xanchor="right",
#                 x=1,
#             ),
#             yaxis=dict(
#                 # title_text=ylabel,
#                 titlefont=dict(size=15),
#                 tickfont=dict(size=15),
#             ),
#             xaxis=dict(
#                 # title_text=xlabel,
#                 titlefont=dict(size=15),
#                 tickfont=dict(size=15),
#             ))
# #fig.write_image("imgs/chapter_3/temp_vs_energy.png")
# fig.show()

### Multivariate Line Chart - Hourly Resolution

In [None]:
# Zoom into a few days of MAC000193 to compare consumption and temperature at full resolution.
zoom_window = slice("2012-03-16", "2012-03-18")
zoomed_df = ts_df_MAC000193[zoom_window]
fig = plotting_utils.two_line_plot_secondary_axis(
    x=zoomed_df.index,
    y1=zoomed_df.energy_consumption,
    y2=zoomed_df.temperature,
    y1_name="Energy Consumption",
    y2_name="Temperature",
    # Build the date range from zoom_window so the title cannot drift from the data shown.
    title=f"Temperature and Energy Consumption ({zoom_window.start} to {zoom_window.stop})",
    greyscale=False,
    dash_secondary=True,
)
# Horizontal legend above the plot and larger axis fonts.
# `title_font` replaces the deprecated `titlefont`, which newer Plotly releases reject.
fig.update_layout(
    legend=dict(
        font=dict(size=15),
        orientation="h",
        yanchor="bottom",
        y=0.98,
        xanchor="right",
        x=1,
    ),
    yaxis=dict(
        # title_text=ylabel,
        title_font=dict(size=15),
        tickfont=dict(size=15),
    ),
    xaxis=dict(
        # title_text=xlabel,
        title_font=dict(size=15),
        tickfont=dict(size=15),
    ),
)
#fig.write_image("imgs/chapter_3/temp_vs_energy_zoomed.png")
fig.show()

### Multivariate Line Chart - Monthly Resolution

In [None]:
# Average energy_consumption and temperature for each month.
# Years 2011 and 2014 are excluded (presumably because they are only partially covered — confirm).
# observed=False is stated explicitly because month_name is categorical: it keeps the current
# grouping behaviour, silences the pandas FutureWarning and matches the other groupby cells.
plot_df = (
    ts_df_MAC000193[~ts_df_MAC000193.year.isin([2011, 2014])]
    .groupby(["year", "month_name"], observed=False)[['energy_consumption', "temperature"]]
    .mean()
    .reset_index()
)
plot_df['year_month'] = plot_df["year"].astype(str) +" "+ plot_df['month_name'].astype(str).str.zfill(2)
# Two-level x axis: year and month name.
fig = plotting_utils.two_line_plot_secondary_axis(
    x=[plot_df.year, plot_df.month_name],
    y1=plot_df.energy_consumption,
    y2=plot_df.temperature,
    y1_name="Energy Consumption",
    y2_name="Temperature",
    title="Temperature and Energy Consumption - Monthly",
    dash_secondary=True,
)
# Horizontal legend above the plot and larger axis fonts.
# `title_font` replaces the deprecated `titlefont`, which newer Plotly releases reject.
fig.update_layout(
    legend=dict(
        font=dict(size=15),
        orientation="h",
        yanchor="bottom",
        y=0.98,
        xanchor="right",
        x=1,
    ),
    yaxis=dict(
        # title_text=ylabel,
        title_font=dict(size=15),
        tickfont=dict(size=15),
    ),
    xaxis=dict(
        # title_text=xlabel,
        title_font=dict(size=15),
        tickfont=dict(size=15),
    ),
)
#fig.write_image("imgs/chapter_3/temp_vs_energy_monthly.png")
fig.show()

## Seasonal Plots

### Annual Seasonality at Monthly Resolution

In [None]:
# Monthly average energy consumption and temperature per year (2011 and 2014 excluded).
# observed=False keeps every month category in the result, even if a year has no data for it.
plot_df = ts_df_MAC000193[~ts_df_MAC000193.year.isin([2011, 2014])].groupby(["year", "month_name"],observed=False)[['energy_consumption',"temperature"]].mean().reset_index()

In [None]:
# Seasonal plot: one line per year across the months of the year.
fig = px.line(plot_df, x="month_name", y='energy_consumption', color="year", line_dash="year", title="Seasonal Plot - Monthly")
fig = format_plot(fig, ylabel="Energy Consumption", xlabel="Month")
# Horizontal legend above the plot and larger axis fonts.
# `title_font` replaces the deprecated `titlefont`, which newer Plotly releases reject.
fig.update_layout(
    legend=dict(
        font=dict(size=15),
        orientation="h",
        yanchor="bottom",
        y=0.98,
        xanchor="right",
        x=1,
    ),
    yaxis=dict(
        # title_text=ylabel,
        title_font=dict(size=15),
        tickfont=dict(size=15),
    ),
    xaxis=dict(
        # title_text=xlabel,
        title_font=dict(size=15),
        tickfont=dict(size=15),
    ),
)
#fig.write_image("imgs/chapter_3/seasonal_plot_monthly.png")
fig.show()

In [None]:
# Multivariate seasonal plot built from the monthly averages in plot_df:
# energy_consumption as the primary series, temperature as the secondary, split by year.
# NOTE(review): multiple_line_plot_secondary_axis is a project helper; the effect of
# use_linetype/greyscale is inferred from the argument names — confirm in plotting_utils.
fig = plotting_utils.multiple_line_plot_secondary_axis(plot_df,
                                        x="month_name",
                                        primary='energy_consumption',
                                        secondary='temperature',
                                        color_or_linetype="year",
                                        title="Seasonal Plot Monthly: Multivariate",
                                       use_linetype=True,
                                       greyscale=False)
fig = format_plot(fig, ylabel="Energy Consumption", xlabel="Month")
# Re-position the legend after format_plot (y=0.9 here instead of format_plot's 0.98)
fig.update_layout(legend=dict(
                orientation="h",
                yanchor="bottom",
                y=0.9,
                xanchor="right",
                x=1,
            ))
#fig.write_image("imgs/chapter_3/seasonal_plot_monthly_mv.png")
fig.show()

## Daily Seasonality at Hourly Resolution

In [None]:
#Calculating the day-hour average
plot_df = ts_df_MAC000193.groupby(["day", "hour"])['energy_consumption'].mean().reset_index()

In [None]:
fig = px.line(plot_df[["energy_consumption", "hour", "day"]], y="energy_consumption", x="hour",color="day", title="Day of Month-Hourly Average Consumption")
fig = format_plot(fig, ylabel="Energy Consumption", xlabel="Hours", legends=None)
fig.update_layout(showlegend=False)
# plotting_utils.make_lines_greyscale(fig)
#fig.write_image("imgs/chapter_3/seasonal_plot_hourly.png")
fig.show()

## Seasonal Box Plots

In [None]:
plot_df =  ts_df_MAC000193.groupby(["date","weekday_name", "hour"],observed=False)["energy_consumption"].mean().reset_index().dropna()

In [None]:
fig = px.box(plot_df, y="energy_consumption", x="hour", log_y=True, title="Box Plot: Day of Month-Hourly Average")
fig = format_plot(fig, ylabel="Energy Consumption", xlabel="Hours", legends=None)
#fig.write_image("imgs/chapter_3/box_plot_hourly_avg.png")
fig.show()

In [None]:
fig = px.box(plot_df, y="energy_consumption", x="hour", facet_col="weekday_name", facet_col_wrap=2, log_y=True, category_orders={"weekday_name":ts_df_MAC000193.weekday_name.cat.categories.tolist()}, title="Box Plot: Hourly Averages for each Week Day")
fig = format_plot(fig, ylabel="Energy Consumption", xlabel="Hours", legends=None, figsize=(1200, 1500))
#fig.write_image("imgs/chapter_3/box_plot_hourly_weekday.png")
fig.show()

## Calendar Heatmap

In [None]:
plot_df = pd.pivot_table(ts_df_MAC000193, index="weekday_name", values='energy_consumption', columns="hour", aggfunc="mean")
# plot_df.index = "Day "+ plot_df.index.astype(str)

In [None]:
# Heatmap of mean energy consumption: week day (rows) vs hour (columns), from the pivot above.
# NOTE(review): format_plot sets the figure height from its figsize argument (default 500)
# after imshow, so the height=600 passed here is presumably overridden — confirm.
fig = px.imshow(plot_df, height=600, title="Energy Consumption: Hours vs Week Day")
fig = format_plot(fig, ylabel="Week Day", xlabel="Hours", legends=None)
#fig.write_image("imgs/chapter_3/hour_weekday_heatmap.png")
fig.show()

In [None]:
plot_df = pd.pivot_table(ts_df_MAC000193[~ts_df_MAC000193.year.isin([2011, 2014])], index="year", values='energy_consumption', columns="month_name", aggfunc="mean")
plot_df.index = "Y"+ plot_df.index.astype(str)

fig = px.imshow(plot_df, height=600, title="Energy Consumption: Months vs Year")
fig = format_plot(fig, ylabel="Year", xlabel="Months", legends=None)
#fig.write_image("imgs/chapter_3/month_year_heatmap.png")
fig.show()

In [None]:
# Mean of the apparentTemperature column per year (rows) and month (columns); 2011 and 2014 excluded.
# NOTE(review): the plot title says "Temperature" while the values are apparentTemperature —
# confirm which column is intended.
plot_df = pd.pivot_table(ts_df_MAC000193[~ts_df_MAC000193.year.isin([2011, 2014])], index="year", values='apparentTemperature', columns="month_name", aggfunc="mean")
# Prefix the year labels with "Y" so they become strings
plot_df.index = "Y"+ plot_df.index.astype(str)

fig = px.imshow(plot_df, height=600, title="Temperature: Months vs Year")
fig = format_plot(fig, ylabel="Year", xlabel="Months", legends=None)
#fig.write_image("imgs/chapter_3/month_year_heatmap_temperature.png")
fig.show()

# Autocorrelation Plots

In [None]:
# Decompose the MAC000193 consumption series with the project's MultiSeasonalDecomposition,
# configured with a fourier seasonal model, three seasonality periods and an additive model.
# Note: the variable is called `stl` but holds a MultiSeasonalDecomposition instance;
# the imported STL class is not used in this cell.
from src.decomposition.seasonal import MultiSeasonalDecomposition, STL
stl = MultiSeasonalDecomposition(seasonal_model="fourier",seasonality_periods=["day_of_year", "day_of_week", "hour"], model = "additive", n_fourier_terms=10)
# `res` is the fitted result; its `resid` attribute is plotted in the next cell.
res = stl.fit(ts_df_MAC000193.energy_consumption)

In [None]:
fig = plotting_utils.plot_autocorrelation(res.resid, vertical=True)
#fig.write_image("imgs/chapter_3/acf_pacf.png")
fig.show()