# Modeling

In [79]:
# Ignoring warning messages from python
import warnings
warnings.filterwarnings('ignore')

# General use imports
import pandas as pd
import numpy as np
from datetime import date
from datetime import time
from datetime import datetime
from datetime import timedelta

# Useful modules imports
import wrangle

# Visualization imports
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import plotly
import plotly.express as px
from plotly.subplots import make_subplots
from IPython.display import display_html
from itertools import chain,cycle

# Datetime, stats and modeling tools
import statsmodels.api as sm
from math import sqrt
from statsmodels.tsa.stattools import acf
from datetime import timedelta, datetime
from statsmodels.tsa.seasonal import seasonal_decompose 
from statsmodels.tsa.holtwinters import SimpleExpSmoothing   
from statsmodels.tsa.holtwinters import ExponentialSmoothing
from statsmodels.tsa.seasonal import DecomposeResult, seasonal_decompose

In [2]:
# Acquiring the combined ERCOT load + weather data through the project's wrangle module
# NOTE(review): get_central=True presumably selects which weather region gets merged in — confirm in wrangle.py
combined_df = wrangle.get_combined_df(get_central = True)
# Previewing the first two rows to confirm the load worked and to see the column layout
combined_df.head(2)

Unnamed: 0_level_0,ercot_load,dow,is_weekday,is_obs_holiday,hs_temp,hs_feelslike,hs_dew,hs_humidity,hs_precip,hs_windgust,...,vc_precip,vc_windgust,vc_windspeed,vc_winddir,vc_sealevelpressure,vc_cloudcover,vc_visibility,vc_solarradiation,vc_solarenergy,vc_uvindex
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2010-01-01 00:00:00-06:00,7931.2419,Friday,1,1,47.3,41.2,43.2,85.37,0.0,27.5,...,0.0,24.2,16.1,360.0,1027.7,98.7,9.9,0.0,,0
2010-01-01 01:00:00-06:00,7775.456846,Friday,1,1,46.8,39.9,41.4,81.38,0.0,24.2,...,0.0,26.5,15.1,350.0,1028.1,98.6,9.9,0.0,,0


>### Examining the data for any impediments to modeling

In [8]:
# Building an hourly resampler from the raw frame.
# `df` is only created further down (by the hourly resample of combined_df), so
# referring to it here raised a NameError on a fresh Restart & Run All.
df1 = combined_df.resample('H')

<pandas.core.resample.DatetimeIndexResampler object at 0x7fda883bc790>

In [3]:
# Summarizing dtypes and non-null counts to spot columns with missing values
combined_df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 109535 entries, 2010-01-01 00:00:00-06:00 to 2022-06-30 23:00:00-05:00
Data columns (total 60 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   ercot_load           109535 non-null  float64
 1   dow                  109535 non-null  object 
 2   is_weekday           109535 non-null  int64  
 3   is_obs_holiday       109535 non-null  int64  
 4   hs_temp              109535 non-null  float64
 5   hs_feelslike         109535 non-null  float64
 6   hs_dew               109535 non-null  float64
 7   hs_humidity          109535 non-null  float64
 8   hs_precip            109535 non-null  float64
 9   hs_windgust          28674 non-null   float64
 10  hs_windspeed         109535 non-null  float64
 11  hs_winddir           109535 non-null  float64
 12  hs_sealevelpressure  109435 non-null  float64
 13  hs_cloudcover        109535 non-null  float64
 14  hs_visibility        1

In [4]:
# Inspecting the DatetimeIndex; its freq attribute shows whether pandas detected a regular frequency
combined_df.index

DatetimeIndex(['2010-01-01 00:00:00-06:00', '2010-01-01 01:00:00-06:00',
               '2010-01-01 02:00:00-06:00', '2010-01-01 03:00:00-06:00',
               '2010-01-01 04:00:00-06:00', '2010-01-01 05:00:00-06:00',
               '2010-01-01 06:00:00-06:00', '2010-01-01 07:00:00-06:00',
               '2010-01-01 08:00:00-06:00', '2010-01-01 09:00:00-06:00',
               ...
               '2022-06-30 14:00:00-05:00', '2022-06-30 15:00:00-05:00',
               '2022-06-30 16:00:00-05:00', '2022-06-30 17:00:00-05:00',
               '2022-06-30 18:00:00-05:00', '2022-06-30 19:00:00-05:00',
               '2022-06-30 20:00:00-05:00', '2022-06-30 21:00:00-05:00',
               '2022-06-30 22:00:00-05:00', '2022-06-30 23:00:00-05:00'],
              dtype='datetime64[ns, US/Central]', name='datetime', length=109535, freq=None)

>#### Checking the index shows that pandas does not recognize a frequency for the data, although we know that it is hourly
>#### I will resample to get the right frequency

In [5]:
# Resampling to hourly so that the index carries an explicit frequency.
# numeric_only=True makes the drop of non-numeric columns (e.g. 'dow') explicit:
# pandas >= 2.0 no longer discards them silently and raises a TypeError instead.
df = combined_df.resample('H').mean(numeric_only=True)

In [6]:
# After resampling, the index should now report an hourly frequency (freq='H')
df.index

DatetimeIndex(['2010-01-01 00:00:00-06:00', '2010-01-01 01:00:00-06:00',
               '2010-01-01 02:00:00-06:00', '2010-01-01 03:00:00-06:00',
               '2010-01-01 04:00:00-06:00', '2010-01-01 05:00:00-06:00',
               '2010-01-01 06:00:00-06:00', '2010-01-01 07:00:00-06:00',
               '2010-01-01 08:00:00-06:00', '2010-01-01 09:00:00-06:00',
               ...
               '2022-06-30 14:00:00-05:00', '2022-06-30 15:00:00-05:00',
               '2022-06-30 16:00:00-05:00', '2022-06-30 17:00:00-05:00',
               '2022-06-30 18:00:00-05:00', '2022-06-30 19:00:00-05:00',
               '2022-06-30 20:00:00-05:00', '2022-06-30 21:00:00-05:00',
               '2022-06-30 22:00:00-05:00', '2022-06-30 23:00:00-05:00'],
              dtype='datetime64[ns, US/Central]', name='datetime', length=109535, freq='H')

>#### Resampling removes non-numeric data, so I'll check the difference between the two dataframes to make sure we are not missing anything important

In [7]:
# Summarizing the resampled frame to compare its columns and dtypes with the original
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 109535 entries, 2010-01-01 00:00:00-06:00 to 2022-06-30 23:00:00-05:00
Freq: H
Data columns (total 59 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   ercot_load           109535 non-null  float64
 1   is_weekday           109535 non-null  float64
 2   is_obs_holiday       109535 non-null  float64
 3   hs_temp              109535 non-null  float64
 4   hs_feelslike         109535 non-null  float64
 5   hs_dew               109535 non-null  float64
 6   hs_humidity          109535 non-null  float64
 7   hs_precip            109535 non-null  float64
 8   hs_windgust          28674 non-null   float64
 9   hs_windspeed         109535 non-null  float64
 10  hs_winddir           109535 non-null  float64
 11  hs_sealevelpressure  109435 non-null  float64
 12  hs_cloudcover        109535 non-null  float64
 13  hs_visibility        109535 non-null  float64
 14  hs_solarradiat

In [11]:
# Column labels of the original (pre-resample) frame
columns1 = combined_df.columns

In [9]:
# Column labels of the resampled frame
columns2 = df.columns

In [12]:
# Columns present in the original frame but absent after resampling
# (set difference, so the order of the result is not guaranteed)
diff = list(set(columns1)-set(columns2))
print(diff)

['dow']


In [17]:
# Recording the installed plotly version for reproducibility of the figures below
plotly.__version__

'5.9.0'

>### Planning modeling  
        * Baseline  
            - Use simple average of the last day
            - Could I get the moving averages of the day and take the average of all the moving averages?
        * Holt-Winters model application  
            - 

## 1. Setting up the baseline

In [24]:
# Looking at the last observation to see where the dataset ends
df.tail(1)

Unnamed: 0_level_0,ercot_load,is_weekday,is_obs_holiday,hs_temp,hs_feelslike,hs_dew,hs_humidity,hs_precip,hs_windgust,hs_windspeed,...,vc_precip,vc_windgust,vc_windspeed,vc_winddir,vc_sealevelpressure,vc_cloudcover,vc_visibility,vc_solarradiation,vc_solarenergy,vc_uvindex
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2022-06-30 23:00:00-05:00,14065.386852,1.0,0.0,77.2,77.2,73.4,88.06,0.0,10.5,10.0,...,0.0,13.9,7.0,79.0,1014.0,0.4,9.9,0.0,,0.0


In [23]:
# Checking the time span and frequency of the index before picking the baseline window
df.index

DatetimeIndex(['2010-01-01 00:00:00-06:00', '2010-01-01 01:00:00-06:00',
               '2010-01-01 02:00:00-06:00', '2010-01-01 03:00:00-06:00',
               '2010-01-01 04:00:00-06:00', '2010-01-01 05:00:00-06:00',
               '2010-01-01 06:00:00-06:00', '2010-01-01 07:00:00-06:00',
               '2010-01-01 08:00:00-06:00', '2010-01-01 09:00:00-06:00',
               ...
               '2022-06-30 14:00:00-05:00', '2022-06-30 15:00:00-05:00',
               '2022-06-30 16:00:00-05:00', '2022-06-30 17:00:00-05:00',
               '2022-06-30 18:00:00-05:00', '2022-06-30 19:00:00-05:00',
               '2022-06-30 20:00:00-05:00', '2022-06-30 21:00:00-05:00',
               '2022-06-30 22:00:00-05:00', '2022-06-30 23:00:00-05:00'],
              dtype='datetime64[ns, US/Central]', name='datetime', length=109535, freq='H')

In [44]:
# Baseline = mean hourly load over the final calendar day in the data (June 30, 2022)
last_day = df.loc['2022-06-30 00:00:00-05:00':'2022-06-30 23:00:00-05:00']
Baseline = last_day['ercot_load'].mean()

# The ANSI escape codes render the message in bold blue
print(f"\033[94m \033[1m The baseline model has an average consumption demand of: {Baseline:.2f} Megawatt \033[0m")

[94m [1m The baseline model has an average consumption demand of: 14233.84 Megawatt [0m


>### Attempt at an average of moving averages as a baseline

## 2. Holt-Winters setup

>### Splitting the data  
        - I'll split the data in two: Train (2010 - 2017) and Test (2018 - 2022). I'll use blocked cross-validation on the train set  
        - To use Holt-Winters, I need to make sure the data has a level, a trend and some seasonality  
        - 

>#### Checking for seasonality, trend and level through graphs

In [48]:
# Looking at the first observation to see where the dataset starts
df.head(1)

Unnamed: 0_level_0,ercot_load,is_weekday,is_obs_holiday,hs_temp,hs_feelslike,hs_dew,hs_humidity,hs_precip,hs_windgust,hs_windspeed,...,vc_precip,vc_windgust,vc_windspeed,vc_winddir,vc_sealevelpressure,vc_cloudcover,vc_visibility,vc_solarradiation,vc_solarenergy,vc_uvindex
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2010-01-01 00:00:00-06:00,7931.2419,1.0,1.0,47.3,41.2,43.2,85.37,0.0,27.5,15.1,...,0.0,24.2,16.1,360.0,1027.7,98.7,9.9,0.0,,0.0


In [None]:
def display_side_by_side(*args,titles=cycle([''])):
    '''
    Render several DataFrames next to each other in the notebook output.

    Parameters
    ----------
    *args : objects exposing ``to_html()`` (e.g. pandas DataFrames)
        The frames to show, from left to right.
    titles : iterable of str, optional
        One heading per frame. If it runs out before the frames do, the
        remaining frames get a '</br>' placeholder heading.

    Returns
    -------
    None
        The assembled HTML is sent to the notebook display.
    '''
    html_str=''
    for frame,title in zip(args, chain(titles,cycle(['</br>'])) ):
        html_str+='<th style="text-align:center"><td style="vertical-align:top">'
        html_str+=f'<h2>{title}</h2>'
        # Style only the opening tag: replacing the bare word 'table' would also
        # rewrite '</table>' and any cell or column text that contains "table".
        html_str+=frame.to_html().replace('<table','<table style="display:inline"')
        html_str+='</td></th>'
    # Without this call the markup was built and then silently discarded
    display_html(html_str,raw=True)

In [None]:
def display_cohort_traffic(freq_df):
    '''
    Show the first five and the last five rows of freq_df side by side.

    NOTE(review): the headings assume freq_df is already sorted by traffic in
    descending order, so that head/tail are the top/bottom cohorts — confirm
    at the call site.
    '''
    display_side_by_side(freq_df[:5], freq_df[-5:], titles = ['Top 5 Cohorts by Traffic', 'Bottom 5 Cohorts by Traffic'])

In [None]:
def top_bot_5(intro_freq):
    '''
    Print the share of total traffic held by the first five and last five rows.

    Parameters
    ----------
    intro_freq : pandas.DataFrame
        Must have a numeric 'count' column. The first/last five rows are taken
        positionally, so the frame is expected to be ordered already.
        NOTE(review): presumably sorted by 'count' descending — confirm at the caller.

    Returns
    -------
    None
        The summary sentence is printed.
    '''
    total = intro_freq['count'].sum()
    top_5 = round(intro_freq['count'][:5].sum() / total * 100, 1)
    bot_5 = round(intro_freq['count'][-5:].sum() / total * 100, 1)
    # Adjacent f-strings are joined by the parser; the previous backslash
    # continuation inside the literal leaked the next line's indentation
    # into the printed sentence.
    print(
        f'The top 5 cohorts account for {top_5}% of traffic to the most popular lesson, '
        f'while the bottom 5 cohorts account for {bot_5}% of traffic.'
    )

In [63]:
# Previewing the load series through the end of 2017 (the span planned for the train set)
df[:'2017'].ercot_load

datetime
2010-01-01 00:00:00-06:00     7931.241900
2010-01-01 01:00:00-06:00     7775.456846
2010-01-01 02:00:00-06:00     7704.815982
2010-01-01 03:00:00-06:00     7650.575724
2010-01-01 04:00:00-06:00     7666.708317
                                 ...     
2017-12-31 19:00:00-06:00    12061.549401
2017-12-31 20:00:00-06:00    12015.663549
2017-12-31 21:00:00-06:00    11883.114122
2017-12-31 22:00:00-06:00    11754.250889
2017-12-31 23:00:00-06:00    11579.853459
Freq: H, Name: ercot_load, Length: 70128, dtype: float64

In [72]:
# Splitting the data chronologically: train = 2010 through 2017, test = 2018 onward.
# .loc makes the label-based (partial date string) row slicing explicit.

train = df.loc[:'2017', 'ercot_load']
test = df.loc['2018':, 'ercot_load']

# The two slices must cover the whole frame with no gap and no overlap
assert len(train) + len(test) == len(df)

print(f'The Train dataset:\n {train.head()}')
print('\n')
print(f'The Test dataset:\n {test.head()}')

The Train dtatset:
 datetime
2010-01-01 00:00:00-06:00    7931.241900
2010-01-01 01:00:00-06:00    7775.456846
2010-01-01 02:00:00-06:00    7704.815982
2010-01-01 03:00:00-06:00    7650.575724
2010-01-01 04:00:00-06:00    7666.708317
Freq: H, Name: ercot_load, dtype: float64


The Test dataset:
 datetime
2018-01-01 00:00:00-06:00    11452.163689
2018-01-01 01:00:00-06:00    11425.979115
2018-01-01 02:00:00-06:00    11408.418023
2018-01-01 03:00:00-06:00    11405.198365
2018-01-01 04:00:00-06:00    11450.560138
Freq: H, Name: ercot_load, dtype: float64


In [76]:
# Rich display renders each frame as its own table instead of one concatenated text dump
display(train.to_frame(), test.to_frame())

                             ercot_load
datetime                               
2010-01-01 00:00:00-06:00   7931.241900
2010-01-01 01:00:00-06:00   7775.456846
2010-01-01 02:00:00-06:00   7704.815982
2010-01-01 03:00:00-06:00   7650.575724
2010-01-01 04:00:00-06:00   7666.708317
...                                 ...
2017-12-31 19:00:00-06:00  12061.549401
2017-12-31 20:00:00-06:00  12015.663549
2017-12-31 21:00:00-06:00  11883.114122
2017-12-31 22:00:00-06:00  11754.250889
2017-12-31 23:00:00-06:00  11579.853459

[70128 rows x 1 columns]                              ercot_load
datetime                               
2018-01-01 00:00:00-06:00  11452.163689
2018-01-01 01:00:00-06:00  11425.979115
2018-01-01 02:00:00-06:00  11408.418023
2018-01-01 03:00:00-06:00  11405.198365
2018-01-01 04:00:00-06:00  11450.560138
...                                 ...
2022-06-30 19:00:00-05:00  15040.841510
2022-06-30 20:00:00-05:00  14700.132848
2022-06-30 21:00:00-05:00  14637.633680
2022-06-30 22:

In [68]:
# Confirming that selecting a single column left us with a pandas Series
type(train)

pandas.core.series.Series

In [80]:
def display_side_by_side(*args,titles=cycle([''])):
    '''
    Render several DataFrames next to each other in the notebook output.

    Parameters
    ----------
    *args : objects exposing ``to_html()`` (e.g. pandas DataFrames)
        The frames to show, from left to right.
    titles : iterable of str, optional
        One centered heading per frame. If it runs out before the frames do,
        the remaining frames get a '</br>' placeholder heading.

    Returns
    -------
    None
        The assembled HTML is sent to the notebook display.
    '''
    html_str=''
    for frame,title in zip(args, chain(titles,cycle(['</br>'])) ):
        html_str+='<th style="text-align:center"><td style="vertical-align:top">'
        html_str+=f'<h2 style="text-align: center;">{title}</h2>'
        # Style only the opening tag: replacing the bare word 'table' would also
        # rewrite '</table>' and any cell or column text that contains "table".
        html_str+=frame.to_html().replace('<table','<table style="display:inline"')
        html_str+='</td></th>'
    display_html(html_str,raw=True)

In [None]:
# Plotting the split data
# train and test are Series indexed by datetime, so the index supplies the x-axis.
# (The earlier mean_train / mean_test / sale_date / sales_total names do not exist in this notebook.)

fig = make_subplots(rows=1, cols=2, subplot_titles=['Train', 'Test'])
fig.add_trace(
    go.Scatter(x=train.index, y=train.values, name='Train'),
    row=1, col=1
)
fig.add_trace(
    go.Scatter(x=test.index, y=test.values, name='Test'),
    row=1, col=2
)
fig.update_layout(height=500, width=700, title_text="Side By Side Subplots of Train and Test")
fig.show()

In [None]:
# Weekly mean of the train load as a two-column frame (datetime, ercot_load).
# train is already a Series, so there is no column to select after resample().
# NOTE(review): the name says "monthly" but the rule is 'W' (weekly); the name is
# kept in case later cells reference it — confirm which frequency is intended.
df_monthly = train.resample('W').mean().to_frame().reset_index()

In [47]:
def plot_seasonal_decomposition(result:DecomposeResult, dates:pd.Series=None, title:str='Seasonal Decomposition'):
    '''
    Plot the four components of a seasonal decomposition as stacked subplots.

    Parameters
    ----------
    result : DecomposeResult
        Output of statsmodels' seasonal_decompose (moving-average based,
        'additive' or 'multiplicative', with a chosen period). Its observed,
        trend, seasonal and resid attributes are plotted in that order.
    dates : pd.Series, optional
        x-axis values, one per observation. When omitted,
        np.arange(len(result.observed)) is used instead.
    title : str
        Figure title (rendered in bold).

    Returns
    -------
    The plotly figure, so the caller can chain further updates or call .show().
    '''
    x_values = dates if dates is not None else np.arange(len(result.observed))

    # (trace name, values, draw mode) in subplot order; the residuals are drawn
    # as markers because they are not expected to form a continuous line.
    components = [
        ('Observed', result.observed, 'lines'),
        ('Trend', result.trend, 'lines'),
        ('Seasonal', result.seasonal, 'lines'),
        ('Residual', result.resid, 'markers'),
    ]

    fig = make_subplots(
        rows=len(components),
        cols=1,
        subplot_titles=['Observed', 'Trend', 'Seasonal', 'Residuals'],
    )
    for row, (name, values, mode) in enumerate(components, start=1):
        fig.add_trace(
            go.Scatter(x=x_values, y=values, mode=mode, name=name),
            row=row,
            col=1,
        )

    return fig.update_layout(
        height=900, title=f'<b>{title}</b>', margin={'t':100}, title_x=0.5, showlegend=False
    )

In [None]:
# Plotting the seasonal decomposition of the load resampled by week, with a period of 25 weeks.
# The series is resampled here so that the decomposed values and the x-axis dates
# have the same (weekly) length; the earlier mean_train_week frame is not defined in this notebook.

weekly_load = df.ercot_load.resample('W').mean()
decomposition = seasonal_decompose(weekly_load, model='additive', period=25)
fig = plot_seasonal_decomposition(decomposition, dates=weekly_load.index)
fig.show()

In [56]:
# NOTE(review): leftover exploration — lists every builtin name and is not used by the analysis;
# consider removing this cell before sharing the notebook
dir(__builtins__)

['ArithmeticError',
 'AssertionError',
 'AttributeError',
 'BaseException',
 'BlockingIOError',
 'BrokenPipeError',
 'BufferError',
 'ChildProcessError',
 'ConnectionAbortedError',
 'ConnectionError',
 'ConnectionRefusedError',
 'ConnectionResetError',
 'EOFError',
 'Ellipsis',
 'EnvironmentError',
 'Exception',
 'False',
 'FileExistsError',
 'FileNotFoundError',
 'FloatingPointError',
 'GeneratorExit',
 'IOError',
 'ImportError',
 'IndentationError',
 'IndexError',
 'InterruptedError',
 'IsADirectoryError',
 'KeyError',
 'KeyboardInterrupt',
 'LookupError',
 'MemoryError',
 'ModuleNotFoundError',
 'NameError',
 'None',
 'NotADirectoryError',
 'NotImplemented',
 'NotImplementedError',
 'OSError',
 'OverflowError',
 'PermissionError',
 'ProcessLookupError',
 'RecursionError',
 'ReferenceError',
 'RuntimeError',
 'StopAsyncIteration',
 'StopIteration',
 'SyntaxError',
 'SystemError',
 'SystemExit',
 'TabError',
 'TimeoutError',
 'True',
 'TypeError',
 'UnboundLocalError',
 'UnicodeDecode