In [0]:
# Load libraries
import pandas as pd
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt
import plotly.graph_objects as go
from statsmodels.formula.api import ols

In [0]:
# Read the combined load / weather workbook into the working DataFrame.
# The path is relative to the notebook's working directory.
dataset_path = 'drive/My Drive/load_weather_dataset.xlsx'
data = pd.read_excel(dataset_path)

In [0]:
# Feature engineering: parse dates, add day-of-week, weekend and lockdown flags,
# shorten the column names, switch Load to natural log and build a time trend.
data['Date'] = pd.to_datetime(data['Date'])

# Monday=0 ... Sunday=6, so Day >= 5 marks Saturday and Sunday
data['Day'] = data['Date'].dt.weekday
data['Weekend'] = (data['Day'] >= 5).astype(int)

# NOTE(review): the strict '>' leaves 2020-03-18 itself flagged 0, while the
# seasonal-factor cell treats only dates strictly before 2020-03-18 as
# pre-lockdown - confirm which side that day belongs to.
data['Lockdown'] = (data['Date'] > dt.datetime(2020,3,18)).astype(int)

data = data.rename(columns={'Load Values England and Wales (MW)':'Load','HadCET min':'Temp_min','HadCET mean':'Temp_mean','HadCET max':'Temp_max'})

# 'Load' is overwritten with its log, so running this cell twice on the same
# frame would take the log of the log; reload the data before re-running.
data['Load'] = np.log(data['Load'])

# Elapsed time since the first row, expressed in 365-day years.
# .iloc[0] is positional, so this does not depend on the index containing label 0.
data['t'] = (data['Date'] - data['Date'].iloc[0]).dt.total_seconds() / (24*60*60*365)
data['t_2'] = data['t']**2

In [100]:
# Preview the first rows to check the columns added so far
data.head()

Unnamed: 0,Date,Temp_min,Temp_mean,Temp_max,Load,London min forecast,London mean forecast,London max forecast,Bristol min forecast,Bristol mean forecast,Bristol max forecast,Leeds min forecast,Leeds mean forecast,Leeds max forecast,Day,Weekend,Lockdown,t,t_2
0,2017-01-01,4.4,5.2,6.0,10.20655,,,,,,,,,,6,1,0,0.0,0.0
1,2017-01-02,-0.6,2.0,4.6,10.294451,,,,,,,,,,0,0,0,0.00274,8e-06
2,2017-01-03,-1.2,2.7,6.6,10.437069,,,,,,,,,,1,0,0,0.005479,3e-05
3,2017-01-04,2.2,4.9,7.6,10.452967,,,,,,,,,,2,0,0,0.008219,6.8e-05
4,2017-01-05,-3.2,1.2,5.6,10.494623,,,,,,,,,,3,0,0,0.010959,0.00012


In [0]:
# Smoothed versions of log load: a 7-row simple moving average plus
# exponentially weighted means at three smoothing factors.
log_load = data['Load']
data['Sma'] = log_load.rolling(window=7).mean()

ema_alphas = {'Ema_10': 0.1, 'Ema_50': 0.5, 'Ema_90': 0.9}
for ema_column, alpha in ema_alphas.items():
    data[ema_column] = log_load.ewm(alpha=alpha).mean()

# One-row lag of the alpha=0.9 series, used as a regressor later on
data['Ema_90_1'] = data['Ema_90'].shift(periods = 1)

In [102]:
# Day-of-week seasonal factors, estimated on pre-lockdown rows only.
# i_t = (mean Ema_90 for that weekday) / (overall mean Ema_90), both taken
# over rows dated strictly before LOCKDOWN_DATE.
LOCKDOWN_DATE = dt.datetime(2020,3,18)
pre_lockdown = data[data['Date'] < LOCKDOWN_DATE]

avg = pre_lockdown['Ema_90'].mean()

i_factors = (
    pre_lockdown[['Ema_90','Day']]
    .groupby(['Day'])
    .mean()
    .reset_index()
    .rename(columns={'Ema_90':'Avg_Ema_90'})
)
i_factors['i_t'] = i_factors['Avg_Ema_90'] / avg

# Map the factor back onto every row by weekday. Assigning through .map keeps
# the cell re-runnable: a repeated merge would instead produce i_t_x / i_t_y
# columns and leave no plain 'i_t' for the regression formulas.
data['i_t'] = data['Day'].map(i_factors.set_index('Day')['i_t'])
data.head()

Unnamed: 0,Date,Temp_min,Temp_mean,Temp_max,Load,London min forecast,London mean forecast,London max forecast,Bristol min forecast,Bristol mean forecast,Bristol max forecast,Leeds min forecast,Leeds mean forecast,Leeds max forecast,Day,Weekend,Lockdown,t,t_2,Sma,Ema_10,Ema_50,Ema_90,Ema_90_1,i_t
0,2017-01-01,4.4,5.2,6.0,10.20655,,,,,,,,,,6,1,0,0.0,0.0,,10.20655,10.20655,10.20655,,0.989598
1,2017-01-02,-0.6,2.0,4.6,10.294451,,,,,,,,,,0,0,0,0.00274,8e-06,,10.252814,10.265151,10.28646,10.20655,1.001193
2,2017-01-03,-1.2,2.7,6.6,10.437069,,,,,,,,,,1,0,0,0.005479,3e-05,,10.320804,10.363389,10.422143,10.28646,1.004351
3,2017-01-04,2.2,4.9,7.6,10.452967,,,,,,,,,,2,0,0,0.008219,6.8e-05,,10.359235,10.411164,10.449887,10.422143,1.004725
4,2017-01-05,-3.2,1.2,5.6,10.494623,,,,,,,,,,3,0,0,0.010959,0.00012,,10.392296,10.45424,10.49015,10.449887,1.004335


In [125]:
# Seasonal factor for Day == 0 (Monday), selected in a single .loc step
i_factors.loc[i_factors['Day'] == 0, 'i_t']

0    1.001193
Name: i_t, dtype: float64

In [128]:
# Line chart of log load together with its exponentially smoothed versions.
cutoff = dt.datetime(2000,1,1)
plt_df = data[data['Date']>=cutoff]

# (column, legend label) for each line, in drawing order.
# Ema_50 is deliberately left out; add ('Ema_50', 'Exp 0.5') to plot it as well.
history_lines = [
    ('Load', 'Log load'),
    ('Ema_10', 'Smoothed Exp 0.1'),
    ('Ema_90', 'Smoothed Exp 0.9'),
]

fig = go.Figure()
for series_column, legend_label in history_lines:
    fig.add_trace(go.Scatter(x=plt_df['Date'], y=plt_df[series_column],
                             mode='lines',
                             name=legend_label))

fig.update_layout(
    title="Log load history",
    xaxis_title="Time",
    yaxis_title="Log load"
)
fig.show()

In [132]:
# Mean temperature against smoothed log load, one marker per row,
# coloured by the Weekend flag and labelled with the date on hover.
temp_vs_load = go.Scatter(
    x=data['Temp_mean'],
    y=data['Ema_90'],
    mode='markers',
    text=data['Date'],
    marker=dict(color=data['Weekend']),
    name='Log load',
)

fig = go.Figure(data=[temp_vs_load])
fig.update_layout(
    title="Load vs mean temperature (weekends in yellow)",
    xaxis_title="Mean temperature",
    yaxis_title="Smoothed log load (exp 0.9)"
)

fig.show()

In [107]:
# Baseline OLS on log load with every candidate regressor:
#   log x_t = B_0 + B_1 * t + B_2 * t^2 + B_3 * i_t + B_4 * temp + B_5 * s_t-1 + lockdown + e_t
# where s_t-1 is the lagged smoothed series (Ema_90_1).
formula_0 = 'Load ~ t + t_2 + i_t + Temp_mean + Ema_90_1 + Lockdown'

model_0 = ols(formula_0, data=data).fit()
print(model_0.summary())

                            OLS Regression Results                            
Dep. Variable:                   Load   R-squared:                       0.835
Model:                            OLS   Adj. R-squared:                  0.834
Method:                 Least Squares   F-statistic:                     1034.
Date:                Sun, 24 May 2020   Prob (F-statistic):               0.00
Time:                        09:48:25   Log-Likelihood:                 1617.2
No. Observations:                1236   AIC:                            -3220.
Df Residuals:                    1229   BIC:                            -3185.
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     -2.7991      0.351     -7.977      0.0

In [108]:
# Reduced model: the time-trend terms (t, t_2) were judged not significant in
# the previous fit and are dropped.
#   log x_t = B_0 + B_3 * i_t + B_4 * temp + B_5 * s_t-1 + lockdown + e_t
formula_1 = 'Load ~ i_t + Temp_mean + Ema_90_1 + Lockdown'

model_1 = ols(formula_1, data=data).fit()
print(model_1.summary())

                            OLS Regression Results                            
Dep. Variable:                   Load   R-squared:                       0.832
Model:                            OLS   Adj. R-squared:                  0.832
Method:                 Least Squares   F-statistic:                     1528.
Date:                Sun, 24 May 2020   Prob (F-statistic):               0.00
Time:                        09:52:23   Log-Likelihood:                 1608.6
No. Observations:                1236   AIC:                            -3207.
Df Residuals:                    1231   BIC:                            -3182.
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     -2.9046      0.352     -8.250      0.0

In [0]:
# In-sample fitted values from the reduced model, stored next to the observations
in_sample_pred = model_1.predict(data)
data['in_forecast'] = in_sample_pred

In [114]:
# Observed log load against the in-sample forecast, from 2019 onwards
cutoff = dt.datetime(2019,1,1)
plt_df = data.loc[data['Date'] >= cutoff]

fig = go.Figure()
for series_column, legend_label in (('Load', 'Log load'), ('in_forecast', 'Forecast')):
    fig.add_trace(go.Scatter(x=plt_df['Date'], y=plt_df[series_column],
                             mode='lines',
                             name=legend_label))
fig.show()

In [134]:
# Observed vs in-sample forecast, one marker per row coloured by the Weekend
# flag, with a thin gray y = x reference drawn through the observed values.
observed = data['Load']

forecast_points = go.Scatter(
    x=observed,
    y=data['in_forecast'],
    mode='markers',
    text=data['Date'],
    marker=dict(size=5, opacity=0.5, color=data['Weekend']),
    name='Log load',
)
identity_points = go.Scatter(
    x=observed,
    y=observed,
    mode='markers',
    marker=dict(size=2, color='Gray'),
    name='y = x',
)

fig = go.Figure()
fig.add_trace(forecast_points)
fig.add_trace(identity_points)

fig.update_layout(
    title="In-sample forecast vs observed",
    xaxis_title="Observed log load",
    yaxis_title="Forecast log load"
)

fig.show()

In [127]:
# Single-point forecast for a hand-built scenario: the Day == 0 (Monday)
# seasonal factor, mean temperature 20, lagged smoothed log load 10, lockdown on.
monday_factor = i_factors[i_factors['Day']==0]['i_t']
scenario = {'i_t': monday_factor, 'Temp_mean': 20, 'Ema_90_1': 10, 'Lockdown': 1}
model_1.predict(scenario)[0]

9.85517013560906