#              Lecture 16                    
                                            
## Introduction to Time-Series Analysis      
  - time-series data manipulations        
  - data explorations:                    
      - descriptive + graphs              
      - auto-correlation                  
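  - model with OLS (statsmodels `ols`; the counterpart of `feols` in R's fixest)
      - set up the time ordering (sort by date; the counterpart of `panel.id` in fixest)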
      - Newey-West standard errors        
      - lagged variables                  
      - cumulative effects w SEs          
                                            
#### Case Study:                                 
  - Arizona Electricity Consumption           
---  

Import packages

In [None]:
import warnings
from datetime import datetime

import numpy as np
import pandas as pd
import statsmodels.formula.api as smf
from mizani.breaks import date_breaks
from mizani.formatters import date_format
from plotnine import *
from skimpy import skim
from stargazer.stargazer import Stargazer

# Jupyter magic: render matplotlib figures inline in the notebook output.
%matplotlib inline
# Silence every warning for the rest of the session. This keeps the output
# clean, but it also hides deprecation and pandas copy warnings.
warnings.filterwarnings("ignore")

Import data

- 1st source: climate data  (cooling degree days etc, by month)

In [None]:
# Download the monthly climate records and preview the first five rows.
climate = pd.read_csv(filepath_or_buffer="https://osf.io/g3tj7/download")
climate.head(5)

Convert the `DATE` column into a datetime: its `YYYY-MM` format is straightforward to parse.

In [None]:
# Parse the "YYYY-MM" strings in DATE; each value becomes the first day of its month.
climate["tempdate"] = climate["DATE"].map(
    lambda stamp: datetime.strptime(stamp, "%Y-%m")
)

Add the time variables: year, month, and a year-month date (`ym`) that is later used as the merge key

In [None]:
# Derive the calendar year, the calendar month, and the year-month key `ym`
# (a plain copy of the parsed date) in a single step.
climate = climate.assign(
    year=lambda d: d["tempdate"].dt.year,
    month=lambda d: d["tempdate"].dt.month,
    ym=lambda d: d["tempdate"],
)

### Data manipulation with time-series data:
 1) Generate averages from sums:
     when dividing by N, must take into account N of days

In [None]:
# Number of calendar days in each observed month, read from the parsed date.
# This gives February 29 days in leap years; a fixed 28 would divide the
# monthly sums by too few days and overstate the daily averages there.
climate["ndays"] = climate["tempdate"].dt.days_in_month

 Focus on cooling degree, heating degree 

In [None]:
# Turn the monthly degree-day sums into per-day averages.
climate["CLDD_avg"] = climate["CLDD"].div(climate["ndays"])
climate["HTDD_avg"] = climate["HTDD"].div(climate["ndays"])

Drop others

In [None]:
# Discard the raw date columns, the station identifiers and the unused day-count variables.
climate = climate.drop(
    columns=["DATE", "tempdate", "STATION", "NAME", "DX32", "DX70", "DX90"]
)

Check the descriptive statistics

In [None]:
# Summary statistics of the two daily-average series, one row per variable.
climate.filter(items=["CLDD_avg", "HTDD_avg"]).describe().transpose()

In [None]:
# Same two series through skimpy's richer summary.
skim(climate.filter(items=["CLDD_avg", "HTDD_avg"]))

- 2nd source: the electricity consumption data (monthly data)

In [None]:
# Download the monthly electricity consumption data and preview the first five rows.
electricity = pd.read_csv(filepath_or_buffer="https://osf.io/wbef4/download")
electricity.head(5)

Convert 'MY' variable into numeric date

In [None]:
# Parse MY ("<abbreviated month>-<two-digit year>") into a datetime.
# NOTE(review): %b follows the active locale's month abbreviations; this
# presumably expects English names — confirm when running under another locale.
electricity["date"] = electricity["MY"].map(
    lambda label: datetime.strptime(label, "%b-%y")
)

We can create different time variables:\
year -> the actual year\
month -> the actual month\
ym -> the year-month date (here a copy of `date`), used as the key when merging with the climate data

In [None]:
# Derive the calendar year, the calendar month, and the year-month key `ym`
# (a plain copy of the parsed date) in a single step.
electricity = electricity.assign(
    year=lambda d: d["date"].dt.year,
    month=lambda d: d["date"].dt.month,
    ym=lambda d: d["date"],
)

Remove MY, year and month variables

In [None]:
# Drop the raw label plus year/month: the climate table already carries
# year and month, so keeping them here would duplicate columns in the merge.
electricity = electricity.drop(columns=["MY", "year", "month"])

Take logs of q (used electricity)

In [None]:
# Natural log of consumed electricity.
electricity = electricity.assign(lnQ=lambda d: np.log(d["Q"]))

__Merging the two data__

In [None]:
# Keep only the months present in both sources, matched on the year-month key.
df = pd.merge(climate, electricity, how="inner", on="ym")

Restrict the sample between years 2001 and 2017

In [None]:
# Keep 2001-2017 (both ends inclusive) and put the rows in chronological order
# right away: the autocorrelation plots and every positional shift / first
# difference computed afterwards assume time-ordered rows. sort_values and
# reset_index also return a fresh frame, so adding columns later does not
# write into a slice of the merged table.
df = (
    df.loc[df["year"].between(2001, 2017)]
    .sort_values(by="ym")
    .reset_index(drop=True)
)

### Data exploration

In [None]:
# Summary statistics of the outcome (level and log) and the two temperature series.
df.filter(items=["Q", "lnQ", "CLDD_avg", "HTDD_avg"]).describe().transpose()

In [None]:
# Same four variables through skimpy's richer summary.
skim(df.filter(items=["Q", "lnQ", "CLDD_avg", "HTDD_avg"]))

__Plot the time series__

Consumption

In [None]:
# Shared x-axis settings, reused by the later time-series plots:
# the plotting window and a break generator with a three-year step.
limits = (datetime(2001, 1, 1), datetime(2018, 1, 1))
breaks = date_breaks("3 year")

# Monthly residential electricity consumption in levels.
(
    ggplot(data=df, mapping=aes(x="date", y="Q"))
    + geom_line(size=0.7, color="red")
    + labs(x="Date (month)", y="Residential electricity consumption (GWh)")
    + scale_x_date(labels=date_format("%b%Y"), breaks=breaks(limits))
    + scale_y_continuous(breaks=np.arange(1000, 5001, 1000), limits=(1000, 5000))
    + theme_bw()
)

Log-consumption

In [None]:
# Monthly residential electricity consumption in logs.
(
    ggplot(df, aes(x="date", y="lnQ"))
    + geom_line(color="red", size=0.7)
    + ylab("ln(residential electricity consumption, GWh)")
    + xlab("Date (month)")
    # arange excludes its stop value, so go one step past 8.5 to get a tick
    # at the upper axis limit as well.
    + scale_y_continuous(limits=(7, 8.5), breaks=np.arange(7, 8.5 + 0.25, 0.25))
    + scale_x_date(breaks=breaks(limits), labels=date_format("%b%Y"))
    + theme_bw()
)

 Cooling degrees

In [None]:
# Average daily cooling degrees per month (axis label spelling corrected).
(
    ggplot(df, aes(x="date", y="CLDD_avg"))
    + geom_line(color="red", size=0.7)
    + ylab("Cooling degrees (Fahrenheit)")
    + xlab("Date (month)")
    + scale_y_continuous(
        expand=(0.01, 0.01), limits=(0, 35), breaks=np.arange(0, 36, 5)
    )
    + scale_x_date(breaks=breaks(limits), labels=date_format("%b%Y"))
    + theme_bw()
)

Heating degrees

In [None]:
# Average daily heating degrees per month (axis label spelling corrected).
(
    ggplot(df, aes(x="date", y="HTDD_avg"))
    + geom_line(color="red", size=0.7)
    + ylab("Heating degrees (Fahrenheit)")
    + xlab("Date (month)")
    + scale_y_continuous(
        expand=(0.01, 0.01), limits=(0, 14), breaks=np.arange(0, 15, 2)
    )
    + scale_x_date(breaks=breaks(limits), labels=date_format("%b%Y"))
    + theme_bw()
)

Plot all of them together

In [None]:
# One panel per series: reshape to long format (date, variable, value) and
# facet on the variable name with independent y-axes.
(
    ggplot(
        data=pd.melt(
            df, id_vars="date", value_vars=["CLDD_avg", "HTDD_avg", "lnQ", "Q"]
        ),
        mapping=aes(x="date", y="value"),
    )
    + geom_line(size=0.7, color="red")
    + facet_wrap("~variable", scales="free_y")
    + labs(x="Date (month)")
    + scale_x_date(labels=date_format("%Y"), breaks=breaks(limits))
    + theme_bw()
    + theme(subplots_adjust={"wspace": 0.25})
)

<br>__Time-series specific analysis: auto-correlation__

In [None]:
from statsmodels.graphics.tsaplots import plot_pacf
from statsmodels.graphics.tsaplots import plot_acf
import matplotlib.pyplot as plt

Log of electricity consumption

In [None]:
# Autocorrelation function of log consumption, lags 0-25.
plot_acf(x=df["lnQ"], lags=25)
plt.show()

In [None]:
# Partial autocorrelation function of log consumption (OLS-based), lags 0-25.
plot_pacf(x=df["lnQ"], method="ols", lags=25)
plt.show()

Cooling degree

In [None]:
# Autocorrelation function of average cooling degrees, lags 0-25.
plot_acf(x=df["CLDD_avg"], lags=25)
plt.show()

In [None]:
# Partial autocorrelation function of average cooling degrees (OLS-based), lags 0-25.
plot_pacf(x=df["CLDD_avg"], method="ols", lags=25)
plt.show()

Heating degree

In [None]:
# Autocorrelation function of average heating degrees, lags 0-25.
plot_acf(x=df["HTDD_avg"], lags=25)
plt.show()

In [None]:
# Partial autocorrelation function of average heating degrees (OLS-based), lags 0-25.
plot_pacf(x=df["HTDD_avg"], method="ols", lags=25)
plt.show()

Solution to the serial correlation in the levels --> create first differences

In [None]:
# First differences: value in the current row minus the value one row earlier
# (the first row has no predecessor and becomes NaN).
df["DlnQ"] = df["lnQ"].diff()
df["DCLDD_avg"] = df["CLDD_avg"].diff()
df["DHTDD_avg"] = df["HTDD_avg"].diff()

<br>__Functional form investigations__

In [None]:
# Change in log consumption against change in cooling degrees, with a loess
# fit to check the functional form.
(
    ggplot(df, aes(x="DCLDD_avg", y="DlnQ"))
    + geom_point(size=1, fill="blue", color="blue")
    + geom_smooth(
        method="loess", se=False, colour="black", size=1.5, span=0.9,
    )
    + labs(
        x="Cooling degrees (Fahrenheit), first difference",
        y="ln(monthly electricity consumption), first difference",
    )
    # arange excludes its stop value; 21 keeps the tick at the upper limit (20).
    + scale_x_continuous(limits=(-20, 20), breaks=np.arange(-20, 21, 10))
    + theme_bw()
)

In [None]:
# Change in log consumption against change in heating degrees, with a loess
# fit to check the functional form.
(
    ggplot(df, aes(x="DHTDD_avg", y="DlnQ"))
    + geom_point(size=1, fill="blue", color="blue")
    + geom_smooth(method="loess", se=False, colour="black", size=1.5)
    + labs(
        x="Heating degrees (Fahrenheit), first difference",
        y="ln(monthly electricity consumption), first difference",
    )
    # arange excludes its stop value; 11 keeps the tick at the upper limit (10).
    + scale_x_continuous(limits=(-10, 10), breaks=np.arange(-10, 11, 10))
    + theme_bw()
)

### Linear regressions

- reg1: DlnQ = alpha + beta_1 * DCLDD_avg + beta_2 * DHTDD_avg
- reg2: DlnQ = alpha + beta_1 * DCLDD_avg + beta_2 * DHTDD_avg + months
- reg3: DlnQ = alpha + gamma * lag( DlnQ ) + beta_1 * DCLDD_avg + beta_2 * DHTDD_avg + months
- reg4: DlnQ = alpha + beta_1 * DCLDD_avg + beta_2 * DHTDD_avg + months + 2 LAGS of DCLDD_avg and DHTDD_avg
- reg_cumSE: use reg4 but estimate standard errors for the cumulative effect

In [None]:
# Chronological order: the .shift() lags inside the regression formulas
# below are positional, so the rows must be ordered by time.
df = df.sort_values("ym", ascending=True)

In [None]:
# reg1: change in log consumption on changes in cooling and heating degrees.
# Fit with Newey-West (HAC, 24 lags) standard errors directly, so the stored
# result carries them wherever reg1 is reused, not only the summary printed
# here. use_t=True keeps the t-based inference that get_robustcov_results
# applied to a plain OLS fit.
reg1 = smf.ols("DlnQ ~ DCLDD_avg + DHTDD_avg", data=df).fit(
    cov_type="HAC", cov_kwds={"maxlags": 24}, use_t=True
)
reg1.summary()

In [None]:
# reg2: reg1 plus month-of-year dummies for seasonality.
# Fit with Newey-West (HAC, 24 lags) standard errors directly, so the stored
# result carries them wherever reg2 is reused, not only the summary printed
# here. use_t=True keeps the t-based inference that get_robustcov_results
# applied to a plain OLS fit.
reg2 = smf.ols("DlnQ ~ DCLDD_avg + DHTDD_avg + C(month)", data=df).fit(
    cov_type="HAC", cov_kwds={"maxlags": 24}, use_t=True
)
reg2.summary()

Compare the two models

In [None]:
# Side-by-side table of the first two specifications; the standard errors
# shown are those stored on each fitted result.
table = Stargazer(
    [
        reg1,
        reg2,
    ]
)
table

reg3: include the lag of DlnQ:

In [None]:
# reg3: reg2 plus the one-month lag of the dependent variable.
# Fit with Newey-West (HAC, 24 lags) standard errors directly, so the stored
# result carries them wherever reg3 is reused, not only the summary printed
# here. use_t=True keeps the t-based inference that get_robustcov_results
# applied to a plain OLS fit.
reg3 = smf.ols(
    "DlnQ ~ DlnQ.shift(1) + DCLDD_avg + DHTDD_avg + C(month)", data=df
).fit(cov_type="HAC", cov_kwds={"maxlags": 24}, use_t=True)
reg3.summary()

reg4: include the lag of heating/cooling degrees up to two lags

In [None]:
# reg4: contemporaneous effect plus two lags of each temperature change,
# with month dummies.
# Fit with Newey-West (HAC, 24 lags) standard errors directly, so the stored
# result carries them wherever reg4 is reused, not only the summary printed
# here. use_t=True keeps the t-based inference that get_robustcov_results
# applied to a plain OLS fit.
reg4 = smf.ols(
    "DlnQ ~ DCLDD_avg + DCLDD_avg.shift(1) + DCLDD_avg.shift(2) + DHTDD_avg + DHTDD_avg.shift(1) + DHTDD_avg.shift(2) + C(month)",
    data=df,
).fit(cov_type="HAC", cov_kwds={"maxlags": 24}, use_t=True)
reg4.summary()

In [None]:
# All four specifications side by side. The number of observations differs
# across columns because each lag costs rows at the start of the sample.
table = Stargazer(
    [
        reg1,
        reg2,
        reg3,
        reg4,
    ]
)
table

 Note: to be fair, one needs to use a restricted sample with 201 observations in this case!

#### Task:
Replicate these results, but now using the same sample for each model to ensure fair comparison!\
You should have the same number of observations in the end

In [None]:
# Re-estimate all four models on one common sample (the rows from position 3
# onwards), with Newey-West (HAC, 24 lags) standard errors stored on each
# result so the comparison table does not fall back to classical ones.
hac_opts = {"cov_type": "HAC", "cov_kwds": {"maxlags": 24}, "use_t": True}

# reg1/reg2 use no lags, so the first three rows are cut explicitly.
reg1_s = smf.ols("DlnQ ~ DCLDD_avg + DHTDD_avg", data=df.iloc[3:, :]).fit(**hac_opts)
reg2_s = smf.ols(
    "DlnQ ~ DCLDD_avg + DHTDD_avg + C(month)", data=df.iloc[3:, :]
).fit(**hac_opts)
# reg3 starts one row earlier on purpose: the in-formula shift(1) is evaluated
# on the slice, turns its first row into NaN, and that row is then dropped.
reg3_s = smf.ols(
    "DlnQ ~ DlnQ.shift(1) + DCLDD_avg + DHTDD_avg + C(month)", data=df.iloc[2:, :]
).fit(**hac_opts)
# reg4 runs on the full frame: the differenced series start with a NaN, so
# their shift(2) is NaN in the first three rows and those rows are dropped.
reg4_s = smf.ols(
    "DlnQ ~ DCLDD_avg + DCLDD_avg.shift(1) + DCLDD_avg.shift(2) + DHTDD_avg + DHTDD_avg.shift(1) + DHTDD_avg.shift(2) + C(month)",
    data=df,
).fit(**hac_opts)

In [None]:
# The four same-sample specifications side by side.
table = Stargazer(
    [
        reg1_s,
        reg2_s,
        reg3_s,
        reg4_s,
    ]
)
table

Trick to estimate SE on the cumulative effect

 1) create double differenced variable

In [None]:
# Second differences: the change in the first-differenced temperature series.
df["DDCLDD_avg"] = df["DCLDD_avg"].diff()
df["DDHTDD_avg"] = df["DHTDD_avg"].diff()

In [None]:
# Re-parameterised reg4 that gives a standard error for the cumulative effect.
# With x = DCLDD_avg and Dx = DDCLDD_avg:
#   b0*x_t + b1*x_{t-1} + b2*x_{t-2}
#     = (b0 + b1 + b2)*x_{t-2} + b0*Dx_t + (b0 + b1)*Dx_{t-1}
# so the coefficient on x.shift(2) is the cumulative effect only when the
# other two regressors are Dx and Dx.shift(1). The cooling block previously
# used DCLDD_avg.shift(1) in place of DDCLDD_avg.shift(1), which left the
# shift(2) coefficient equal to b2 alone. The heating block already had the
# correct form.
# HAC (Newey-West, 24 lags) standard errors are stored on the result itself;
# use_t=True keeps the t-based inference of the earlier summary.
reg_cumSE = smf.ols(
    "DlnQ ~ DCLDD_avg.shift(2) + DHTDD_avg.shift(2) + DDCLDD_avg + DDCLDD_avg.shift(1) + DDHTDD_avg + DDHTDD_avg.shift(1) + C(month)",
    data=df,
).fit(cov_type="HAC", cov_kwds={"maxlags": 24}, use_t=True)
reg_cumSE.summary()

Compare the results

In [None]:
# Distributed-lag model next to its cumulative-effect re-parameterisation.
table = Stargazer(
    [
        reg4,
        reg_cumSE,
    ]
)
table

Remark - the sum of the reg4 coefficients on DCLDD_avg, DCLDD_avg.shift(1) and DCLDD_avg.shift(2) equals the reg_cumSE coefficient on DCLDD_avg.shift(2) \
  extra: in reg_cumSE the coefficient on DCLDD_avg.shift(2) comes with an SE as well! \
  same for DHTDD_avg.shift(2)