# Analyzing Time Series

In [None]:
import pandas as pd
import numpy as np
import plotly.express as px
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
# Default matplotlib figure size (width, height) for every static plot below.
plt.rcParams.update({'figure.figsize': (18, 12)})

In [None]:
# Read the station file: skip the first 19 lines and use the column at
# position 1 as the index, parsed as dates.
raw = pd.read_csv(
    './data/TG_STAID002759.txt',
    sep=',',
    skiprows=19,
    index_col=1,
    parse_dates=True,
)
# Strip surrounding whitespace from the header names.
raw.columns = raw.columns.str.strip()
# Mark the index as daily frequency.
raw.index.freq = "D"
# NOTE(review): earlier drafts re-parsed the index from the DATE column
# (format '%Y%m%d') and filtered rows on Q_TG != 9 and SOUID == 111448.
# Those steps are disabled here - confirm they are not needed.
# Rename the measurement column and keep observations from 1950 onwards.
df = raw.rename(columns={'TG': 'temp'}).loc['1950-01-01':]

In [None]:
# Drop the metadata columns so only the measurement remains.
# errors='ignore' keeps the cell re-runnable: without it a second execution
# raises KeyError because the columns have already been removed.
df = df.drop(columns=['SOUID', 'Q_TG'], errors='ignore')
df

In [None]:
# Interactive line chart of the temperature column over the date index.
fig = px.line(data_frame=df, y="temp")
fig.show()

What do you see in the data?
* it's an additive time series 🤷‍♂️
* **no** trend
* pattern/seasonality
* remainder — noise + time dependence

Our task for the day:
* model the **trend**
* model the **seasonality** (here we'll model trend-seasonality at once)
* extract (and save) the **remainder** to look at it / work with it tomorrow

### Warm-up: compose a time series

In [None]:
# 101 evenly spaced sample points covering the closed interval [0, 10].
x = np.linspace(start=0, stop=10, num=101)

In [None]:
# Trend component: a straight line through the origin with slope 0.01.
y_trend = x * 0.01
plt.plot(x, y_trend)

In [None]:
# Seasonal component: a sine wave with amplitude 10 evaluated at 20*x.
y_seasonal = np.sin(x * 20) * 10
# Plot the seasonal component on top of the trend.
plt.plot(x, y_seasonal + y_trend)

In [None]:
# Noise component: 101 draws from a zero-mean normal distribution with
# standard deviation 3. A seeded generator makes the warm-up plots identical
# on every Restart & Run All.
rng = np.random.default_rng(42)
y_noise = rng.normal(loc=0.0, scale=3, size=101)

In [None]:
# Plot the noise component on its own.
plt.plot(x, y_noise)

In [None]:
# Additive composition: trend + seasonality + noise.
plt.plot(x, y_trend+y_seasonal+y_noise)

## Trend

Two approaches to modeling the trend:
* moving/rolling average
* timestep as variable

In [None]:
# Re-plot the temperature series as a reference before modelling the trend.
fig = px.line(data_frame=df, y="temp")
fig.show()

In [None]:
# Consecutive integer counter (0, 1, 2, ...) used as the regressor for the trend.
df = df.assign(timestep=range(len(df)))
df

In [None]:
# Feature matrix (a one-column DataFrame) and target series for the trend model.
X = df.loc[:, ['timestep']]
y = df.loc[:, 'temp']

In [None]:
# Linear regression estimator used to model the trend.
m = LinearRegression()

In [None]:
# Fit the target y on the feature matrix X.
m.fit(X, y)

In [None]:
# Store the model's in-sample predictions as the 'trend' column.
df['trend'] = m.predict(X)

In [None]:
# Last ten (observed, predicted) pairs as a quick sanity check.
list(zip(y, m.predict(X)))[-10:]

In [None]:
import plotly.graph_objects as go

# Overlay the fitted trend on the observed temperature.
fig = go.Figure()
for column, label in (('temp', 'Temperature'), ('trend', 'Trend')):
    fig.add_trace(go.Scatter(x=df.index.values, y=df[column], name=label))

fig.show()

## Seasonality

In [None]:
# Mean temperature per calendar month, shown as a bar chart.
df['temp'].groupby(df.index.month).mean().plot.bar()

In [None]:
# Preview the one-hot encoding of the calendar month; drop_first=True omits
# the first month level, which then acts as the baseline.
pd.get_dummies(df.index.month, drop_first=True, prefix='month')

In [None]:
# One-hot month indicators, re-indexed by date so they line up with df.
month_dummies = pd.get_dummies(df.index.month, prefix='month', drop_first=True)
month_dummies = month_dummies.set_index(df.index)

In [None]:
# Attach the month indicator columns to the main frame (index-aligned left join).
df = df.join(month_dummies, how='left')
df

In [None]:
# Features for the combined model: every column except the target ('temp')
# and the previously fitted 'trend'.
X = df.drop(columns=['temp', 'trend'])
X

In [None]:
# Refit the same estimator on the current feature matrix X.
m.fit(X, y)

In [None]:
# Store the in-sample predictions of the refitted model.
df['trend_seasonal'] = m.predict(X)

In [None]:
# Overlay the trend + seasonality fit on the observed temperature.
fig = go.Figure()
for column, label in (('temp', 'Temperature'), ('trend_seasonal', 'Seasonal trend')):
    fig.add_trace(go.Scatter(x=df.index.values, y=df[column], name=label))

fig.show()

## Remainder

In [None]:
# Remainder = observed temperature minus the trend + seasonality fit.
df['remainder'] = df['temp'].sub(df['trend_seasonal'])
# Standard deviation of the remainder.
df['remainder'].std()

In [None]:
# Plot the remainder (observed minus trend + seasonality) over time.
fig = go.Figure()
# Trace label fixed: it previously read 'Reminder'.
fig.add_trace(go.Scatter(x=df.index.values, y=df['remainder'], name='Remainder'))

fig.show()

In [None]:
# Optional: uncomment to save the remainder for later use.
# df['remainder'].to_csv('./data/temperature_remainder.csv')

## 7.2. Autoregressive Model (AR)

In [None]:
# The remainder could instead be reloaded from
# './data/temperature_remainder.csv'; that path is disabled and the in-memory
# frame is used.

# Lagged copies of the remainder: lagN is the series shifted down by N rows.
for n_rows in (1, 2, 3):
    df[f'lag{n_rows}'] = df['remainder'].shift(n_rows)

# Pairwise correlation between the remainder and its three lags.
lag_columns = ['remainder', 'lag1', 'lag2', 'lag3']
corr = df[lag_columns].corr()
corr

In [None]:
import seaborn as sns

# Heatmap of the remainder/lag correlation matrix.
sns.heatmap(data=corr)

In [None]:
# Long format: one row per (remainder, lag column, lag value) for plotting.
tmp = pd.melt(df[['remainder', 'lag1', 'lag2', 'lag3']], id_vars=['remainder'])
tmp

In [None]:
# Scatter plot with df['lag1'] on the x-axis and df['remainder'] on the y-axis
# (zoom in to see the OLS trend line).
# Fix: the axes were previously not set as described - lag1 was drawn on the
# y-axis against the row index, so the plot did not show the lag relationship.
# plotly.express (px) is already imported at the top of the notebook.
scatter = px.scatter(df, x="lag1", y="remainder", trendline="ols")
scatter.show()

In [None]:
# Let plotly express compute one scatter trace and one OLS trend line per lag column.
traces = px.scatter(tmp, x="value", y="remainder", color='variable', trendline="ols")

# Copy only the traces at positions 1, 3 and 5 into a fresh figure,
# with legend entries switched on.
fig = go.Figure()
for idx in (1, 3, 5):
    trendline = traces.data[idx]
    trendline['showlegend'] = True
    fig.add_trace(trendline)
fig.show()

# Note: this plot shows the correlation between a previous value (lagN) and the
# current remainder value. The strongest correlation is for lag1, meaning that
# yesterday's weather has the biggest impact on today's weather.

In [None]:
# Correlation between the remainder and its lags (x is the lag level, y is the
# correlation). Lag 0 correlates with the remainder at 1 because there is no
# shift (lag0 == remainder).

from statsmodels.graphics.tsaplots import plot_acf

# The trailing semicolon suppresses the returned Figure's repr. Wrapping the
# call in print() only added a "Figure(...)" text line to the output.
plot_acf(df['remainder']);

In [None]:
# Partial (direct) correlation between each lag level and the remainder, i.e.
# the part that is not already explained by the lower-order lags.

from statsmodels.graphics.tsaplots import plot_pacf

# The trailing semicolon suppresses the returned Figure's repr. Wrapping the
# call in print() only added a "Figure(...)" text line to the output.
plot_pacf(df['remainder'], method='ywm');

In [None]:
from statsmodels.tsa.ar_model import AutoReg, ar_select_order

# Search lag orders up to 12 for the autoregressive model of the remainder.
selected_order = ar_select_order(endog=df['remainder'], maxlag=12)
# Lags (previous days) the selection would include in the autoregression.
selected_order.ar_lags

In [None]:
from statsmodels.tsa.ar_model import AutoReg

# Fit an autoregressive model with a single lag on the remainder.
# NOTE(review): lags is fixed at 1 rather than taken from selected_order.ar_lags
# - confirm this is intentional.
ar_model = AutoReg(df['remainder'], lags=1).fit()
# In-sample AR predictions: the part of the remainder explained by the lag.
df['reminder_explained_by_lag'] = ar_model.predict()
# What is left after removing the AR prediction from the remainder.
df['noise'] = df['remainder'].sub(df['reminder_explained_by_lag'])

In [None]:
# Standard deviation of the noise left after the AR step.
df['noise'].std()

In [None]:
# Build a long-format frame with the observed series, the full model output and
# the leftover noise, dropping rows where any of them is missing.
tmp = (
    df[['temp', 'noise']]
    .assign(
        explained=df['trend_seasonal'] + df['reminder_explained_by_lag'],
        date=df.index,
    )
    .dropna()
    .melt(id_vars=['date'])
)

line = px.line(tmp, x="date", y="value", color='variable')
line.show()  # feel free to zoom in

# Reading the plot:
# "explained" = trend + seasonality + the autoregressive term (fitted with lags=1,
#               i.e. one previous day)
# "noise"     = fluctuations the model does not explain
# "temp"      = the actual observed value

In [None]:
# Same chart restricted to rows dated after 2021-08-01.
is_recent = tmp['date'] > '2021-08-01'
line = px.line(tmp.loc[is_recent], x="date", y="value", color='variable')
line.show()  # feel free to zoom in