## Load data

In [1]:
import pandas as pd

In [2]:
# Load the daily minimum temperatures as a Series indexed by date.
# `read_csv(..., squeeze=True)` was deprecated in pandas 1.4 and removed in 2.0,
# so the one-column frame is collapsed to a Series explicitly with
# `.squeeze('columns')`, which works on both old and new pandas versions.
series = pd.read_csv(
    './data/daily-minimum-temperatures.csv',
    header=0,           # first row holds the column names
    index_col=0,        # first column becomes the index
    parse_dates=True,   # parse the index as dates
).squeeze('columns')

In [3]:
# Peek at the first five observations to check that the load worked.
series.head(n=5)

Date
1981-01-01    20.7
1981-01-02    17.9
1981-01-03    18.8
1981-01-04    14.6
1981-01-05    15.8
Name: Temp, dtype: float64

In [4]:
# Peek at the last five observations to see where the series ends.
series.tail(n=5)

Date
1990-12-27    14.0
1990-12-28    13.6
1990-12-29    13.5
1990-12-30    15.7
1990-12-31    13.0
Name: Temp, dtype: float64

## Convert Date

In [5]:
# Split each timestamp of the index into year / month / day feature columns and
# keep the temperature alongside them.
#
# The index's vectorized `.year` / `.month` / `.day` accessors replace one
# Python-level lookup per row. The temperatures are taken positionally with
# `to_numpy()`: `series[i]` with an integer `i` on a date-labelled Series relies
# on a positional fallback that recent pandas versions deprecate.
# Index objects and arrays (not Series) are passed, so the frame gets a fresh
# 0..n-1 row index, as before.
# NOTE: on recent pandas the date-part columns may be int32 rather than int64.
df = pd.DataFrame({
    'year': series.index.year,
    'month': series.index.month,
    'day': series.index.day,
    'temperature': series.to_numpy(),
})

df.head(5)

Unnamed: 0,year,month,day,temperature
0,1981,1,1,20.7
1,1981,1,2,17.9
2,1981,1,3,18.8
3,1981,1,4,14.6
4,1981,1,5,15.8


## Lag feature
- Lag features are the classical way that time series forecasting problems are transformed into
supervised learning problems.
- The simplest approach is to predict the value at the next time step (t+1) given the value at the current time step (t):
    + Value(t), Value(t+1)

In [6]:
# Keep only the raw temperature values: one unnamed column (0) on a fresh
# 0..n-1 row index, without the dates.
temps = pd.DataFrame(data=series.values)
temps.head(n=5)

Unnamed: 0,0
0,20.7
1,17.9
2,18.8
3,14.6
4,15.8


In [7]:
# Pair every observation with its predecessor: the series shifted down by one
# row is the input 't', the unshifted series is the target 't+1'.
# The first row has no predecessor, so its 't' is NaN.
df = pd.concat([temps.shift(lag) for lag in (1, 0)], axis=1)
df.columns = ['t', 't+1']

df.head(n=5)

Unnamed: 0,t,t+1
0,,20.7
1,20.7,17.9
2,17.9,18.8
3,18.8,14.6
4,14.6,15.8


In [8]:
### Multiple lags - sliding window
# Three lagged copies of the series (shifted by 3, 2 and 1 rows) become the
# inputs 't-2', 't-1' and 't'; the unshifted series (lag 0) is the target 't+1'.
df = pd.concat([temps.shift(lag) for lag in (3, 2, 1, 0)], axis=1)
df.columns = ['t-2', 't-1', 't', 't+1']

df.head(n=5)

Unnamed: 0,t-2,t-1,t,t+1
0,,,,20.7
1,,,20.7,17.9
2,,20.7,17.9,18.8
3,20.7,17.9,18.8,14.6
4,17.9,18.8,14.6,15.8


## Rolling Window Statistics
- Calculate summary statistics across the values in the sliding window and include them as features in the dataset.

- Example rolling mean:
    + take the mean of the current and previous values, and use it to predict the next value

In [9]:
# Rebuild the bare value frame, then lay out the two previous observations
# ('t-1', 't') next to the value to predict ('t+1').
temps = pd.DataFrame(data=series.values)
df = pd.concat([temps.shift(lag) for lag in (2, 1, 0)], axis=1)
df.columns = ['t-1', 't', 't+1']

df.head(n=5)

Unnamed: 0,t-1,t,t+1
0,,,20.7
1,,20.7,17.9
2,20.7,17.9,18.8
3,17.9,18.8,14.6
4,18.8,14.6,15.8


In [10]:
# Rolling mean of the two observations that precede each row:
# shift by one row so the current value is excluded, then average over a
# window of two rows.
temps = pd.DataFrame(data=series.values)
shifted = temps.shift(periods=1)
window = shifted.rolling(2)
means = window.mean()

In [11]:
# Put the mean of the two previous observations, (t-1) and (t), next to the
# value it is meant to predict (t+1).
df = pd.concat(objs=[means, temps], axis='columns')
df.columns = ['mean(t-1,t)', 't+1']

df.head(n=5)

Unnamed: 0,"mean(t-1,t)",t+1
0,,20.7
1,,17.9
2,19.3,18.8
3,18.35,14.6
4,16.7,15.8


- Rolling min, mean, max (the lag table below shows the 4 previous values; the statistics cell after it uses `width = 3`)

In [12]:
# Lag table with the four previous observations ('t-3' .. 't') beside the
# value to predict ('t+1').
temps = pd.DataFrame(data=series.values)
df = pd.concat([temps.shift(lag) for lag in (4, 3, 2, 1, 0)], axis=1)
df.columns = ['t-3', 't-2', 't-1', 't', 't+1']

df.head(n=5)

Unnamed: 0,t-3,t-2,t-1,t,t+1
0,,,,,20.7
1,,,,20.7,17.9
2,,,20.7,17.9,18.8
3,,20.7,17.9,18.8,14.6
4,20.7,17.9,18.8,14.6,15.8


In [13]:
# Rolling min, mean and max over a window of `width` (= 3) observations.
# NOTE(review): the series is shifted by width - 1 (= 2) rows before rolling, so
# each window ends two rows before its 't+1' target and the most recent
# observation (t) is left out of the statistics. Confirm this gap is intended;
# shifting by 1 instead would include it.
temps = pd.DataFrame(data=series.values)

width = 3
shifted = temps.shift(periods=width - 1)
window = shifted.rolling(width)

# One statistic column per name, followed by the unshifted target column.
df = pd.concat(
    [getattr(window, stat)() for stat in ('min', 'mean', 'max')] + [temps],
    axis=1,
)
df.columns = ['min', 'mean', 'max', 't+1']

df.head(n=5)

Unnamed: 0,min,mean,max,t+1
0,,,,20.7
1,,,,17.9
2,,,,18.8
3,,,,14.6
4,17.9,19.133333,20.7,15.8


In [14]:
# Pandas built-in expanding window: min, mean and max of every observation
# seen so far (all rows up to and including the current one), not a
# fixed-width rolling window.
temps = pd.DataFrame(data=series.values)
window = temps.expanding()

# The target is the next observation, so the series is shifted up by one row;
# the last row therefore has no 't+1' value.
df = pd.concat(
    [getattr(window, stat)() for stat in ('min', 'mean', 'max')] + [temps.shift(-1)],
    axis=1,
)
df.columns = ['min', 'mean', 'max', 't+1']

df.head(n=5)

Unnamed: 0,min,mean,max,t+1
0,20.7,20.7,20.7,17.9
1,17.9,19.3,20.7,18.8
2,17.9,19.133333,20.7,14.6
3,14.6,18.0,20.7,15.8
4,14.6,17.56,20.7,15.8
