# Working with Time-Series Data

Revisiting the RAM price example from Chapter 2.


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
import mglearn

## 1. Loading and preparing RAM prices dataset

In [None]:
import os

# Read the RAM price table that ships with the mglearn package.
ram_price_csv = os.path.join(mglearn.datasets.DATA_PATH, "ram_price.csv")
ram_prices = pd.read_csv(ram_price_csv)

# semilogy draws the price on a log-scaled y axis against the date column.
plt.semilogy(ram_prices["date"], ram_prices["price"])
plt.xlabel("Year")
plt.ylabel("Price in $/Mbyte")


### 1.1 Constructing date-time index

The native date is in decimal-year format, e.g. 2001.4. In order to leverage our date-time techniques, we convert it to a pandas datetime.

In [None]:
# Goal: use the fractional part of the decimal year to build month and day.
# Tools: pd.to_datetime() and pd.DatetimeIndex()
# For example, parse the 100th day of the year.
# The format contains no year directive, so only the month and day of the result are meaningful.
pd.to_datetime(100, format='%j') # %j expects a day [1, 366]

In [None]:
# Wrap the example timestamp in a one-element DatetimeIndex.
pd.DatetimeIndex([pd.to_datetime(100, format='%j')])

In [None]:
# Use the fractional part to build month and day.
# 365 * fraction + 1 maps the fraction of the year onto a day-of-year number in [1, 365].
# The value is truncated to an integer explicitly because %j parses whole day
# numbers; this avoids relying on how pandas coerces floats before parsing.
day_of_year = (365 * (ram_prices.date % 1) + 1).astype(int)
month_day = pd.DatetimeIndex(pd.to_datetime(day_of_year, format='%j'))

In [None]:
# Use the integer part to build the year.
# Hint: use pd.to_datetime() and pd.DatetimeIndex()
# The decimal year (e.g. 2001.4) is truncated to a whole year explicitly, because
# %Y parses whole years; this avoids relying on how pandas coerces floats before parsing.
whole_year = ram_prices.date.astype(int)
year = pd.DatetimeIndex(pd.to_datetime(whole_year, format='%Y')).year

In [None]:
# Warm-up for the combine step: when a DataFrame has 'year', 'month' and 'day'
# columns, pd.to_datetime(df) merges them into a single datetime column.

# One-row example frame holding the date 2021-02-14.
example_date = {'year': 2021, 'month': 2, 'day': 14}
df = pd.DataFrame([example_date])
df

In [None]:
# Combine the 'year', 'month' and 'day' columns of the example frame into datetimes.
pd.to_datetime(df)

In [None]:
# Combine year, month and day into one datetime per row and use it as the index.
date_parts = pd.DataFrame({'year': year.values,
                           'month': month_day.month.values,
                           'day': month_day.day.values})

df = (
    date_parts
    # 'date' is the combined datetime; 'price' is the natural log of the RAM price.
    .assign(date=pd.to_datetime(date_parts),
            price=np.log(ram_prices.price))
    # Index the frame by date ...
    .set_index('date')
    # ... and discard the helper columns, leaving only 'price'.
    .drop(columns=['year', 'month', 'day'])
)
df

### 1.2 Resample to regular time steps

To use the shift techniques, it is best to have regularly spaced time steps.

We resample to three-month steps.

**Note:** At the beginning of the data the sampling is coarser than three months, so resampling creates `NaN` values. We fill these in with a forward fill (the last observed value is copied forward).

See https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.resample.html

for more information on resampling.

In [None]:
# Average the log price within each 3-month bin; bins without observations come out as NaN.
df.resample("3M").mean().head()

In [None]:
# Forward-fill the empty bins left by resampling: each NaN takes the last observed value.
# .ffill() is used instead of fillna(method='ffill'), which recent pandas releases deprecate.
df.resample("3M").mean().ffill().head()

In [None]:
# Regularly spaced (3-month) log-price series used in the rest of the notebook.
# Empty bins are forward-filled; .ffill() is used instead of
# fillna(method='ffill'), which recent pandas releases deprecate.
ram = df.resample("3M").mean().ffill()

In [None]:
# Preview the first rows of the resampled, forward-filled series.
ram.head()

In [None]:
# Plot the resampled log price against its date index.
ram.plot()

## 2. Regression using date-time features

### 2.1 Preparing date-time features

In [None]:
# Date-time features: calendar year and month of every resampled time step.
X_year_month = pd.DataFrame({'year': ram.index.year.values,
                             'month': ram.index.month.values})

# Regression target: the (log) price column of the resampled frame.
y = ram.price

In [None]:
# Preview the first rows of the year/month feature table.
X_year_month.head()

In [None]:
# Sanity check: number of samples and features in the feature table.
print(X_year_month.shape)

In [None]:
# Sanity check: the target should have one entry per row of the feature table.
print(y.shape)

### 2.2 Import utility functions
The utility functions now live in a module (`timeseries_utils`) so that all notebooks can use them.

In [None]:
from timeseries_utils import create_xticks, eval_on_features

In [None]:
# Build the x-axis ticks for the evaluation plots from the resampled index.
# NOTE(review): create_xticks is defined in timeseries_utils (not shown here); presumably
# freq='24M' places a tick every 24 months and fmt_str="%Y" labels it with the year — confirm.
xticks = create_xticks(ram.index, freq='24M',fmt_str="%Y" )

Customizing the `eval_on_features` function with `functools.partial`, so that the arguments shared by all evaluations are set only once.

In [None]:
from functools import partial

# Keyword arguments shared by every evaluation in this notebook.
# NOTE(review): their meaning is defined by eval_on_features in timeseries_utils (not
# shown); n_val=60 is presumably the number of samples held out for validation — confirm.
shared_eval_kwargs = dict(n_val=60,
                          y_str="log(RAM price)",
                          xticks=xticks)

# Pre-bind the shared arguments; callers then pass only features, target and model.
eval_on_features_3 = partial(eval_on_features, **shared_eval_kwargs)

### 2.3 Preparing split object for cross-validation

In [None]:
from sklearn.model_selection import TimeSeriesSplit
# Cross-validation splitter with three time-ordered folds.
tscv = TimeSeriesSplit(n_splits=3)

# Print how many samples fall into the training and test part of each fold.
for train_index, test_index in tscv.split(X_year_month):
    print("train size:", train_index.shape, "test size:", test_index.shape)

### 2.4 Random Forest Regressor

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score

# Forest of 100 trees; random_state=0 fixes the seed so repeated runs give the same model.
rf_regressor = RandomForestRegressor(n_estimators=100, random_state=0)

In [None]:
# Evaluate the random forest on the year/month features.
# NOTE(review): eval_on_features is defined in timeseries_utils (not shown); later cells
# read rf_regressor.feature_importances_, so it presumably fits the model in place — confirm.
eval_on_features_3(X_year_month, 
                 y, 
                 rf_regressor)
plt.grid()

In [None]:
# Score the forest on the time-ordered folds produced by tscv.
# NOTE: no `scoring` argument is given, so the estimator's own score method is
# used (R^2 for scikit-learn regressors).
scores = cross_val_score(rf_regressor, X_year_month, y, cv=tscv)

print(scores)
print(f"mean= {scores.mean():.3f}")

**Question:** Why is Random forest failing here?

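Only `month` takes values in the validation period that were already seen during training. If `year` turns out to be the more important feature, the forest cannot extrapolate, because the validation years lie outside the range of years it was trained on.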

Check by looking at feature importances.

In [None]:
# Horizontal bar chart with one bar per date-time feature.
# NOTE(review): feature_importances_ only exists on a fitted estimator; this cell
# relies on rf_regressor having been fitted by an earlier cell — confirm.
feature_names = X_year_month.columns
n_features = len(feature_names)
bar_positions = np.arange(n_features)

plt.barh(bar_positions, rf_regressor.feature_importances_, align='center')
plt.yticks(bar_positions, feature_names)
plt.xlabel("Feature importance")
plt.ylabel("Feature")
# Leave one unit of padding below the first and above the last bar.
plt.ylim(-1, n_features);

### 2.5 Linear regression with l2 regularization (ridge)

In [None]:
from sklearn.linear_model import Ridge

# Ridge regression (linear model with an l2 penalty) using the default settings.
ridge = Ridge()

In [None]:
# Evaluate ridge regression on the year/month features.
# NOTE(review): eval_on_features is defined in timeseries_utils (not shown); a later cell
# reads ridge.coef_, so it presumably fits the model in place — confirm.
eval_on_features_3(X_year_month, 
                 y, 
                 ridge)
plt.grid()

In [None]:
# Score ridge regression on the same time-ordered folds used for the random forest.
# NOTE: no `scoring` argument is given, so the estimator's own score method is
# used (R^2 for scikit-learn regressors).
scores = cross_val_score(ridge, X_year_month, y, cv=tscv)
print(scores)
print(f"mean= {scores.mean():.3f}")

In [None]:
# One marker per ridge coefficient, labelled with the matching feature name.
# NOTE(review): coef_ only exists on a fitted model; this cell relies on ridge
# having been fitted by an earlier cell — confirm.
coefficients = ridge.coef_
tick_positions = np.arange(len(coefficients))

plt.figure(figsize=(6, 2))
plt.plot(coefficients, 'o')
# Fixed y range: coefficients outside [-0.5, 0] are not visible.
plt.ylim([-0.5, 0])
plt.xticks(tick_positions, X_year_month.columns, rotation=90)
plt.xlabel("Feature name")
plt.ylabel("Feature magnitude")
plt.grid();

## 3. Multiple lag features

We can use this shifting technique to engineer multiple feature columns, each with a different lag.

One time step is 3 months. We include lags of 1 to 4 time steps, i.e. 3 to 12 months. This means that we need one year's worth of data to make a prediction.

### 3.1 Preparing lag features

In [None]:
# Target values without the date index, so all columns share a plain 0..n-1 index.
y_df = pd.DataFrame(y.values)
target = y_df[0]

# 'original' is the value to predict; 'lag_k' is the value k time steps earlier.
lag_columns = {'original': target}
lag_columns.update({f'lag_{k}': target.shift(k) for k in range(1, 5)})

# Shifting leaves NaN in the first rows (no earlier value exists); drop those rows.
X_shift = pd.DataFrame(lag_columns).dropna()

In [None]:
# Inspect the lag-feature table ('original' plus lag_1 ... lag_4).
X_shift

### 3.2 Random Forest Regressor

In [None]:
# Evaluate the random forest on the lag features: every column except 'original'
# is an input, and 'original' is the target.
# NOTE(review): eval_on_features is defined in timeseries_utils (not shown); it presumably
# refits rf_regressor in place — confirm.
eval_on_features_3(X_shift.drop(columns=['original']), 
                 X_shift['original'], 
                 rf_regressor)
plt.grid()

In [None]:
# Cross-validate the random forest on the lag features with the time-ordered folds.
lag_inputs = X_shift.drop(columns=['original'])
lag_target = X_shift['original']

scores = cross_val_score(rf_regressor, lag_inputs, lag_target, cv=tscv)
print(scores)
print(f"mean= {scores.mean():.3f}")

In [None]:
# Horizontal bar chart with one bar per lag feature.
# NOTE(review): feature_importances_ reflects the most recent in-place fit of
# rf_regressor, presumably the evaluation on the lag features — confirm.
lag_feature_names = X_shift.drop(columns=['original']).columns
n_features = len(lag_feature_names)
bar_positions = np.arange(n_features)

plt.barh(bar_positions, rf_regressor.feature_importances_, align='center')
plt.yticks(bar_positions, lag_feature_names)
plt.xlabel("Feature importance")
plt.ylabel("Feature")
# Leave one unit of padding below the first and above the last bar.
plt.ylim(-1, n_features);

### 3.3 Linear regression with l2 regularization (ridge)

In [None]:
# Evaluate ridge regression on the lag features: every column except 'original'
# is an input, and 'original' is the target.
# NOTE(review): eval_on_features is defined in timeseries_utils (not shown); it presumably
# refits ridge in place — confirm.
eval_on_features_3(X_shift.drop(columns=['original']), 
                 X_shift['original'], 
                 ridge)
plt.grid()

In [None]:
# Cross-validate ridge regression on the lag features with the time-ordered folds.
lag_inputs = X_shift.drop(columns=['original'])
lag_target = X_shift['original']

scores = cross_val_score(ridge, lag_inputs, lag_target, cv=tscv)
print(scores)
print(f"mean= {scores.mean():.3f}")

In [None]:
# One marker per ridge coefficient, labelled with the matching lag feature.
# NOTE(review): coef_ reflects the most recent in-place fit of ridge,
# presumably the evaluation on the lag features — confirm.
coefficients = ridge.coef_
lag_feature_names = X_shift.drop(columns=['original']).columns
tick_positions = np.arange(len(coefficients))

plt.figure(figsize=(6, 2))
plt.plot(coefficients, 'o')
plt.xticks(tick_positions, lag_feature_names, rotation=90)
plt.xlabel("Feature name")
plt.ylabel("Feature magnitude")
plt.grid();

## 4. Summary
There is not much seasonality in this dataset. Random Forest is not able to predict well, since it would need to extrapolate. As we know, Random Forest cannot predict values outside the range it has seen during training.

Ridge regression provides an OK model; however, it primarily uses the previous value (lag 1) to predict the next.
