### First, we'll load in the processed data from the data folder

In [75]:
import numpy as np
import pandas as pd

# Read every pre-processed split from the data folder, pairing each
# feature table (X_*) with its target table (y_*).
X_train, y_train = pd.read_csv('../data/X_train.csv'), pd.read_csv('../data/y_train.csv')
X_val, y_val = pd.read_csv('../data/X_val.csv'), pd.read_csv('../data/y_val.csv')
X_test, y_test = pd.read_csv('../data/X_test.csv'), pd.read_csv('../data/y_test.csv')

In [76]:
# Preview the training features; as the last expression in the cell,
# the frame is rendered as the cell's output.
X_train

Unnamed: 0,Open,High,Low,Close,Volume,moving_avg_50,moving_avg_100,Day,Days_to_election,Avg_month_temp,...,Month_3,Month_4,Month_5,Month_6,Month_7,Month_8,Month_9,Month_10,Month_11,Month_12
0,-1.261893,-1.282373,-1.244264,-1.257836,-1.043700,,,0.779614,1.000000,0.159902,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,-1.264133,-1.269703,-1.274461,-1.305309,-0.313123,,,0.782369,0.999082,0.159902,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,-1.303205,-1.313705,-1.312510,-1.337888,0.035411,,,0.785124,0.998163,0.159902,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,-1.327979,-1.263471,-1.292745,-1.235772,0.181478,,,0.787879,0.997245,0.159902,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,-1.231848,-1.231140,-1.210141,-1.203882,-0.035195,,,0.790634,0.996327,0.159902,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
748,1.875767,1.879728,1.880024,1.869527,-0.367313,1.805446,1.702362,0.749311,0.003673,-0.082026,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
749,1.867188,1.860309,1.885052,1.865527,-0.313181,1.813941,1.710146,0.752066,0.002755,-0.082026,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
750,1.894373,1.889369,1.891904,1.872699,-0.030484,1.821609,1.717829,0.754821,0.001837,-0.082026,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
751,1.851855,1.820232,1.762988,1.790303,-0.176766,1.825661,1.724491,0.757576,0.000918,-0.082026,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


## Processing: 14-day sliding prediction window
***Our model will predict values of our target variable by considering the past two weeks' data at any time. In order to do this, we will transform the structure of our data such that each row includes a given day's feature values, as well as the values for each of the 14 preceding days. We will also include the past values of the target variable in this window because after a day has passed, the target variable's value on that day can be treated as known.***

In [77]:
# each column will have a '_i' at the end of its name indicating
# its delay (i.e. the present day will end in '_0' and the values
# from two weeks ago will have '_14' at the end).

def get_lagged_df(df, delay):
    '''Return a wide frame holding `df` alongside its first `delay` row-lags.

    For every lag i = 0, ..., delay the whole frame is shifted down by i rows
    and its columns are renamed to '<original name>_<i>'; the shifted copies
    are then concatenated side by side. Lag 0 is the unshifted data, so the
    result has (delay + 1) * len(df.columns) columns and the same index as
    `df`. Rows without enough history hold missing values in the lagged
    columns (the fill value used by DataFrame.shift).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose rows are ordered in time. It is not modified.
    delay : int
        Largest lag to include; must be >= 0.

    Returns
    -------
    pandas.DataFrame
        New frame with the lagged columns, ordered lag 0 first.

    Raises
    ------
    ValueError
        If `delay` is negative.
    '''
    if delay < 0:
        raise ValueError('delay must be non-negative, got {}'.format(delay))
    # shift() already returns a new object, so no explicit copy is required;
    # add_suffix() formats each label, so non-string column names work too
    lagged_frames = [
        df.shift(lag).add_suffix('_' + str(lag))
        for lag in range(delay + 1)
    ]
    return pd.concat(lagged_frames, axis=1)

In [78]:
# now let's go through our target variable and grab its lagged values
# --> just as in the previous method we'll add on target variable columns for
# delays i=1,...,14 (not 0 because we aren't allowed to see the current day's
# target value)

# function will take in the df we're editing along with the corresponding
# target variable series y

def get_lagged_y(df, y, delay):
    '''Append lagged copies of the target `y` to the feature matrix `df`.

    Adds columns 'y_1', ..., 'y_<delay>' where 'y_k' is `y` shifted down by
    k rows. There is no 'y_0' column: the current day's target is what we
    predict. `df` is modified in place and the same object is returned.
    '''
    for lag in range(1, delay + 1):
        column_name = 'y_{}'.format(lag)
        df[column_name] = y.shift(lag)
    return df

In [79]:
# function to combine the two lagging methods and spit out our data in the form 
# in which we can feed it to a regression model

def get_timeseries_df(df, y, delay):
    '''Build the windowed matrix in a single call.

    First lags the feature columns of `df` with get_lagged_df, then attaches
    the lagged target columns from `y` with get_lagged_y, and returns the
    combined frame.
    '''
    lagged_features = get_lagged_df(df, delay)
    return get_lagged_y(lagged_features, y, delay)

### Now we can transform our data into 14-day sliding time-series windows

In [80]:
# number of preceding days included in each row's window
WINDOW_DAYS = 14

# generating the datasets -- each split is paired with its OWN target so the
# lagged y columns describe the same rows as the features (the test split
# was previously windowed with y_val by mistake)
X_train_windowed = get_timeseries_df(X_train, y_train, WINDOW_DAYS)
X_val_windowed = get_timeseries_df(X_val, y_val, WINDOW_DAYS)
X_test_windowed = get_timeseries_df(X_test, y_test, WINDOW_DAYS)

# ... and saving them to our data folder (index=False keeps the row index
# out of the written files)
X_train_windowed.to_csv('../data/X_train_windowed.csv', index=False)
X_val_windowed.to_csv('../data/X_val_windowed.csv', index=False)
X_test_windowed.to_csv('../data/X_test_windowed.csv', index=False)