In [87]:
import tensorflow as tf
import pandas as pd 

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder

import seaborn as sns

In [88]:
df = pd.read_csv("../data/raw/clean_data.csv")
df.head()

Unnamed: 0,Store,Dept,Date,Weekly_Sales,IsHoliday,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,...,Unemployment,Type,Size,Super_Bowl,Labor_Day,Thanksgiving,Christmas,week,month,year
0,1,1,2010-02-05,24924.5,False,42.31,2.572,0.0,0.0,0.0,...,8.106,A,151315,False,False,False,False,5,2,2010
1,1,2,2010-02-05,50605.27,False,42.31,2.572,0.0,0.0,0.0,...,8.106,A,151315,False,False,False,False,5,2,2010
2,1,3,2010-02-05,13740.12,False,42.31,2.572,0.0,0.0,0.0,...,8.106,A,151315,False,False,False,False,5,2,2010
3,1,4,2010-02-05,39954.04,False,42.31,2.572,0.0,0.0,0.0,...,8.106,A,151315,False,False,False,False,5,2,2010
4,1,5,2010-02-05,32229.38,False,42.31,2.572,0.0,0.0,0.0,...,8.106,A,151315,False,False,False,False,5,2,2010


In [89]:
df['Date'] = pd.to_datetime(df["Date"])
df['Weekly_Sales'] = np.log(df['Weekly_Sales'])
train = df.loc['2012-06-01':].reset_index()
test = df.loc['2012-06-02':].reset_index()

In [90]:
train = train.set_index("Date")
train['Weekly_sales_lag'] = train['Weekly_Sales'].shift(4)
train[['Weekly_Sales','Weekly_sales_lag']].head(10)
train = train.dropna()

test = test.set_index("Date")
test['Weekly_sales_lag'] = test['Weekly_Sales'].shift(4)
test[['Weekly_Sales','Weekly_sales_lag']].head(10)
test = test.dropna()


In [91]:
X_train = train.drop(['Weekly_Sales'], axis=1)
y_train = train['Weekly_Sales']

X_test = test.drop(['Weekly_Sales'], axis=1)
y_test = test['Weekly_Sales']

In [92]:
# X_train['Date'] = pd.to_datetime(X_train['Date'])

In [93]:
X_train.columns

Index(['index', 'Store', 'Dept', 'IsHoliday', 'Temperature', 'Fuel_Price',
       'MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown4', 'MarkDown5', 'CPI',
       'Unemployment', 'Type', 'Size', 'Super_Bowl', 'Labor_Day',
       'Thanksgiving', 'Christmas', 'week', 'month', 'year',
       'Weekly_sales_lag'],
      dtype='object')

In [94]:
CATEGORICAL_FEATURE_KEYS = [
    # 'Dept',
    'IsHoliday',
    "Super_Bowl",
    "Type",
    "Size",
    "Labor_Day",
    "Thanksgiving",
    "Christmas"
]

NUMERIC_FEATURE_KEYS = [
    "Temperature",
    "Fuel_Price",
    "MarkDown1",
    "MarkDown2",
    "MarkDown3",
    "MarkDown4",
    "MarkDown5",
    "CPI",
    "Weekly_sales_lag"
]

In [95]:
ct = ColumnTransformer(
    [("Numeric", StandardScaler(), NUMERIC_FEATURE_KEYS),
     ("Categorical", OneHotEncoder(), CATEGORICAL_FEATURE_KEYS)
    ])




In [96]:
pipeline = Pipeline(steps=[('preprocessor', ct)])
X_train_trans = ct.fit_transform(X_train)
X_test_trans = ct.transform(X_test)
trans_col = pipeline.named_steps['preprocessor'].transformers_[1][1].get_feature_names_out(CATEGORICAL_FEATURE_KEYS)

In [97]:
X_train_trans = pd.DataFrame(X_train_trans,columns=NUMERIC_FEATURE_KEYS + trans_col.tolist())
X_test_trans = pd.DataFrame(X_test_trans,columns=NUMERIC_FEATURE_KEYS + trans_col.tolist())
# X_train_trans['week'] = X_train['week']


In [98]:
X_train_trans.shape, X_train_trans.columns

((219008, 43),
 Index(['Temperature', 'Fuel_Price', 'MarkDown1', 'MarkDown2', 'MarkDown3',
        'MarkDown4', 'MarkDown5', 'CPI', 'Weekly_sales_lag', 'IsHoliday_False',
        'IsHoliday_True', 'Super_Bowl_False', 'Super_Bowl_True', 'Type_A',
        'Type_B', 'Type_C', 'Size_39690', 'Size_39910', 'Size_41062',
        'Size_42988', 'Size_93638', 'Size_103681', 'Size_114533', 'Size_118221',
        'Size_119557', 'Size_128107', 'Size_140167', 'Size_152513',
        'Size_155083', 'Size_158114', 'Size_184109', 'Size_196321',
        'Size_203007', 'Size_203750', 'Size_203819', 'Size_204184',
        'Size_206302', 'Labor_Day_False', 'Labor_Day_True',
        'Thanksgiving_False', 'Thanksgiving_True', 'Christmas_False',
        'Christmas_True'],
       dtype='object'))

In [99]:
trans_col

array(['IsHoliday_False', 'IsHoliday_True', 'Super_Bowl_False',
       'Super_Bowl_True', 'Type_A', 'Type_B', 'Type_C', 'Size_39690',
       'Size_39910', 'Size_41062', 'Size_42988', 'Size_93638',
       'Size_103681', 'Size_114533', 'Size_118221', 'Size_119557',
       'Size_128107', 'Size_140167', 'Size_152513', 'Size_155083',
       'Size_158114', 'Size_184109', 'Size_196321', 'Size_203007',
       'Size_203750', 'Size_203819', 'Size_204184', 'Size_206302',
       'Labor_Day_False', 'Labor_Day_True', 'Thanksgiving_False',
       'Thanksgiving_True', 'Christmas_False', 'Christmas_True'],
      dtype=object)

In [100]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import numpy as np 

In [102]:
lr = LinearRegression()
lr.fit(X_train_trans, y_train)

In [103]:
pred_lr = lr.predict(X_train_trans)

In [104]:
print("Train")
np.sqrt(mean_squared_error(y_train, pred_lr))

Train


2.031362120056187

In [105]:
print("test")
test_pred_lr = lr.predict(X_test_trans)
np.sqrt(mean_squared_error(y_test, test_pred_lr))

test


2.031362120056187

In [106]:
y_test.min(), y_test.max(), y_test.std()

(-4.605170185988091, 13.384374010120712, 2.188309195147046)

In [107]:
rt = RandomForestRegressor()
rt.fit(X_train_trans, y_train)

In [108]:
pred_rt = rt.predict(X_train_trans)

In [109]:
test_pred_rt = rt.predict(X_test_trans)

In [116]:
np.sqrt(mean_squared_error(y_train, pred_rt))

0.7827703927993288

In [117]:
y_train.min(), y_train.max(), y_train.std()

(-4.605170185988091, 13.384374010120712, 2.188309195147046)

In [118]:
np.sqrt(mean_squared_error(y_test, test_pred_rt))

0.7827703927993288

In [119]:
y_test.min(), y_test.max(), y_test.std()

(-4.605170185988091, 13.384374010120712, 2.188309195147046)

In [54]:
import tensorflow as tf 

In [55]:
class RNNModel:
    def __init__(self, rnn_units=100, return_sequences=False):
        # Your code here
        self.model = tf.keras.Sequential(
            [
                # Batch_size, rnn_units. When return_sequence = true, shape = N, input_shape, rnn_units. (useful for stacking RNN)
                tf.keras.layers.LSTM(rnn_units, return_sequences=return_sequences),
                # tf.keras.layers.SimpleRNN(128),
                # tf.keras.layers.Dense(10),
                # tf.keras.layers.Dense(5),
                tf.keras.layers.Dense(1),
            ]
        )
        self.model.compile(
            optimizer=tf.keras.optimizers.RMSprop(),
            loss=tf.keras.losses.MeanSquaredError(),
            metrics=[
                tf.keras.metrics.mean_squared_error,
                tf.keras.metrics.mean_absolute_error,
            ],
        )

    def fit(self, train_data, epochs, val_data=None, callbacks=None):
        # Your code here
        history = self.model.fit(
            train_data, epochs=epochs, validation_data=val_data, callbacks=callbacks
        )
        return history

    def evaluate(self, eval_data, verbose=0):
        # Your code here
        result = self.model.evaluate(eval_data, verbose=verbose, return_dict=True)
        return result['mean_squared_error'], result['mean_absolute_error']

    def predict(self, pred_data):
        # Your code here
        return self.model.predict(pred_data)

In [56]:
model = RNNModel()

Metal device set to: Apple M1 Pro

systemMemory: 32.00 GB
maxCacheSize: 10.67 GB



2022-12-15 14:55:12.516127: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2022-12-15 14:55:12.516356: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:272] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [57]:
# model = tf.keras.Sequential(
#             [
#                 # Batch_size, rnn_units. When return_sequence = true, shape = N, input_shape, rnn_units. (useful for stacking RNN)
#                 tf.keras.layers.LSTM(32, return_sequences=False),
#                 tf.keras.layers.Dense(1),
#             ]
#         )
# model.compile(
#     optimizer=tf.keras.optimizers.RMSprop(),
#     loss=tf.keras.losses.MeanSquaredError(),
#     metrics=[
#         tf.keras.metrics.mean_squared_error,
#         tf.keras.metrics.mean_absolute_error,
#     ],
# )

In [58]:
class WindowGenerator:
    def __init__(
        self, lookback, lookahead, batch_size, train_df, test_df, label_column
    ):
        # Your code here
        # Store data
        self.train_df = train_df
        self.test_df = test_df
        # Get the column indices
        self.label_column = label_column
        self.label_column_indices = {
            name: i for i, name in enumerate(label_column)
        }  # Encoding column name into index
        self.column_indices = {name: i for i, name in enumerate(self.train_df.columns)}

        # Window parameters
        self.lookback = lookback
        self.lookahead = lookahead
        self.total_window_size = self.lookback + self.lookahead
        self.input_slice = slice(0, self.lookback)
        self.lookback_idx = np.arange(self.total_window_size)[self.input_slice]
        self.label_start = self.total_window_size - self.lookahead
        self.label_slice = slice(self.label_start, None)
        self.label_idx = np.arange(self.total_window_size)[self.label_slice]

        self.batch_size = batch_size

    def __repr__(self):
        return "\n".join(
            [
                f"Total window size: {self.total_window_size}",
                f"Lookback indices: {self.lookback_idx}",
                f"Label index: {self.label_idx}",
                f"Label name: {self.label_column}",
            ]
        )

    @property
    def train(self):
        return self.make_dataset(self.train_df)


    @property
    def test(self):
        return self.make_dataset(self.test_df, shuffle=False)

    def make_dataset(self, data, shuffle=True):
        # Your code here
        data = np.array(data, dtype=np.float32)
        ds = tf.keras.utils.timeseries_dataset_from_array(
            data=data,
            targets=None,
            sequence_length=self.total_window_size,
            sequence_stride=1,
            shuffle=shuffle,
            batch_size=32,
        )

        ds = ds.map(self.split_window)

        return ds

    def split_window(self, features):
        inputs = features[:, self.input_slice, :]
        labels = features[:, self.label_slice, :]
        labels = tf.stack(
            [labels[:, :, self.column_indices[name]] for name in self.label_column],
            axis=-1,
        )
        inputs.set_shape([None, self.lookback, None])
        labels.set_shape([None, self.lookahead, None])

        return inputs, labels


In [59]:
train_1 = X_train_trans
train_1['Date'] = train.index
train_1 = train_1.set_index(['Date'])
train_1['Weekly_Sales'] = np.log(y_train)

In [60]:
test_1 = X_train_trans
test_1['Date'] = train.index
test_1 = test_1.set_index(['Date'])
test_1['Weekly_Sales'] = np.log(y_train)

In [61]:
w1 = WindowGenerator(lookback=6, lookahead=1, batch_size=100, train_df=train_1, test_df=test_1, label_column=['Weekly_Sales'])

In [62]:
history = model.fit(w1.train, epochs=10)

Epoch 1/10


2022-12-15 14:55:22.121692: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz
2022-12-15 14:55:23.004102: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2022-12-15 14:55:23.196126: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2022-12-15 14:55:24.396339: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [63]:
pred = model.predict(w1.train)

   9/6844 [..............................] - ETA: 43s  

2022-12-15 15:25:44.168777: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2022-12-15 15:25:44.229342: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.




In [64]:
np.sqrt(mean_squared_error(train_1['Weekly_Sales'][6:], np.exp(pred).flatten()))

21147.43095485315

In [65]:
model.evaluate(w1.train)

2022-12-15 15:26:21.652216: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2022-12-15 15:26:21.732043: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


(1.0131773948669434, 0.5739595890045166)

In [66]:
model.evaluate(w1.test)

(1.013179063796997, 0.5739584565162659)