In [440]:
import tensorflow as tf
import pandas as pd 
import numpy as np 

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder

import seaborn as sns

In [441]:
df = pd.read_csv("../data/raw/clean_data.csv")
df.head()

Unnamed: 0,Store,Dept,Date,Weekly_Sales,IsHoliday,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,...,Unemployment,Type,Size,Super_Bowl,Labor_Day,Thanksgiving,Christmas,week,month,year
0,1,1,2010-02-05,24924.5,False,42.31,2.572,0.0,0.0,0.0,...,8.106,A,151315,False,False,False,False,5,2,2010
1,1,2,2010-02-05,50605.27,False,42.31,2.572,0.0,0.0,0.0,...,8.106,A,151315,False,False,False,False,5,2,2010
2,1,3,2010-02-05,13740.12,False,42.31,2.572,0.0,0.0,0.0,...,8.106,A,151315,False,False,False,False,5,2,2010
3,1,4,2010-02-05,39954.04,False,42.31,2.572,0.0,0.0,0.0,...,8.106,A,151315,False,False,False,False,5,2,2010
4,1,5,2010-02-05,32229.38,False,42.31,2.572,0.0,0.0,0.0,...,8.106,A,151315,False,False,False,False,5,2,2010


In [442]:
df['Date'] = pd.to_datetime(df["Date"])
df.index = df['Date']
# df = df.resample("W").mean()
df = df.drop(['Date'], axis=1)
# df['Weekly_Sales'] = np.log(df['Weekly_Sales'])
train = df.loc[:'2012-10-01'].reset_index()
test = df.loc['2012-10-01':].reset_index()
train.shape, test.shape

  test = df.loc['2012-10-01':].reset_index()


((408369, 23), (11843, 23))

In [443]:
train = train.set_index("Date")
train['Weekly_sales_lag'] = train['Weekly_Sales'].shift(4)
train = train.reset_index()
train = train.dropna()

test = test.set_index("Date")
test['Weekly_sales_lag'] = test['Weekly_Sales'].shift(4)
test = test.reset_index()
test = test.dropna()


In [444]:
X_train = train.drop(['Weekly_Sales'], axis=1)
y_train = np.log(train['Weekly_Sales'])

X_test = test.drop(['Weekly_Sales'], axis=1)
y_test = np.log(test['Weekly_Sales'])

In [445]:
# X_train['Date'] = pd.to_datetime(X_train['Date'])

In [446]:
X_train.columns

Index(['Date', 'Store', 'Dept', 'IsHoliday', 'Temperature', 'Fuel_Price',
       'MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown4', 'MarkDown5', 'CPI',
       'Unemployment', 'Type', 'Size', 'Super_Bowl', 'Labor_Day',
       'Thanksgiving', 'Christmas', 'week', 'month', 'year',
       'Weekly_sales_lag'],
      dtype='object')

In [447]:
CATEGORICAL_FEATURE_KEYS = [
    'Dept',
    'IsHoliday',
    "Super_Bowl",
    "Type",
    "Size",
    "Labor_Day",
    "Thanksgiving",
    "Christmas",
    "year",
    "week"
]

NUMERIC_FEATURE_KEYS = [
    "Temperature",
    "Fuel_Price",
    "MarkDown1",
    "MarkDown2",
    "MarkDown3",
    "MarkDown4",
    "MarkDown5",
    "CPI",
    "Weekly_sales_lag"
]

# ORDINAL_FEATURE_KEYS = [
#     "year",
#     "week"
# ]

In [448]:
ct = ColumnTransformer(
    [("Numeric", StandardScaler(), NUMERIC_FEATURE_KEYS),
     ("Categorical", OneHotEncoder(), CATEGORICAL_FEATURE_KEYS)
    ])




In [449]:
pipeline = Pipeline(steps=[('preprocessor', ct)])
X_train_trans = ct.fit_transform(X_train)
X_test_trans = ct.transform(X_test)
trans_col = pipeline.named_steps['preprocessor'].transformers_[1][1].get_feature_names_out(CATEGORICAL_FEATURE_KEYS)

In [450]:
X_train_trans = pd.DataFrame(X_train_trans.toarray(), columns=NUMERIC_FEATURE_KEYS + trans_col.tolist())
X_test_trans = pd.DataFrame(X_test_trans.toarray(),columns=NUMERIC_FEATURE_KEYS + trans_col.tolist())
# X_train_trans['week'] = X_train['week']
# X_test_trans['week'] = X_test['week']
# X_train_trans = X_train_trans.dropna()
# X_test_trans = X_test_trans.dropna()



In [451]:
X_train_trans

Unnamed: 0,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Weekly_sales_lag,Dept_1,...,week_43,week_44,week_45,week_46,week_47,week_48,week_49,week_50,week_51,week_52
0,-0.953080,-1.702558,-0.419353,-0.175588,-0.085799,-0.274567,-0.380216,1.023768,0.390149,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,-0.953080,-1.702558,-0.419353,-0.175588,-0.085799,-0.274567,-0.380216,1.023768,1.518722,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-0.953080,-1.702558,-0.419353,-0.175588,-0.085799,-0.274567,-0.380216,1.023768,-0.101363,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,-0.953080,-1.702558,-0.419353,-0.175588,-0.085799,-0.274567,-0.380216,1.023768,1.050641,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,-0.953080,-1.702558,-0.419353,-0.175588,-0.085799,-0.274567,-0.380216,1.023768,0.711171,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
408360,0.258408,1.429542,0.325591,-0.171579,-0.085531,0.131404,0.398139,0.535703,-0.318996,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
408361,0.258408,1.429542,0.325591,-0.171579,-0.085531,0.131404,0.398139,0.535703,0.286505,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
408362,0.258408,1.429542,0.325591,-0.171579,-0.085531,0.131404,0.398139,0.535703,-0.045078,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
408363,0.258408,1.429542,0.325591,-0.171579,-0.085531,0.131404,0.398139,0.535703,1.376635,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [452]:
X_train_trans.shape, X_train_trans.columns

((408365, 198),
 Index(['Temperature', 'Fuel_Price', 'MarkDown1', 'MarkDown2', 'MarkDown3',
        'MarkDown4', 'MarkDown5', 'CPI', 'Weekly_sales_lag', 'Dept_1',
        ...
        'week_43', 'week_44', 'week_45', 'week_46', 'week_47', 'week_48',
        'week_49', 'week_50', 'week_51', 'week_52'],
       dtype='object', length=198))

In [453]:
trans_col

array(['Dept_1', 'Dept_2', 'Dept_3', 'Dept_4', 'Dept_5', 'Dept_6',
       'Dept_7', 'Dept_8', 'Dept_9', 'Dept_10', 'Dept_11', 'Dept_12',
       'Dept_13', 'Dept_14', 'Dept_16', 'Dept_17', 'Dept_18', 'Dept_19',
       'Dept_20', 'Dept_21', 'Dept_22', 'Dept_23', 'Dept_24', 'Dept_25',
       'Dept_26', 'Dept_27', 'Dept_28', 'Dept_29', 'Dept_30', 'Dept_31',
       'Dept_32', 'Dept_33', 'Dept_34', 'Dept_35', 'Dept_36', 'Dept_37',
       'Dept_38', 'Dept_39', 'Dept_40', 'Dept_41', 'Dept_42', 'Dept_43',
       'Dept_44', 'Dept_45', 'Dept_46', 'Dept_47', 'Dept_48', 'Dept_49',
       'Dept_50', 'Dept_51', 'Dept_52', 'Dept_54', 'Dept_55', 'Dept_56',
       'Dept_58', 'Dept_59', 'Dept_60', 'Dept_65', 'Dept_67', 'Dept_71',
       'Dept_72', 'Dept_74', 'Dept_77', 'Dept_78', 'Dept_79', 'Dept_80',
       'Dept_81', 'Dept_82', 'Dept_83', 'Dept_85', 'Dept_87', 'Dept_90',
       'Dept_91', 'Dept_92', 'Dept_93', 'Dept_94', 'Dept_95', 'Dept_96',
       'Dept_97', 'Dept_98', 'Dept_99', 'IsHoliday_False',
 

In [454]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import numpy as np 

In [455]:
lr = LinearRegression()
lr.fit(X_train_trans, y_train)

In [456]:
pred_lr = lr.predict(X_train_trans)

In [458]:
print("Train")
np.sqrt(mean_squared_error(y_train, pred_lr))

Train


1.1545892654894692

In [459]:
print("test")
test_pred_lr = lr.predict(X_test_trans)
np.sqrt(mean_squared_error(y_test, test_pred_lr))

test


1.16424159368037

In [460]:
y_test.min(), y_test.max(), y_test.std()

(-4.605170185988091, 12.169724223396777, 1.9976587043296536)

In [465]:
X_test_trans.shape

(11839, 198)

In [463]:
len(lr.coef_)

198

In [439]:
rt = RandomForestRegressor(max_depth= 50)
rt.fit(X_train_trans, y_train)

In [235]:
pred_rt = rt.predict(X_train_trans)

In [236]:
test_pred_rt = rt.predict(X_test_trans)

In [237]:
np.sqrt(mean_squared_error(y_train, pred_rt))

0.7618383334851071

In [238]:
y_train.min(), y_train.max(), y_train.std()

(-4.605170185988091, 13.448928644517972, 2.050487503404155)

In [239]:
np.sqrt(mean_squared_error(y_test, test_pred_rt))

1.7663920528199546

In [240]:
y_test.min(), y_test.max(), y_test.std()

(-4.605170185988091, 12.169724223396777, 1.9976587043296536)

In [241]:
import tensorflow as tf 

In [325]:
class RNNModel:
    def __init__(self, rnn_units=100, return_sequences=False):
        # Your code here
        self.model = tf.keras.Sequential(
            [
                # Batch_size, rnn_units. When return_sequence = true, shape = N, input_shape, rnn_units. (useful for stacking RNN)
                tf.keras.layers.LSTM(rnn_units, return_sequences=return_sequences),
                # tf.keras.layers.SimpleRNN(128),
                tf.keras.layers.Dense(10),
                tf.keras.layers.Dense(5),
                tf.keras.layers.Dense(1),
                tf.keras.layers.Dense(1, activation='linear'),
            ]
        )
        self.model.compile(
            optimizer=tf.keras.optimizers.RMSprop(),
            loss=tf.keras.losses.MeanSquaredError(),
            metrics=[
                tf.keras.metrics.mean_squared_error,
                tf.keras.metrics.mean_absolute_error,
            ],
        )

    def fit(self, train_data, epochs, val_data=None, callbacks=None):
        # Your code here
        history = self.model.fit(
            train_data, epochs=epochs, validation_data=val_data, callbacks=callbacks
        )
        return history

    def evaluate(self, eval_data, verbose=0):
        # Your code here
        result = self.model.evaluate(eval_data, verbose=verbose, return_dict=True)
        return result['mean_squared_error'], result['mean_absolute_error']

    def predict(self, pred_data):
        # Your code here
        return self.model.predict(pred_data)

In [326]:
model = RNNModel()

In [327]:
# model = tf.keras.Sequential(
#             [
#                 # Batch_size, rnn_units. When return_sequence = true, shape = N, input_shape, rnn_units. (useful for stacking RNN)
#                 tf.keras.layers.LSTM(32, return_sequences=False),
#                 tf.keras.layers.Dense(1),
#             ]
#         )
# model.compile(
#     optimizer=tf.keras.optimizers.RMSprop(),
#     loss=tf.keras.losses.MeanSquaredError(),
#     metrics=[
#         tf.keras.metrics.mean_squared_error,
#         tf.keras.metrics.mean_absolute_error,
#     ],
# )

In [329]:
class WindowGenerator:
    def __init__(
        self, lookback, lookahead, batch_size, train_df, test_df, label_column
    ):
        # Your code here
        # Store data
        self.train_df = train_df
        self.test_df = test_df
        # Get the column indices
        self.label_column = label_column
        self.label_column_indices = {
            name: i for i, name in enumerate(label_column)
        }  # Encoding column name into index
        self.column_indices = {name: i for i, name in enumerate(self.train_df.columns)}

        # Window parameters
        self.lookback = lookback
        self.lookahead = lookahead
        self.total_window_size = self.lookback + self.lookahead
        self.input_slice = slice(0, self.lookback)
        self.lookback_idx = np.arange(self.total_window_size)[self.input_slice]
        self.label_start = self.total_window_size - self.lookahead
        self.label_slice = slice(self.label_start, None)
        self.label_idx = np.arange(self.total_window_size)[self.label_slice]

        self.batch_size = batch_size

    def __repr__(self):
        return "\n".join(
            [
                f"Total window size: {self.total_window_size}",
                f"Lookback indices: {self.lookback_idx}",
                f"Label index: {self.label_idx}",
                f"Label name: {self.label_column}",
            ]
        )

    @property
    def train(self):
        return self.make_dataset(self.train_df)


    @property
    def test(self):
        return self.make_dataset(self.test_df, shuffle=False)

    def make_dataset(self, data, shuffle=True):
        # Your code here
        data = np.array(data, dtype=np.float32)
        ds = tf.keras.utils.timeseries_dataset_from_array(
            data=data,
            targets=None,
            sequence_length=self.total_window_size,
            sequence_stride=1,
            shuffle=shuffle,
            batch_size=32,
        )

        ds = ds.map(self.split_window)

        return ds

    def split_window(self, features):
        inputs = features[:, self.input_slice, :]
        labels = features[:, self.label_slice, :]
        labels = tf.stack(
            [labels[:, :, self.column_indices[name]] for name in self.label_column],
            axis=-1,
        )
        inputs.set_shape([None, self.lookback, None])
        labels.set_shape([None, self.lookahead, None])

        return inputs, labels


In [330]:
train_1 = X_train_trans
train_1['Date'] = train.Date
train_1['Weekly_Sales'] = np.exp(y_train)
train_1 = train_1.set_index(['Date'])
train_1 = train_1.dropna()

In [331]:
test_1 = X_test_trans
test_1['Date'] = test.Date
test_1['Weekly_Sales'] = np.exp(y_test)
test_1 = test_1.set_index(['Date'])
test_1 = test_1.dropna()

In [332]:
train_1

Unnamed: 0_level_0,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Weekly_sales_lag,IsHoliday_False,...,week_44,week_45,week_46,week_47,week_48,week_49,week_50,week_51,week_52,Weekly_Sales
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2010-02-05,-0.953080,-1.702558,-0.419353,-0.175588,-0.085799,-0.274567,-0.380216,1.023768,0.906984,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,32229.38
2010-02-05,-0.953080,-1.702558,-0.419353,-0.175588,-0.085799,-0.274567,-0.380216,1.023768,0.066282,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5749.03
2010-02-05,-0.953080,-1.702558,-0.419353,-0.175588,-0.085799,-0.274567,-0.380216,1.023768,0.700028,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,21084.08
2010-02-05,-0.953080,-1.702558,-0.419353,-0.175588,-0.085799,-0.274567,-0.380216,1.023768,1.013896,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,40129.01
2010-02-05,-0.953080,-1.702558,-0.419353,-0.175588,-0.085799,-0.274567,-0.380216,1.023768,0.593042,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,16930.99
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2012-09-28,0.258408,1.429542,0.325591,-0.171579,-0.085531,0.131404,0.398139,0.535703,0.273228,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8787.85
2012-09-28,0.258408,1.429542,0.325591,-0.171579,-0.085531,0.131404,0.398139,0.535703,0.733156,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,22566.07
2012-09-28,0.258408,1.429542,0.325591,-0.171579,-0.085531,0.131404,0.398139,0.535703,0.534664,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,15020.88
2012-09-28,0.258408,1.429542,0.325591,-0.171579,-0.085531,0.131404,0.398139,0.535703,1.094820,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,47372.08


In [333]:
w1 = WindowGenerator(lookback=6, lookahead=1, batch_size=100, train_df=train_1, test_df=test_1, label_column=['Weekly_Sales'])

In [334]:
history = model.fit(w1.train, epochs=10)

Epoch 1/10


2022-12-15 19:47:33.820622: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2022-12-15 19:47:34.263221: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2022-12-15 19:47:34.461087: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [314]:
pred = model.predict(w1.train)

    1/12762 [..............................] - ETA: 1:41:54

2022-12-15 17:16:31.169802: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2022-12-15 17:16:31.270068: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.




In [318]:
np.sqrt(mean_squared_error(train_1['Weekly_Sales'][6:], pred.flatten()))

2.765172022365282

In [323]:
np.sqrt(model.evaluate(w1.train)[1])

0.7353736499240329

In [324]:
np.sqrt(model.evaluate(w1.test)[1])

0.7669559121447977

In [321]:
pred = model.predict(w1.test)



In [322]:
np.sqrt(mean_squared_error(test_1['Weekly_Sales'][6:], pred.flatten()))

1.0154105598483505

# Overall Result

| Model | Train RMSE | Test RMSE |
|------|-------------|-----------|
| Logistic Regression | 1.90 | 1.86 | 
| RandomForest Regressor | 0.76 | 1.76 |
| Neural Network | 0.73 | 0.76 |


# BASIC NN

In [337]:
X_train_trans.shape

(408365, 119)

In [390]:
model = tf.keras.Sequential([
    tf.keras.layers.Dense(100),
    tf.keras.layers.Dense(1000, activation='relu'),
    tf.keras.layers.Dense(2000, activation='relu'),
    tf.keras.layers.Dense(1000, activation='relu'),
    tf.keras.layers.Dense(500, activation='relu'),
    tf.keras.layers.Dense(250, activation='relu'),
    tf.keras.layers.Dense(10, activation='relu'),
    tf.keras.layers.Dense(1, activation='relu'),
])

model.compile(
    optimizer=tf.keras.optimizers.RMSprop(),
    loss=tf.keras.losses.MeanSquaredError(),
    metrics=[
        tf.keras.metrics.mean_squared_error,
        tf.keras.metrics.mean_absolute_error,
    ],
)

In [397]:
model.fit(tf.convert_to_tensor(np.array(X_train_trans).astype('float32')), tf.convert_to_tensor(np.array(y_train).astype('float32')), epoch =10)

TypeError: fit() got an unexpected keyword argument 'epoch'

array(<408365x117 sparse matrix of type '<class 'numpy.float64'>'
	with 7350570 stored elements in Compressed Sparse Row format>,
      dtype=object)

In [388]:
np.array(X_train_trans).astype('float32')

array([[-0.95308024, -1.7025583 , -0.4193528 , ...,  0.        ,
         0.        ,  0.        ],
       [-0.95308024, -1.7025583 , -0.4193528 , ...,  0.        ,
         0.        ,  0.        ],
       [-0.95308024, -1.7025583 , -0.4193528 , ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [ 0.2584079 ,  1.4295418 ,  0.3255911 , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.2584079 ,  1.4295418 ,  0.3255911 , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.2584079 ,  1.4295418 ,  0.3255911 , ...,  0.        ,
         0.        ,  0.        ]], dtype=float32)