# Data Preprocessing

In [None]:
import os
import numpy as np
import pandas as pd

# Kaggle input locations: the CSV tables sit in PATH_CSV, the camera frames in PATH_CAM.
PATH_CSV = '/kaggle/input/applications-of-deep-learning-wustlfall-2022/beach_demand_forecast/'
PATH_CAM = '/kaggle/input/applications-of-deep-learning-wustlfall-2022/beach_demand_forecast/cam/'

# Read the four tables in one pass, in the same order as the names on the left.
df_sales_train, df_sales_test, df_items, df_resturant = (
    pd.read_csv(os.path.join(PATH_CSV, csv_name))
    for csv_name in ("sales_train.csv", "sales_test.csv", "items.csv", "resturants.csv")
)

# Parse the date column of both sales tables; unparseable values become NaT.
for sales_frame in (df_sales_train, df_sales_test):
    sales_frame['date'] = pd.to_datetime(sales_frame['date'], errors='coerce')

In [None]:
# Stack train and test rows into one frame. No ignore_index is passed, so each
# input keeps its own row labels.
df_sales = pd.concat([df_sales_train, df_sales_test])
# Positional rename -- assumes the concatenated column order is
# date, item_id, price, item_count, <test id>. TODO confirm against the CSV headers.
df_sales.columns = ['date','item_id','price','sales','submit_id']
# Write the non-missing submit ids back as ints.
# NOTE(review): rows without a submit id stay NaN, so the column presumably
# remains a float column after this assignment -- verify before relying on int dtype.
df_sales.loc[~df_sales.submit_id.isna(),'submit_id'] = df_sales[~df_sales.submit_id.isna()].submit_id.astype(int)

In [None]:
df_sales

# Deseason and Detrend

Begin by producing a line graph of all sales over the provided 3-year timespan.

In [None]:
import plotly.express as px

# One row per calendar day holding the mean item_count across all items.
df_plot = df_sales_train.groupby('date', as_index=False)['item_count'].mean()
fig = px.line(df_plot, x="date", y="item_count", title='RAW Sales by Date')
fig.show()

## Detrending

Can you see an overall trend in this data? Beyond just the seasonality?

In [None]:
from scipy import signal

# Subtract the least-squares straight line (scipy's default linear detrend)
# from the daily mean so only the fluctuations around the trend remain.
df_plot.item_count = signal.detrend(df_plot.item_count)

# The series is no longer raw, so the chart title says so.
fig = px.line(df_plot, x="date", y="item_count", title='Detrended Sales by Date')
fig.show()

## De-Seasoning

Let's remove the seasonality.

In [None]:
from statsmodels.tsa.seasonal import seasonal_decompose
from matplotlib import pyplot

# Daily mean of item_count, indexed by date, as input for the decomposition.
df_plot = df_sales_train.groupby('date')[['item_count']].mean()

# First pass: multiplicative decomposition with a 7-observation period.
adjustment = seasonal_decompose(df_plot['item_count'], model='multiplicative', period=7)

adjustment.plot()
pyplot.show()

Notice how the yearly seasonality was detected as the trend? The weekly seasonality was detected as seasonal. Without zooming you cannot see the ups and downs of the individual days of the week.

In [None]:
adjustment.trend

In [None]:
adjustment.seasonal

In [None]:
# Interactive line chart of the trend component from the period-7 decomposition.
fig = px.line(adjustment.trend)
fig.show()

In [None]:
# Second pass: decompose the first-pass trend (NaN edges removed) with a
# 365-observation period.
weekly_trend = adjustment.trend.dropna()
adjustment2 = seasonal_decompose(weekly_trend, model='multiplicative', period=365)

adjustment2.plot()
pyplot.show()

In [None]:
# Interactive line chart of the seasonal component from the period-365 decomposition.
fig = px.line(adjustment2.seasonal)
fig.show()

In [None]:
# Interactive line chart of the trend component from the period-365 decomposition.
fig = px.line(adjustment2.trend)
fig.show()

In [None]:
from sklearn.linear_model import LinearRegression

# Fit a straight line through the non-missing part of the long-run trend.
# X is simply 0, 1, 2, ... over those observations.
y = adjustment2.trend.dropna().to_numpy()
X = np.arange(y.shape[0]).reshape(-1, 1)

reg = LinearRegression()
reg.fit(X, y)
reg.score(X, y)

In [None]:
reg.coef_, reg.intercept_

## Resulting Dataset
Let's see the dataset "flattened".

In [None]:
df_plot2 = df_plot.copy()

# Divide out the weekly factor, the yearly factor and the long-run trend in turn.
# Dates where any of the three components is NaN come out as NaN.
df_plot2.item_count = df_plot2.item_count / adjustment.seasonal / adjustment2.seasonal / adjustment2.trend

# The series has been adjusted, so it is no longer labelled as raw.
fig = px.line(df_plot2.reset_index(), x="date", y="item_count", title='Adjusted Sales by Date')
fig.show()

In [None]:
# Gather the three adjustment components side by side, aligned on their date
# index; dates missing from a component are left as NaN in that column.
df_adjustment = pd.DataFrame({
    'seasonal_week': adjustment.seasonal,
    'seasonal_year': adjustment2.seasonal,
    'trend': adjustment2.trend,
})

df_adjustment

In [None]:
# Column 1 is seasonal_year. Copy the value from 365 rows later into the first
# three rows, and the value from 365 rows earlier into the last three rows.
head_fill = df_adjustment.iloc[365:368, 1].to_numpy()
tail_fill = df_adjustment.iloc[-368:-365, 1].to_numpy()
df_adjustment.iloc[:3, 1] = head_fill
df_adjustment.iloc[-3:, 1] = tail_fill

# Show any rows whose yearly factor is still missing.
df_adjustment[df_adjustment['seasonal_year'].isna()]

In [None]:
# The trend regression was fitted with X = 0, 1, 2, ... over the non-NaN trend
# values, so X must be 0 on the first row that has a trend value and negative
# before it. Derive that offset from the data instead of hard-coding it.
trend_start = int(df_adjustment['trend'].notna().to_numpy().argmax())
df_adjustment['X'] = np.arange(-trend_start, -trend_start + df_adjustment.shape[0])

In [None]:
# Evaluate the fitted trend line on every row, including those where the
# decomposition produced no trend value.
df_adjustment['trend_pred'] = reg.predict(df_adjustment[['X']].to_numpy())

In [None]:
df_adjustment[df_adjustment['trend'].isna()]
#df_adjustment[~df_adjustment['trend'].isna()]

In [None]:
# Empty frame with one row per day from 2021-10-01 to 2021-12-31 (31 + 30 + 31 = 92 rows).
df_adjustment_forecast = pd.DataFrame(index=pd.date_range('2021-10-01','2021-12-31',freq='D'), )

# Weekly factor: the last 7 values of column 0 repeated 13 times (91 values),
# plus the first of those 7 once more, giving 92 values.
df_adjustment_forecast['seasonal_week'] = df_adjustment.iloc[-7:,0].to_list()*13 + df_adjustment.iloc[-7:-6,0].to_list()
# Yearly factor: the 92 values of column 1 that start 365 rows before the end of df_adjustment.
df_adjustment_forecast['seasonal_year'] = df_adjustment.iloc[-365:-365+92,1].to_list()
# No decomposed trend exists for the forecast rows.
df_adjustment_forecast['trend'] = np.nan
# NOTE(review): 819 is presumably the value following the last df_adjustment['X'] -- confirm.
df_adjustment_forecast['X'] = np.arange(819, 819+df_adjustment_forecast.shape[0])
# Extrapolate the fitted trend line over the forecast rows.
df_adjustment_forecast['trend_pred'] = reg.predict(df_adjustment_forecast['X'].values.reshape(-1, 1))

df_adjustment_forecast

In [None]:
# Append the forecast rows below the historical rows.
# NOTE: df_adjustment is rebound to the result, so running this cell twice
# appends the forecast rows twice.
df_adjustment = pd.concat([df_adjustment, df_adjustment_forecast])

We will save the adjustment table to a binary pickle form, so we can later reload it exactly as it is. We will make use of this table during day 2.

In [None]:
# Look up each sales row's adjustment factors by date (default inner join:
# rows whose date is not in the adjustment index are dropped).
adjust_cols = df_adjustment[['seasonal_week', 'seasonal_year', 'trend_pred']]
df_sales_adj = pd.merge(df_sales, adjust_cols, left_on='date', right_index=True)

# Adjusted sales: divide by the weekly factor, the yearly factor and the trend line in turn.
df_sales_adj['adjust'] = (
    df_sales_adj['sales']
    .div(df_sales_adj['seasonal_week'])
    .div(df_sales_adj['seasonal_year'])
    .div(df_sales_adj['trend_pred'])
)

df_sales_adj

# Extract Data from Street Images with YOLO

In [None]:
import sys

# Fetch the YOLOv5 sources and install their requirements into the kernel.
# NOTE(review): the `mv` below implies the clone lands in a directory named
# `6.2`, i.e. `6.2` appears to be taken as the target directory rather than as
# a tag to check out -- confirm which revision is actually installed.
# NOTE: re-running this cell repeats the clone and the `mv` against an
# already-existing /kaggle/working/yolov5.
!git clone https://github.com/ultralytics/yolov5 --tag 6.2  # clone
!mv /kaggle/working/6.2 /kaggle/working/yolov5
%pip install -qr /kaggle/working/yolov5/requirements.txt  # install
# Put the checkout first on the import path so `import utils` below resolves to it.
sys.path.insert(0,'/kaggle/working/yolov5/')

import torch
import utils
# NOTE(review): this rebinds the name `display` in the notebook namespace --
# confirm nothing later relies on IPython's own `display`.
display = utils.notebook_init()  # checks

In [None]:
from os import walk
import datetime
import tqdm

# Detector loaded through torch.hub.
yolo_model = torch.hub.load('ultralytics/yolov5', 'yolov5s')  # or yolov5n - yolov5x6, custom

# File names directly inside PATH_CAM (empty list when the walk yields nothing).
filenames = next(walk(PATH_CAM), (None, None, []))[2]

list_date = []
list_people_street = []
list_people_beach = []
x_cutoff = 800  # detections with xmin below this count as "street", the rest as "beach"

for file in tqdm.tqdm(filenames):
    if file != '1.jpg':
        filename = os.path.join(PATH_CAM, file)
        detections = yolo_model(filename).pandas().xyxy[0]
        is_person = detections.name == 'person'
        n_street = int((is_person & (detections.xmin < x_cutoff)).sum())
        n_beach = int((is_person & (detections.xmin >= x_cutoff)).sum())
        # The first ten characters of the file name encode the date as YYYY_MM_DD.
        list_date.append(datetime.datetime.strptime(file[:10], '%Y_%m_%d'))
        list_people_street.append(n_street)
        list_people_beach.append(n_beach)

df_street_view = pd.DataFrame({
    'date': list_date,
    'people_street': list_people_street,
    'people_beach': list_people_beach,
})
df_street_view

# Engineer Time Series Features

In [None]:
def series_to_supervised(data, window=1, lag=1, dropnan=True):
    """Turn a row-per-timestep frame into a row-per-sample supervised frame.

    For every row t the result holds, left to right, the columns of ``data``
    shifted to t-window ... t-1, the unshifted columns at t, and the columns
    shifted to t+lag. Column names follow ``name(t-k)``, ``name(t)`` and
    ``name(t+lag)``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose rows are consecutive timesteps.
    window : int
        Number of past timesteps to include.
    lag : int
        How many rows ahead the target timestep lies.
    dropnan : bool
        When true, rows containing any NaN are removed from the result.

    Returns
    -------
    pandas.DataFrame
    """
    base_names = list(data.columns)
    past_offsets = list(range(window, 0, -1))

    # History block, oldest first.
    frames = [data.shift(k) for k in past_offsets]
    labels = ['%s(t-%d)' % (col, k) for k in past_offsets for col in base_names]

    # Current timestep.
    frames.append(data)
    labels.extend('%s(t)' % col for col in base_names)

    # Target timestep.
    frames.append(data.shift(-lag))
    labels.extend('%s(t+%d)' % (col, lag) for col in base_names)

    framed = pd.concat(frames, axis=1)
    framed.columns = labels
    return framed.dropna() if dropnan else framed

def drop_columns(df, columns, n_window=None, lag=None):
    """Remove every timestep column of the given base features from ``df`` in place.

    For each name in ``columns`` this drops ``name(t+lag)``, ``name(t)`` and
    ``name(t-1)`` ... ``name(t-n_window)``. Columns that do not exist are ignored.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame produced by ``series_to_supervised``; it is modified in place.
    columns : list of str
        Base feature names (without the ``(t...)`` suffix).
    n_window : int, optional
        Number of past timesteps. Defaults to the notebook-level ``window``.
    lag : int, optional
        Offset of the target timestep. Defaults to the notebook-level ``future_span``.

    Returns
    -------
    None
    """
    # Fall back to the notebook globals only when the caller did not pass values,
    # so existing two-argument calls keep working unchanged.
    if n_window is None:
        n_window = window
    if lag is None:
        lag = future_span

    columns_to_drop = ['%s(t+%d)' % (col, lag) for col in columns]
    for i in range(n_window, 0, -1):
        columns_to_drop += ['%s(t-%d)' % (col, i) for col in columns]
    columns_to_drop += ['%s(t)' % col for col in columns]
    df.drop(columns_to_drop, axis=1, inplace=True, errors='ignore')

We will link the season and trend adjustments.

In [None]:
# Only the item id and its store are needed from the item table.
df_items2 = df_items.loc[:, ['id', 'store_id']]
# Inner join: every adjusted sales row picks up the store of its item.
df_train = pd.merge(df_sales_adj, df_items2, left_on='item_id', right_on='id')

df_train

In [None]:
# Merge people counts.
# The join key is spelled out instead of relying on "whatever columns happen to
# be shared". validate='many_to_one' raises if df_street_view has more than one
# row for a date, a case a length-only check can miss when duplicated and
# dropped rows cancel out.
temp = len(df_train)
df_train = df_train.merge(df_street_view, on='date', how='inner', validate='many_to_one')
assert len(df_train) == temp, (
    'people-count merge changed the row count from %d to %d; '
    'some sales dates have no matching camera row' % (temp, len(df_train))
)

df_train

In [None]:
# One row per (item, store, date): average every numeric column within the group.
mean_columns = ['adjust', 'sales', 'seasonal_week', 'seasonal_year',
                'trend_pred', 'people_street', 'people_beach', 'submit_id']
df_train = (
    df_train
    .sort_values('date')
    .groupby(['item_id', 'store_id', 'date'], as_index=False)[mean_columns]
    .mean()
)
# Shorter names: item_id -> item, store_id -> store, trend_pred -> trend.
df_train.columns = ['item', 'store', 'date', 'adjust', 'sales', 'seasonal_week', 'seasonal_year', 'trend', 'people_street', 'people_beach', 'submit_id']
df_train.head()

Let's also engineer two features. These give the neural network some information about which day of the week and which day of the year each row falls on. Ideally, they are not needed once seasonality has been removed; however, if some seasonality and trend remain, these two features may help the neural network overcome it.

In [None]:
# Calendar position of each row: day of week and day of year, both from pandas' .dt accessor.
df_train = df_train.assign(
    dow=df_train['date'].dt.dayofweek,
    doy=df_train['date'].dt.dayofyear,
)

df_train

Build the sequence data.

In [None]:
# Forecast horizon: days between the last training date and the last test date.
last_train_day = df_sales_train['date'].max().date()
last_test_day = df_sales_test['date'].max().date()
future_span = (last_test_day - last_train_day).days

print('Max date from train set: %s' % last_train_day)
print('Max date from test set: %s' % last_test_day)
print('Forecast lag size', future_span)

Remove sequences that did not have enough data.

In [None]:
window = 29
series = series_to_supervised(df_train.drop(columns='date'), window=window, lag=future_span, dropnan=False)

# A sample is only usable when its oldest timestep and its target timestep
# belong to the same store and the same item; rows shifted across a boundary
# (or off the end, where the shifted value is NaN) fail this test.
last_item = 'item(t-%d)' % window
last_store = 'store(t-%d)' % window

same_store = series['store(t+%d)' % future_span] == series[last_store]
same_item = series['item(t+%d)' % future_span] == series[last_item]
series = series[same_store & same_item]

series

We will predict with adjusted sales, and our engineered features.

In [None]:
labels_col = 'adjust(t+%d)' % future_span
submit_id_col = 'submit_id(t+%d)' % future_span

# Samples whose target timestep carries a submit id go to the submission set;
# all others are training samples.
has_no_submit_id = series[submit_id_col].isna()
series_train = series.loc[has_no_submit_id].copy(deep=True)
series_submit = series.loc[~has_no_submit_id].copy(deep=True)

print(series_train.shape, series_submit.shape)

In [None]:
# Names of the target-timestep columns kept aside for later un-adjusting.
unadjust_sales_col = 'sales(t+%d)' % future_span
seasonal_week_col = 'seasonal_week(t+%d)' % future_span
seasonal_year_col = 'seasonal_year(t+%d)' % future_span
trend_col = 'trend(t+%d)' % future_span

# Pull out the label and the held-back factors before the columns disappear.
labels = series_train[labels_col]
hold_sales = series_train[unadjust_sales_col]
hold_seasonal_week = series_train[seasonal_week_col]
hold_seasonal_year = series_train[seasonal_year_col]
hold_trend = series_train[trend_col]

# Remove every target-timestep column so no future information stays in the predictors.
future_columns = [labels_col] + [
    '%s(t+%d)' % (feature, future_span)
    for feature in ('item', 'store', 'dow', 'doy', 'people_street', 'people_beach', 'submit_id')
] + [unadjust_sales_col, seasonal_week_col, seasonal_year_col, trend_col]
series_train.drop(columns=future_columns, inplace=True)

series_train

In [None]:
# Every base feature present in series_train. For each input channel we drop
# all of them except the one channel we want to keep.
feature_names = ['item', 'store', 'dow', 'doy', 'adjust', 'submit_id', 'sales',
                 'seasonal_week', 'seasonal_year', 'trend', 'people_street', 'people_beach']

channel_values = {}
for kept in ('adjust', 'dow', 'doy', 'people_street', 'people_beach'):
    series2 = series_train.copy()
    drop_columns(series2, [name for name in feature_names if name != kept])
    channel_values[kept] = series2.values

sales_series = channel_values['adjust']                 # adjusted sales sequences
dow_series = channel_values['dow']                      # day of week as a number
doy_series = channel_values['doy']                      # day of year
people_street_series = channel_values['people_street']  # people counted on the street side
people_beach_series = channel_values['people_beach']    # people counted on the beach side

# Create x: add a trailing channel axis to each matrix and stack along it.
t1 = sales_series[..., np.newaxis]
t2 = dow_series[..., np.newaxis]
t3 = doy_series[..., np.newaxis]
t4 = people_street_series[..., np.newaxis]
t5 = people_beach_series[..., np.newaxis]
x1 = np.concatenate([t1, t2, t3, t4, t5], axis=2)

Double check that all input data is of the same shape.

In [None]:
# One shape per line, in channel order.
for channel in (t1, t2, t3, t4, t5):
    print(channel.shape)

# Vectorize Item Names with Glove Embeddings

In [None]:
!wget -c "https://nlp.stanford.edu/data/glove.6B.zip"
!unzip glove.6B.zip

from gensim.test.utils import datapath, get_tmpfile
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec

glove_file = 'glove.6B.300d.txt'
tmp_file = get_tmpfile("test_word2vec.txt")
_ = glove2word2vec(glove_file, tmp_file)
w2vec_model = KeyedVectors.load_word2vec_format(tmp_file)

In [None]:
def process_title(model, name):
    """Embed an item title as the sum of its known word vectors divided by its word count.

    The title is split on single spaces and lower-cased. Two menu spellings are
    mapped to vocabulary words first ('vegi' -> 'vegetable', 'smoothy' -> 'malt').
    Words missing from ``model`` add nothing to the sum but still count in the
    divisor, exactly as before.

    Parameters
    ----------
    model : mapping of str -> numpy.ndarray
        Supports ``word in model`` and ``model[word]``. It must also expose
        ``vector_size`` for the case where no word of the title is known.
    name : str
        Item title.

    Returns
    -------
    numpy.ndarray
        The title vector. When no word is in the vocabulary this is an
        all-zero vector of length ``model.vector_size`` (previously the
        function raised TypeError on ``None /= i``).

    Raises
    ------
    ValueError
        If no word is known and ``model`` has no ``vector_size`` attribute.
    """
    import numpy as np  # local import keeps this cell self-contained

    aliases = {'vegi': "vegetable", 'smoothy': "malt"}

    v = None
    n_words = 0
    for raw_word in name.split(' '):
        lowered = raw_word.lower()
        word = aliases.get(lowered, lowered)
        n_words += 1
        if word not in model:
            continue
        if v is None:
            # Copy so the in-place operations below never touch the model's own array.
            v = model[word].copy()
        else:
            v += model[word]

    if v is None:
        # Nothing matched: the "sum of known vectors" is the zero vector.
        size = getattr(model, 'vector_size', None)
        if size is None:
            raise ValueError('no word of %r is in the vocabulary and the model '
                             'does not expose vector_size' % (name,))
        return np.zeros(size, dtype=np.float32)

    v /= n_words
    return v

In [None]:
# Map every item id to the embedding of its title.
item_lookup = {
    item_id: process_title(w2vec_model, title)
    for item_id, title in zip(list(df_items.id), list(df_items.name))
}

In [None]:
# Create predictors (x): one title embedding per training sample, looked up by
# the item id found in the sample's t-1 column.
vec_size = w2vec_model['test'].shape[0]

lst = [item_lookup[item_id] for item_id in list(series_train['item(t-1)'])]

x2 = np.stack(lst).reshape((series_train.shape[0], vec_size))

x = [x1, x2]

In [None]:
print(x1.shape, x2.shape)

# Train the Network

Extract the predictors (x sequences) and the label (future prediction)

In [None]:
TEST_SIZE = 0.4   # fraction of samples that goes to the validation set
SPLIT_SEED = 42   # fixed seed so the split is the same on every run

# Boolean mask over the samples: True -> validation, False -> training.
# A seeded generator replaces the unseeded global RNG so the reported metrics
# can be reproduced after a kernel restart.
mask = np.random.default_rng(SPLIT_SEED).random(size=x[0].shape[0]) < TEST_SIZE

# Apply the same mask to every input array so the inputs stay row-aligned.
X_train = [subx[~mask] for subx in x]
X_valid = [subx[mask] for subx in x]

Y_train = labels.values[~mask]
Y_valid = labels.values[mask]

print('Train set shape x1:', X_train[0].shape)
print('Train set shape x2:', X_train[1].shape)
print('Validation set shape x1:', X_valid[0].shape)
print('Validation set shape x2:', X_valid[1].shape)

In [None]:
# Split the held-back adjustment factors with the same mask as the predictors.
# (The variable spellings are kept because later cells refer to them.)
Y_train_seasonal_week, Y_valid_seasnoal_week = (
    hold_seasonal_week.values[~mask], hold_seasonal_week.values[mask])

Y_train_seasonal_year, Y_valid_seasnoal_year = (
    hold_seasonal_year.values[~mask], hold_seasonal_year.values[mask])

Y_train_trend, Y_valid_trend = (
    hold_trend.values[~mask], hold_trend.values[mask])

Construct the neural network.

In [None]:
import tensorflow as tf
from keras.models import Sequential, Model
# Conv1D / MaxPooling1D are imported from keras.layers: the
# keras.layers.convolutional sub-module is not available in newer Keras releases.
from keras.layers import Conv1D, MaxPooling1D
from keras.layers import Dense, LSTM, RepeatVector, TimeDistributed, Flatten, Dropout, concatenate, Input
import keras

epochs = 500
batch = 256
lr = 0.0003
adam = tf.keras.optimizers.Adam(lr)

# Branch A: 1-D convolution over the (timesteps, channels) sequence input.
A1 = Input(shape=(X_train[0].shape[1], X_train[0].shape[2]), name='A1')
A2 = Conv1D(filters=64, kernel_size=8, activation='relu')(A1)
A3 = MaxPooling1D(pool_size=2)(A2)
A4 = Flatten()(A3)
A5 = Dense(50, activation='relu')(A4)
A6 = Dropout(0.2)(A5)

# Branch B: dense layer over the item-title embedding.
# The shape is passed as a one-element tuple rather than a bare int.
B1 = Input(shape=(X_train[1].shape[1],), name='B1')
B2 = Dense(16, activation='relu', name='B2')(B1)

# Merge both branches and regress a single value.
M1 = concatenate([A6, B2])
M2 = Dense(1, name='M2')(M1)

# The functional Model is the only model used; the earlier throw-away
# `model = Sequential()` assignment was dead code and has been removed.
model = Model(inputs=[A1, B1], outputs=[M2])
model.compile(loss='mse', optimizer=adam)
model.summary()

Fit the neural network.

In [None]:
from keras.callbacks import EarlyStopping

# Stop when validation loss has not improved by at least 1e-3 for 10 epochs,
# and roll back to the best weights seen.
monitor = EarlyStopping(monitor='val_loss', min_delta=1e-3, patience=10,
        verbose=1, mode='auto', restore_best_weights=True)

# batch_size is passed explicitly: the `batch` hyperparameter was defined with
# the other settings but never handed to fit(), so Keras used its default.
cnn_history = model.fit(X_train, Y_train, callbacks=[monitor],
    validation_data=(X_valid, Y_valid), epochs=epochs, batch_size=batch, verbose=2)

In [None]:
import matplotlib.pyplot as plt

# Training and validation loss per epoch on a single set of axes.
fig, ax = plt.subplots()
ax.plot(cnn_history.history['loss'], label='Train loss')
ax.plot(cnn_history.history['val_loss'], label='Validation loss')
ax.set_xlabel("Epochs")
ax.set_ylabel("MSE")
fig.legend()
fig.suptitle('CNN')

plt.show()

In [None]:
from sklearn.metrics import mean_squared_error 

# Error in adjusted-sales units on both splits.
cnn_train_pred = model.predict(X_train)
cnn_valid_pred = model.predict(X_valid)

train_mse = mean_squared_error(Y_train, cnn_train_pred)
valid_mse = mean_squared_error(Y_valid, cnn_valid_pred)
print('Train rmse:', np.sqrt(train_mse))
print('Validation rmse:', np.sqrt(valid_mse))

In [None]:
# Actual (un-adjusted) sales at the target timestep for both splits.
Y_train_actual = hold_sales.values[~mask]
Y_valid_actual = hold_sales.values[mask]

# Undo the adjustment: multiply the predictions by the weekly factor, the
# yearly factor and the trend of the target day.
cnn_train_pred2 = cnn_train_pred.ravel() * Y_train_seasonal_week * Y_train_seasonal_year * Y_train_trend
cnn_valid_pred2 = cnn_valid_pred.ravel() * Y_valid_seasnoal_week * Y_valid_seasnoal_year * Y_valid_trend

train_mse_actual = mean_squared_error(Y_train_actual, cnn_train_pred2)
valid_mse_actual = mean_squared_error(Y_valid_actual, cnn_valid_pred2)
print('Train rmse:', np.sqrt(train_mse_actual))
print('Validation rmse:', np.sqrt(valid_mse_actual))

# Build a Submission File

In [None]:
# Keep the submission ids and the target-day adjustment factors before the
# target-timestep columns are removed.
submit_id = series_submit[submit_id_col].astype(int)
hold_seasonal_week_submit = series_submit[seasonal_week_col]
hold_seasonal_year_submit = series_submit[seasonal_year_col]
hold_trend_submit = series_submit[trend_col]

# Remove every target-timestep column, mirroring what was done for the training frame.
future_columns_submit = [labels_col] + [
    '%s(t+%d)' % (feature, future_span)
    for feature in ('item', 'store', 'dow', 'doy', 'people_street', 'people_beach', 'submit_id')
] + [unadjust_sales_col, seasonal_week_col, seasonal_year_col, trend_col]
series_submit.drop(columns=future_columns_submit, inplace=True)

series_submit

In [None]:
# Every base feature present in series_submit. For each input channel we drop
# all of them except the one channel we want to keep.
feature_names = ['item', 'store', 'dow', 'doy', 'adjust', 'submit_id', 'sales',
                 'seasonal_week', 'seasonal_year', 'trend', 'people_street', 'people_beach']

channel_values = {}
for kept in ('adjust', 'dow', 'doy', 'people_street', 'people_beach'):
    series2 = series_submit.copy()
    drop_columns(series2, [name for name in feature_names if name != kept])
    channel_values[kept] = series2.values

sales_series = channel_values['adjust']                 # adjusted sales sequences
dow_series = channel_values['dow']                      # day of week as a number
doy_series = channel_values['doy']                      # day of year
people_street_series = channel_values['people_street']  # people counted on the street side
people_beach_series = channel_values['people_beach']    # people counted on the beach side

# Create x: add a trailing channel axis to each matrix and stack along it.
t1 = sales_series[..., np.newaxis]
t2 = dow_series[..., np.newaxis]
t3 = doy_series[..., np.newaxis]
t4 = people_street_series[..., np.newaxis]
t5 = people_beach_series[..., np.newaxis]
x1 = np.concatenate([t1, t2, t3, t4, t5], axis=2)

In [None]:
# One shape per line, in channel order.
for channel in (t1, t2, t3, t4, t5):
    print(channel.shape)

In [None]:
# Create predictors (x): one title embedding per submission sample, looked up
# by the item id found in the sample's t-1 column.
vec_size = w2vec_model['test'].shape[0]

lst = [item_lookup[item_id] for item_id in list(series_submit['item(t-1)'])]

x2 = np.stack(lst).reshape((series_submit.shape[0], vec_size))

x_submit = [x1, x2]

In [None]:
submit_pred = model.predict(x_submit)

In [None]:
# Undo the adjustment: multiply the flattened predictions by the weekly factor,
# the yearly factor and the trend of each target day.
# NOTE: the result is stored back into submit_pred, so re-running this cell
# without re-running the predict cell applies the factors a second time.
submit_pred = submit_pred.flatten() * hold_seasonal_week_submit.values * hold_seasonal_year_submit.values * hold_trend_submit.values

In [None]:
# Two-column submission frame: the submit id and the predicted item count.
df_submit = pd.DataFrame({
    'id': submit_id.to_list(),
    'item_count': submit_pred,
})

In [None]:
# Negative counts are impossible, so floor them at zero.
# A single .loc assignment replaces the chained form `df.col[mask] = 0`, which
# raises SettingWithCopyWarning and does not modify the frame under pandas
# copy-on-write.
df_submit.loc[df_submit['item_count'] < 0, 'item_count'] = 0

In [None]:
df_submit.to_csv('submission.csv',index=False)