# Predictive Modelling using Machine Learning Algorithms

In [23]:
import pandas as pd
import numpy as np
import duckdb as db
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
import optuna
import warnings

warnings.filterwarnings('ignore')

from IPython.display import display, Markdown
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.preprocessing import OneHotEncoder, StandardScaler
# pd.set_option('plotting.backend', 'plotly')
# pio.renderers.default = "notebook_connected"
optuna.logging.set_verbosity(optuna.logging.WARNING)

In [2]:
orig = pd.read_parquet("../../data/processed/train_enhanced.parquet")
train_agg = pd.read_parquet("../../data/processed/train_agg.parquet")
train_region_code_agg = pd.read_parquet("../../data/processed/train_region_code_agg.parquet")
holiday_df= pd.read_csv("../../data/processed/holidays.csv")

In [3]:
orig

Unnamed: 0,ID,Store_id,Store_Type,Location_Type,Region_Code,Date,Holiday,Discount,Orders,Sales,Day_of_Week_Name,Day,Day_of_Week,Month_Name,Month,Year,Quarter,Week,Week_of_Month,Is_Weekend
0,T1000001,1,S1,L3,R1,2018-01-01,1,1,9,7011.84,Monday,1,0,January,1,2018,1,1,1,0
1,T1000002,253,S4,L2,R1,2018-01-01,1,1,60,51789.12,Monday,1,0,January,1,2018,1,1,1,0
2,T1000003,252,S3,L2,R1,2018-01-01,1,1,42,36868.20,Monday,1,0,January,1,2018,1,1,1,0
3,T1000004,251,S2,L3,R1,2018-01-01,1,1,23,19715.16,Monday,1,0,January,1,2018,1,1,1,0
4,T1000005,250,S2,L3,R4,2018-01-01,1,1,62,45614.52,Monday,1,0,January,1,2018,1,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
188335,T1188336,149,S2,L3,R2,2019-05-31,1,1,51,37272.00,Friday,31,4,May,5,2019,2,22,5,0
188336,T1188337,153,S4,L2,R1,2019-05-31,1,0,90,54572.64,Friday,31,4,May,5,2019,2,22,5,0
188337,T1188338,154,S1,L3,R2,2019-05-31,1,0,56,31624.56,Friday,31,4,May,5,2019,2,22,5,0
188338,T1188339,155,S3,L1,R2,2019-05-31,1,1,70,49162.41,Friday,31,4,May,5,2019,2,22,5,0


In [7]:
orig.columns

Index(['ID', 'Store_id', 'Store_Type', 'Location_Type', 'Region_Code', 'Date',
       'Holiday', 'Discount', 'Orders', 'Sales', 'Day_of_Week_Name', 'Day',
       'Day_of_Week', 'Month_Name', 'Month', 'Year', 'Quarter', 'Week',
       'Week_of_Month', 'Is_Weekend', 'log_Sales', 'log_Orders', 'sqrt_Sales',
       'sqrt_Orders'],
      dtype='object')

In [11]:
columns = ['Store_id', 'Store_Type', 'Location_Type', 'Region_Code',
       'Holiday', 'Discount',  'Sales', 'Day', 'Date',
       'Day_of_Week', 'Month', 'Quarter', 'Week',
       'Week_of_Month', 'Is_Weekend']

In [17]:
df=orig[columns]

In [18]:
df.sort_values(by=['Store_id'])

Unnamed: 0,Store_id,Store_Type,Location_Type,Region_Code,Holiday,Discount,Sales,Day,Date,Day_of_Week,Month,Quarter,Week,Week_of_Month,Is_Weekend
0,1,S1,L3,R1,1,1,7011.84,1,2018-01-01,0,1,1,1,1,0
129193,1,S1,L3,R1,0,1,44025.00,20,2018-12-20,3,12,4,51,3,0
129250,1,S1,L3,R1,0,1,55155.00,21,2018-12-21,4,12,4,51,3,0
168279,1,S1,L3,R1,0,1,36531.00,7,2019-04-07,6,4,2,14,1,0
57988,1,S1,L3,R1,0,0,28788.00,8,2018-06-08,4,6,2,23,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5889,365,S2,L1,R2,0,1,41844.00,17,2018-01-17,2,1,1,3,3,0
119151,365,S2,L1,R2,1,1,27097.92,23,2018-11-23,4,11,4,47,4,0
153328,365,S2,L1,R2,0,0,19836.00,25,2019-02-25,0,2,1,9,4,0
163522,365,S2,L1,R2,0,1,42726.00,25,2019-03-25,0,3,1,13,4,0


In [24]:
df=db.sql("""
     select *,
        lag(Sales, 1) over (partition by Store_id order by Date) as Sales_Lag_1,
        lag(Sales, 7) over (partition by Store_id order by Date) as Sales_Lag_7,
        lag(Sales, 12) over (partition by Store_id order by Date) as Sales_Lag_12,
        lag(Sales, 30) over (partition by Store_id order by Date) as Sales_Lag_30,
        avg(Sales) over (partition by Store_id order by Date rows between 7 preceding and 1 preceding) as Sales_Mean_7,
        avg(Sales) over (partition by Store_id order by Date rows between 12 preceding and 1 preceding) as Sales_Mean_12,
        avg(Sales) over (partition by Store_id order by Date rows between 30 preceding and 1 preceding) as Sales_Mean_30,
        from df
        order by Store_id, Date
    """).df().dropna()

# # Create lag features
# df['Sales_Lag_1'] = df.groupby('Store_id')['Sales'].shift(1)
# df['Sales_Lag_7'] = df.groupby('Store_id')['Sales'].shift(7)
# df['Sales_Lag_12'] = df.groupby('Store_id')['Sales'].shift(12)
# df['Sales_Lag_30'] = df.groupby('Store_id')['Sales'].shift(30)

# # Create moving average features
# df['Sales_Mean_7'] = df.groupby('Store_id')['Sales'].shift(1).rolling(window=7).mean()
# df['Sales_Mean_12'] = df.groupby('Store_id')['Sales'].shift(1).rolling(window=12).mean()
# df['Sales_Mean_30'] = df.groupby('Store_id')['Sales'].shift(1).rolling(window=30).mean()

# # Sort the DataFrame by Store_id and Date
# df = df.sort_values(by=['Store_id', 'Date'])

In [26]:
ohe = OneHotEncoder(categories='auto')
feature_array = ohe.fit_transform(df[['Store_Type', 'Location_Type', 'Region_Code']]).toarray()

In [27]:
df = pd.concat([df, pd.DataFrame(feature_array, columns=ohe.get_feature_names_out())], axis=1).drop(['Store_Type', 'Location_Type', 'Region_Code'], axis=1)
df.head()

Unnamed: 0,Store_id,Holiday,Discount,Sales,Day,Date,Day_of_Week,Month,Quarter,Week,...,Store_Type_S4,Location_Type_L1,Location_Type_L2,Location_Type_L3,Location_Type_L4,Location_Type_L5,Region_Code_R1,Region_Code_R2,Region_Code_R3,Region_Code_R4
30,1.0,1.0,0.0,15812.94,31.0,2018-01-31,2.0,1.0,1.0,5.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
31,1.0,0.0,0.0,27306.0,1.0,2018-02-01,3.0,2.0,1.0,5.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
32,1.0,0.0,0.0,34035.0,2.0,2018-02-02,4.0,2.0,1.0,5.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
33,1.0,0.0,0.0,32616.0,3.0,2018-02-03,5.0,2.0,1.0,5.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
34,1.0,0.0,0.0,31974.0,4.0,2018-02-04,6.0,2.0,1.0,5.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0


In [28]:
# target encoding for Store_id using global mean
store_id_mean = df.groupby('Store_id')['Sales'].mean()
df['Store_id'] = df['Store_id'].map(store_id_mean)
# add smoothed target encoding for Store_id
df['Store_id_smooth'] = df['Store_id'].expanding().mean()
df.head()

Unnamed: 0,Store_id,Holiday,Discount,Sales,Day,Date,Day_of_Week,Month,Quarter,Week,...,Location_Type_L1,Location_Type_L2,Location_Type_L3,Location_Type_L4,Location_Type_L5,Region_Code_R1,Region_Code_R2,Region_Code_R3,Region_Code_R4,Store_id_smooth
30,34515.640679,1.0,0.0,15812.94,31.0,2018-01-31,2.0,1.0,1.0,5.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,34515.640679
31,34515.640679,0.0,0.0,27306.0,1.0,2018-02-01,3.0,2.0,1.0,5.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,34515.640679
32,34515.640679,0.0,0.0,34035.0,2.0,2018-02-02,4.0,2.0,1.0,5.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,34515.640679
33,34515.640679,0.0,0.0,32616.0,3.0,2018-02-03,5.0,2.0,1.0,5.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,34515.640679
34,34515.640679,0.0,0.0,31974.0,4.0,2018-02-04,6.0,2.0,1.0,5.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,34515.640679
