# Importing libraries and cleaned dataset

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from prophet import Prophet

from statsmodels.tools.eval_measures import rmse
from sklearn.metrics import mean_absolute_error

from functions import prophet_model, df_prophet_prep, outlier_scaling, metrics_calc_clusters,prophet_model_with_clusters,prophet_model_per_product

# to set logging level to ERROR
# that is because in the model forecasts I was getting updates on time started/ completed and was visually confusing
import logging

Importing plotly failed. Interactive plots will not work.


In [2]:
# Raise cmdstanpy's log level to ERROR: otherwise every model forecast logs
# "started"/"completed" timing updates, which clutter the cell outputs.
logging.getLogger('cmdstanpy').setLevel(logging.ERROR)

In [3]:
# Disable pandas' chained-assignment check (the SettingWithCopyWarning).
# NOTE(review): this is a notebook-wide setting, so genuine chained-assignment
# mistakes will not be reported either.
pd.options.mode.chained_assignment = None 

**Original Dataset**:

Chen,Daqing. (2015). Online Retail. UCI Machine Learning Repository. https://archive.ics.uci.edu/dataset/352/online+retail

In [4]:
# Load the cleaned, filtered dataset (with cluster labels) produced by the notebook 03_ML_clustering.
# Filters already applied: only actual products, only UK (90% of the orders), rows with NaN CustomerID dropped.
# Later on, the NaN CustomerID rows could be recovered by training a ML model that predicts their
# Label from the characteristics of the order (from InvoiceNo).
file_path = '../data/data_filtered_with_clusters.csv'

# Explicit dtype per column; InvoiceDate is parsed as a datetime by read_csv itself
column_dtypes = {
    'InvoiceNo': 'O',
    'StockCode': 'O',
    'Description': 'O',
    'Quantity': 'int64',
    'UnitPrice': 'float64',
    'CustomerID': 'Int64',
    'Country': 'O',
}

df = pd.read_csv(file_path, dtype=column_dtypes, parse_dates=['InvoiceDate'])

# Feature Engineering

### Scaling outliers

From the EDA in the previous notebooks I know that there are **two outliers** whose scale differs hugely from the rest of the data. I am going to scale them to a range, capping both at the next-highest value (the highest value after these two).

In [5]:
# Scale the outliers to a range with the custom helper (functions.py),
# updating Quantity and, accordingly, TotalPrice.
# NOTE(review): the helper's exact behavior is not visible here — confirm in functions.py.
df = outlier_scaling(df)

### Date column

Adding new column for the date without time.

In [6]:
# Date-only copy of InvoiceDate: normalize() resets the time part to midnight
df['InvoiceDate_noTime'] = df['InvoiceDate'].dt.normalize()

### Removing features that will not be used

I will be using StockCode, Quantity, Date and Label for my model. 

In [7]:
# Inspect the available columns before dropping the ones the model does not need
df.columns

Index(['InvoiceNo', 'StockCode', 'Description', 'Quantity', 'InvoiceDate',
       'UnitPrice', 'CustomerID', 'Country', 'TotalPrice', 'Label',
       'InvoiceDate_noTime'],
      dtype='object')

In [8]:
# Keep only the columns the model uses: StockCode, Quantity, Label and the date.
# The guard makes this cell safe to re-run: once 'InvoiceDate_noTime' has been renamed
# to 'InvoiceDate', running the drop a second time would raise a KeyError for the
# columns that are already gone.
unused_columns = ['InvoiceNo', 'Description', 'InvoiceDate', 'UnitPrice',
                  'CustomerID', 'Country', 'TotalPrice']
if 'InvoiceDate_noTime' in df.columns:
    # Drop first (removes the timestamped InvoiceDate), then give the date-only column its name
    df = (df.drop(columns=unused_columns)
            .rename(columns={'InvoiceDate_noTime': 'InvoiceDate'}))
df

Unnamed: 0,StockCode,Quantity,Label,InvoiceDate
0,85123A,6,2,2010-12-01
1,71053,6,2,2010-12-01
2,84406B,8,2,2010-12-01
3,84029G,6,2,2010-12-01
4,84029E,6,2,2010-12-01
...,...,...,...,...
348882,22466,12,1,2011-12-09
348883,22061,8,2,2011-12-09
348884,23275,24,2,2011-12-09
348885,21217,24,2,2011-12-09


# Top selling products

In [9]:
# StockCodes of the 5 best-selling products, ranked by total quantity sold
quantity_per_product = df.groupby('StockCode')['Quantity'].sum()
ranked_products = quantity_per_product.sort_values(ascending=False)
products_top = ranked_products.head(5).index.tolist()
products_top

['84077', '22197', '85099B', '85123A', '84879']

# Preparing dataframes for model for Top 1 product

In [10]:
# Creating the dataframe for the Prophet model with the custom function (in functions.py)

# Dataframe for the top product 1, without cluster separation and with no outlier clipping
df_TopPr0 = df_prophet_prep(df, products_top[0], False)

In [11]:
# Dataframe for the top product 1, without cluster separation and with outlier clipping
df_TopPr0_clip = df_prophet_prep(df, products_top[0], True)

Instances that needed outlier clipping:  10 , out of total instances:  424


In [12]:
# NOTE: the code in this notebook is meant to work with exactly 4 clusters (labels 0-3)!

In [13]:
# Check which cluster labels are present in the data
df.Label.unique()

array([2, 1, 0, 3])

In [14]:
# One dataframe per cluster label (0-3), each with a fresh 0-based index
df_cl0, df_cl1, df_cl2, df_cl3 = (
    df[df['Label'] == cluster_label].reset_index(drop=True)
    for cluster_label in (0, 1, 2, 3)
)

In [15]:
# Prophet-ready dataframes for the top product 1, one per cluster, with no outlier clipping.
# The Label column is dropped before each per-cluster frame is passed to the helper.
df_TopPr0_cl0, df_TopPr0_cl1, df_TopPr0_cl2, df_TopPr0_cl3 = (
    df_prophet_prep(cluster_df.drop(columns='Label'), products_top[0], False)
    for cluster_df in (df_cl0, df_cl1, df_cl2, df_cl3)
)

In [16]:
# Prophet-ready dataframes for the top product 1, one per cluster, with outlier clipping.
# The Label column is dropped before each per-cluster frame is passed to the helper.
df_TopPr0_cl0_clip, df_TopPr0_cl1_clip, df_TopPr0_cl2_clip, df_TopPr0_cl3_clip = (
    df_prophet_prep(cluster_df.drop(columns='Label'), products_top[0], True)
    for cluster_df in (df_cl0, df_cl1, df_cl2, df_cl3)
)

Instances that needed outlier clipping:  0 , out of total instances:  59
Instances that needed outlier clipping:  7 , out of total instances:  226
Instances that needed outlier clipping:  0 , out of total instances:  102
Instances that needed outlier clipping:  1 , out of total instances:  37


# Prophet model for the Top 1 product

Helpful resource: https://facebook.github.io/prophet/docs/quick_start.html

Regarding the train/test split:
- my dataset has 375 days
- I will use the last part of the dataset as my test set, and I need to define the number of days that it will contain
- with an 80/20 split the test set would be 75 days, and with a 90/10 split 37.5 days
- since I have a bit more than 12 months of data, I can consider the first 10 months as my train set, and the last 2 months as the test set.

That means that my test set will have the last 60 days, and thus my train set the first 315 days.

### **Top 1 product, no clusters, no outlier clipping**

In [29]:
# Calling the custom function for the Prophet model (in functions.py); the second argument
# is the number of days used for the test set.
# Model for the top product 1, without cluster separation and with no outlier clipping.
# The three returned values are not needed here, so they are discarded.
_,_,_ = prophet_model(df_TopPr0, 60)

Root Mean Squared Error:  609.374873418084
Mean Absolute Error:  155.52807608543804


### **Top 1 product, no clusters, with outlier clipping**

In [28]:
# Model for the top product 1, without cluster separation and with outlier clipping
# (60-day test set; the three returned values are discarded).
_,_,_ = prophet_model(df_TopPr0_clip, 60)

Root Mean Squared Error:  167.21273931378624
Mean Absolute Error:  90.99648381504903


### Top 1 product, no clusters, with outlier clipping. Metrics vs no clipping

### **Top 1 product, with clusters, no outlier clipping**

In [27]:
# Model for the top product 1 with the per-cluster dataframes (no outlier clipping), 60-day test set.
# Prints RMSE/MAE per cluster followed by the collective metrics.
prophet_model_with_clusters(df_TopPr0_cl0, df_TopPr0_cl1, df_TopPr0_cl2, df_TopPr0_cl3, 60, df, products_top[0])

Cluster 0:
Root Mean Squared Error:  32.339920448940546
Mean Absolute Error:  22.419755612975404
Cluster 1:
Root Mean Squared Error:  61.840780154655825
Mean Absolute Error:  45.632093718207095
Cluster 2:
Root Mean Squared Error:  75.54338256092078
Mean Absolute Error:  45.35105910987186
Cluster 3:
Root Mean Squared Error:  610.0816727048411
Mean Absolute Error:  117.14219383109076

Collective metrics for this model:
RMSE:  108.89872062021425 , MAE:  48.58071427641995


### **Top 1 product, with clusters, with outlier clipping**

In [20]:
# Model for the top product 1 with the per-cluster dataframes (with outlier clipping), 60-day test set.
# Prints RMSE/MAE per cluster followed by the collective metrics.
prophet_model_with_clusters(df_TopPr0_cl0_clip, df_TopPr0_cl1_clip, df_TopPr0_cl2_clip, df_TopPr0_cl3_clip, 60, df, products_top[0])

Cluster 0:
Root Mean Squared Error:  32.339920448940546
Mean Absolute Error:  22.419755612975404
Cluster 1:
Root Mean Squared Error:  61.923247934127176
Mean Absolute Error:  45.67455194566671
Cluster 2:
Root Mean Squared Error:  75.54338256092078
Mean Absolute Error:  45.35105910987186
Cluster 3:
Root Mean Squared Error:  462.98775115247184
Mean Absolute Error:  98.0755271644241

Collective metrics for this model:
RMSE:  96.10137659515064 , MAE:  46.93882451165592


# Prophet model for the rest of the top products

*Note: Due to time constraints, this function duplicates code from the main file to test the Prophet forecasting model on products other than the top 1 product. It separates the dataframe per selected product and applies the Prophet forecasting model to each product separately. This function is a temporary solution until individual functions are built to handle forecasting for multiple products.*

In [21]:
# Run the Prophet model comparisons for the 2nd best-selling product (custom helper in functions.py).
# NOTE(review): the last argument looks like the product's 0-based position in products_top,
# shown 1-based in the printed headings ("Top 2 product") — confirm against the helper.
prophet_model_per_product(df, products_top[1],1)

Instances that needed outlier clipping:  9 , out of total instances:  965
Instances that needed outlier clipping:  2 , out of total instances:  149
Instances that needed outlier clipping:  23 , out of total instances:  406
Instances that needed outlier clipping:  10 , out of total instances:  304
Instances that needed outlier clipping:  3 , out of total instances:  106
[1m

Top 2 product, no clusters, no outlier clipping:[0m
Root Mean Squared Error:  487.9998190340996
Mean Absolute Error:  235.3251091369121
[1m

Top 2 product, no clusters, with outlier clipping:[0m
Root Mean Squared Error:  285.32654304566415
Mean Absolute Error:  173.02732331606455
[1m

Top 2 product, with clusters, no outlier clipping:[0m
Cluster 0:
Root Mean Squared Error:  16.121812298498224
Mean Absolute Error:  7.7056686966494405
Cluster 1:
Root Mean Squared Error:  70.16932161128828
Mean Absolute Error:  44.15727597874067
Cluster 2:
Root Mean Squared Error:  126.43741031447001
Mean Absolute Error:  70.9576

In [22]:
# Run the Prophet model comparisons for the 3rd best-selling product (custom helper in functions.py).
# NOTE(review): the last argument looks like the product's 0-based position in products_top — confirm.
prophet_model_per_product(df, products_top[2],2)

Instances that needed outlier clipping:  20 , out of total instances:  1461
Instances that needed outlier clipping:  1 , out of total instances:  157
Instances that needed outlier clipping:  17 , out of total instances:  631
Instances that needed outlier clipping:  8 , out of total instances:  482
Instances that needed outlier clipping:  3 , out of total instances:  191
[1m

Top 3 product, no clusters, no outlier clipping:[0m
Root Mean Squared Error:  181.41837789567077
Mean Absolute Error:  126.06562068000478
[1m

Top 3 product, no clusters, with outlier clipping:[0m
Root Mean Squared Error:  124.988446705912
Mean Absolute Error:  96.54273938424808
[1m

Top 3 product, with clusters, no outlier clipping:[0m
Cluster 0:
Root Mean Squared Error:  9.155621730974186
Mean Absolute Error:  6.2100098347229205
Cluster 1:
Root Mean Squared Error:  42.16108440263319
Mean Absolute Error:  23.692171585686424
Cluster 2:
Root Mean Squared Error:  35.8415078870486
Mean Absolute Error:  31.937929

In [23]:
# Run the Prophet model comparisons for the 4th best-selling product (custom helper in functions.py).
# NOTE(review): the last argument looks like the product's 0-based position in products_top — confirm.
prophet_model_per_product(df, products_top[3],3)

Instances that needed outlier clipping:  12 , out of total instances:  1936
Instances that needed outlier clipping:  14 , out of total instances:  416
Instances that needed outlier clipping:  11 , out of total instances:  934
Instances that needed outlier clipping:  12 , out of total instances:  433
Instances that needed outlier clipping:  4 , out of total instances:  153
[1m

Top 4 product, no clusters, no outlier clipping:[0m
Root Mean Squared Error:  177.4483101503898
Mean Absolute Error:  91.35886257431659
[1m

Top 4 product, no clusters, with outlier clipping:[0m
Root Mean Squared Error:  93.73710940563642
Mean Absolute Error:  63.55204746829087
[1m

Top 4 product, with clusters, no outlier clipping:[0m
Cluster 0:
Root Mean Squared Error:  11.871262345188448
Mean Absolute Error:  8.793964086990876
Cluster 1:
Root Mean Squared Error:  27.300892542806604
Mean Absolute Error:  19.70230110461904
Cluster 2:
Root Mean Squared Error:  28.08880091999307
Mean Absolute Error:  19.8684

In [24]:
# Run the Prophet model comparisons for the 5th best-selling product (custom helper in functions.py).
# NOTE(review): the last argument looks like the product's 0-based position in products_top — confirm.
prophet_model_per_product(df, products_top[4],4)

Instances that needed outlier clipping:  4 , out of total instances:  1320
Instances that needed outlier clipping:  6 , out of total instances:  296
Instances that needed outlier clipping:  12 , out of total instances:  668
Instances that needed outlier clipping:  19 , out of total instances:  311
Instances that needed outlier clipping:  1 , out of total instances:  45
[1m

Top 5 product, no clusters, no outlier clipping:[0m
Root Mean Squared Error:  189.1148387207721
Mean Absolute Error:  94.80885722779415
[1m

Top 5 product, no clusters, with outlier clipping:[0m
Root Mean Squared Error:  113.79630605469539
Mean Absolute Error:  71.3409527871183
[1m

Top 5 product, with clusters, no outlier clipping:[0m
Cluster 0:
Root Mean Squared Error:  19.620806048899926
Mean Absolute Error:  13.193365962710233
Cluster 1:
Root Mean Squared Error:  47.47718375779328
Mean Absolute Error:  29.614264783495226
Cluster 2:
Root Mean Squared Error:  61.10033823967382
Mean Absolute Error:  36.358099

### Top 1 product, no clusters, with outlier clipping. Metrics vs no clipping