# Importing libraries and cleaned dataset

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from prophet import Prophet

from statsmodels.tools.eval_measures import rmse
from sklearn.metrics import mean_absolute_error

from functions import prophet_model, df_prophet_prep, outlier_scaling

Importing plotly failed. Interactive plots will not work.


In [2]:
# Disabling a specific type of warning from Jupyter, that I know if mistaken for something else
pd.options.mode.chained_assignment = None 

**Original Dataset**:

Chen,Daqing. (2015). Online Retail. UCI Machine Learning Repository. https://archive.ics.uci.edu/dataset/352/online+retail

In [3]:
# Importing dataset, clean and filtered with clusters from the notebook 03_ML_clustering
# the filters on are: only actual products, only UK (90% of the orders), dropping NaN customerID
# At a later stage, I can solve the NaN in CustomerID by building a ML model to predict their Label based on the order(from InvoiceNo) characteristics
file_path = '../data/data_filtered_with_clusters.csv'

df = pd.read_csv(file_path, dtype={'InvoiceNo': 'O','StockCode': 'O','Description': 'O',
                                   'Quantity': 'int64','UnitPrice': 'float64',
                                   'CustomerID': 'Int64','Country': 'O'}, parse_dates=['InvoiceDate'])

# Feature Engineering

### Scaling outliers

I know from the EDA, shown in the previous notebooks, **two outliers** with huge difference in scale. I am going to do scaling to a range, scaling them to highest next value (after these two).

In [4]:
# Scaling within a range, updating the Quantity and accordingly the TotalPrice
df = outlier_scaling(df)

### Date column

Adding new column for the date without time.

In [5]:
# New date column without time
df['InvoiceDate_noTime'] = df.InvoiceDate.dt.normalize()

### Removing features that will not be used

I will be using StockCode, Quantity, Date and Labelfor my model. 

In [6]:
df.columns

Index(['InvoiceNo', 'StockCode', 'Description', 'Quantity', 'InvoiceDate',
       'UnitPrice', 'CustomerID', 'Country', 'TotalPrice', 'Label',
       'InvoiceDate_noTime'],
      dtype='object')

In [7]:
df.drop(columns=['InvoiceNo','Description', 'InvoiceDate','UnitPrice', 
                 'CustomerID', 'Country', 'TotalPrice'],inplace=True)
df.rename(columns={'InvoiceDate_noTime':'InvoiceDate'}, inplace=True)
df

Unnamed: 0,StockCode,Quantity,Label,InvoiceDate
0,85123A,6,2,2010-12-01
1,71053,6,2,2010-12-01
2,84406B,8,2,2010-12-01
3,84029G,6,2,2010-12-01
4,84029E,6,2,2010-12-01
...,...,...,...,...
348882,22466,12,1,2011-12-09
348883,22061,8,2,2011-12-09
348884,23275,24,2,2011-12-09
348885,21217,24,2,2011-12-09


# Splitting dataset per cluster

In [8]:
df.Label.unique()

array([2, 1, 0, 3])

In [9]:
df.Label.info()

<class 'pandas.core.series.Series'>
RangeIndex: 348887 entries, 0 to 348886
Series name: Label
Non-Null Count   Dtype
--------------   -----
348887 non-null  int64
dtypes: int64(1)
memory usage: 2.7 MB


In [10]:
df_cl0 = df[df.Label == 0].reset_index(drop=True)
df_cl1 = df[df.Label == 1].reset_index(drop=True)
df_cl2 = df[df.Label == 2].reset_index(drop=True)
df_cl3 = df[df.Label == 3].reset_index(drop=True)

# Top selling products

In [11]:
# the top 5 selling products
products_top = df.groupby('StockCode')['Quantity'].sum().sort_values(ascending=False).head(5).index.tolist()
products_top

['84077', '22197', '85099B', '85123A', '84879']

## Preparing dataframes for model

In [12]:
# Creating dataframe fot the prophet model, with the custom function (at functions.py)

# Creating df for the top product 1, for the dataframe without cluster seperation, and no outlier clipping
df_TopPr0 = df_prophet_prep(df, products_top[0], False)

In [13]:
# Creating df for the top product 1, for the dataframe without cluster seperation, and with outlier clipping
df_TopPr0_clip = df_prophet_prep(df, products_top[0], True)

Instances that needed outlier clipping:  10 , out of total instances:  424


In [14]:
# Creating dataframes for the top product 1, for the dataframes with cluster seperation, and no outlier clipping
df_TopPr0_cl0 = df_prophet_prep(df_cl0.drop(columns='Label'), products_top[0], False)
df_TopPr0_cl1 = df_prophet_prep(df_cl1.drop(columns='Label'), products_top[0], False)
df_TopPr0_cl2 = df_prophet_prep(df_cl2.drop(columns='Label'), products_top[0], False)
df_TopPr0_cl3 = df_prophet_prep(df_cl3.drop(columns='Label'), products_top[0], False)

In [15]:
# Creating dataframes for the top product 1, for the dataframes with cluster seperation, and with outlier clipping
df_TopPr0_cl0_clip = df_prophet_prep(df_cl0.drop(columns='Label'), products_top[0], True)
df_TopPr0_cl1_clip = df_prophet_prep(df_cl1.drop(columns='Label'), products_top[0], True)
df_TopPr0_cl2_clip = df_prophet_prep(df_cl2.drop(columns='Label'), products_top[0], True)
df_TopPr0_cl3_clip = df_prophet_prep(df_cl3.drop(columns='Label'), products_top[0], True)

Instances that needed outlier clipping:  0 , out of total instances:  59
Instances that needed outlier clipping:  7 , out of total instances:  226
Instances that needed outlier clipping:  0 , out of total instances:  102
Instances that needed outlier clipping:  1 , out of total instances:  37


# Prophet model per product

Helpful resource: https://facebook.github.io/prophet/docs/quick_start.html

Regarding the train/test split:
- my dataset has 375 days
- I will use the last part of the dataset as my test model, and I nned to define the number of days that I will consider
- 80% of my dataset is 83 days, and 90% 46.5 days
- since I have a bit more than 12 months of data, I can consider the first 10 months as my train set, and the last 2 months as the test set.

That means that my test set will have the last 60 days, and thus my train set the first 315 days.

### Per product without clusters, without outlier clipping

In [16]:
# Calling custom function for prophet model (at functions.py), with defining the number of days for the test set.
# Model for df for the top product 1, for the dataframe without cluster seperation, and no outlier clipping
forecast_TopPr0,_ = prophet_model(df_TopPr0, 60)

11:45:58 - cmdstanpy - INFO - Chain [1] start processing
11:45:58 - cmdstanpy - INFO - Chain [1] done processing


Root Mean Squared Error:  609.374873418084
Mean Absolute Error:  155.52807608543804


### Per product without clusters, with outlier clipping

In [17]:
# Model for df for the top product 1, for the dataframe without cluster seperation, and with outlier clipping
forecast_TopPr0_clip,_ = prophet_model(df_TopPr0_clip, 60)

11:45:58 - cmdstanpy - INFO - Chain [1] start processing
11:45:58 - cmdstanpy - INFO - Chain [1] done processing


Root Mean Squared Error:  167.21273931378624
Mean Absolute Error:  90.99648381504903


### Per product per cluster, without outlier clipping

In [18]:
# Creating dataframes for the top product 1, for the dataframes with cluster seperation, and no outlier clipping
# Cluster 0
forecast_TopPr0_cl0, rmse_TopPr0_cl0 = prophet_model(df_TopPr0_cl0, 60)

11:45:58 - cmdstanpy - INFO - Chain [1] start processing
11:45:58 - cmdstanpy - INFO - Chain [1] done processing


Root Mean Squared Error:  32.339920448940546
Mean Absolute Error:  22.419755612975404


In [19]:
# Cluster 1
forecast_TopPr0_cl1, rmse_TopPr0_cl1 = prophet_model(df_TopPr0_cl1, 60)

11:45:58 - cmdstanpy - INFO - Chain [1] start processing
11:45:58 - cmdstanpy - INFO - Chain [1] done processing


Root Mean Squared Error:  61.840780154655825
Mean Absolute Error:  45.632093718207095


In [20]:
# Cluster 2
forecast_TopPr0_cl2, rmse_TopPr0_cl2 = prophet_model(df_TopPr0_cl2, 60)

11:45:58 - cmdstanpy - INFO - Chain [1] start processing
11:45:58 - cmdstanpy - INFO - Chain [1] done processing


Root Mean Squared Error:  75.54338256092078
Mean Absolute Error:  45.35105910987186


In [21]:
# Cluster 3
forecast_TopPr0_cl3, rmse_TopPr0_cl3 = prophet_model(df_TopPr0_cl3, 60)

11:45:58 - cmdstanpy - INFO - Chain [1] start processing
11:45:58 - cmdstanpy - INFO - Chain [1] done processing


Root Mean Squared Error:  610.0816727048411
Mean Absolute Error:  117.14219383109076


### Per product per cluster, with outlier clipping

In [22]:
# Creating dataframes for the top product 1, for the dataframes with cluster seperation, and with outlier clipping
# Cluster 0
forecast_TopPr0_cl0_clip, rmse_TopPr0_cl0_clip = prophet_model(df_TopPr0_cl0_clip, 60)

11:45:58 - cmdstanpy - INFO - Chain [1] start processing
11:45:58 - cmdstanpy - INFO - Chain [1] done processing


Root Mean Squared Error:  32.339920448940546
Mean Absolute Error:  22.419755612975404


In [23]:
# Cluster 1
forecast_TopPr0_cl1_clip, rmse_TopPr0_cl1_clip = prophet_model(df_TopPr0_cl1_clip, 60)

11:45:58 - cmdstanpy - INFO - Chain [1] start processing
11:45:58 - cmdstanpy - INFO - Chain [1] done processing


Root Mean Squared Error:  61.923247934127176
Mean Absolute Error:  45.67455194566671


In [24]:
# Cluster 2
forecast_TopPr0_cl2_clip, rmse_TopPr0_cl2_clip = prophet_model(df_TopPr0_cl2_clip, 60)

11:45:58 - cmdstanpy - INFO - Chain [1] start processing
11:45:58 - cmdstanpy - INFO - Chain [1] done processing


Root Mean Squared Error:  75.54338256092078
Mean Absolute Error:  45.35105910987186


In [25]:
# Cluster 3
forecast_TopPr0_cl3_clip, rmse_TopPr0_cl3_clip = prophet_model(df_TopPr0_cl3_clip, 60)

11:45:58 - cmdstanpy - INFO - Chain [1] start processing
11:45:58 - cmdstanpy - INFO - Chain [1] done processing


Root Mean Squared Error:  462.98775115247184
Mean Absolute Error:  98.0755271644241


In [30]:
label_percentages = [round((count / df[df.StockCode == products_top[0]].shape[0]) * 100,2) for count in [df[df[df.StockCode == products_top[0]].Label == i].shape[0] for i in range(4)]]
label_percentages

  label_percentages = [round((count / df[df.StockCode == products_top[0]].shape[0]) * 100,2) for count in [df[df[df.StockCode == products_top[0]].Label == i].shape[0] for i in range(4)]]


IndexingError: Unalignable boolean Series provided as indexer (index of the boolean Series and of the indexed object do not match).

In [None]:
prop0 = rmse_p0_l0*label_percentages[0]*0.01
prop1 = rmse_p0_l1*label_percentages[1]*0.01
prop2 = rmse_p0_l2*label_percentages[2]*0.01
prop3 = rmse_p0_l3*label_percentages[3]*0.01

In [None]:
# Calculating the total prediction, based on the proportion of the cluster
pred_p0_total = prop0 + prop1 + prop2 + prop3
pred_p0_total