In [1]:
# notes:

# inventory, item stock, prediction for inventory
# e.g. how many units of each item we expect to sell next month

# top most selling products
# top most high selling products

# Importing libraries and cleaned dataset

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from prophet import Prophet

from statsmodels.tools.eval_measures import rmse
from sklearn.metrics import mean_absolute_error

from functions import prophet_model

#import holidays 

Importing plotly failed. Interactive plots will not work.


In [3]:
# Turn off pandas' chained-assignment (SettingWithCopy) warning, which I know
# to be a false positive in this notebook.
pd.set_option('mode.chained_assignment', None)

**Original Dataset**:

Chen,Daqing. (2015). Online Retail. UCI Machine Learning Repository. https://archive.ics.uci.edu/dataset/352/online+retail

In [4]:
# Load the two input files with one shared schema:
#   * file_path1 - the cleaned dataset
#   * file_path2 - the cleaned dataset, filtered and labelled with clusters in
#     the notebook 03_ML_clustering (filters: only actual products, only UK,
#     rows with NaN CustomerID dropped)
file_path1 = '../data/data_cleaned.csv'
file_path2 = '../data/data_filtered_with_clusters.csv'

column_dtypes = {
    'InvoiceNo': 'O',
    'StockCode': 'O',
    'Description': 'O',
    'Quantity': 'int64',
    'UnitPrice': 'float64',
    'CustomerID': 'Int64',
    'Country': 'O',
}

df, df_fltrd = (
    pd.read_csv(path, dtype=column_dtypes, parse_dates=['InvoiceDate'])
    for path in (file_path1, file_path2)
)

In [5]:
# Test: restrict df to real products, UK orders and rows with a known customer.
# NOTE(review): these look like the same filters the loading cell lists for the
# clustered file, so df and df_fltrd may end up with the same rows - confirm.
is_product = df['StockCode'] != 'OTHER'
is_uk = df['Country'] == 'United Kingdom'
df = df[is_product & is_uk].dropna(subset=['CustomerID'])

In [6]:
# Quick look at the first two rows of the cleaned dataset
df.head(n=2)

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.95,17850,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.75,17850,United Kingdom


In [7]:
# Quick look at the first two rows of the clustered dataset
df_fltrd.head(n=2)

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,TotalPrice,Label
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.95,17850,United Kingdom,17.7,2
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.75,17850,United Kingdom,22.5,2


In [8]:
# Adding the total price column (quantity x unit price) to df
df['TotalPrice'] = df['Quantity'] * df['UnitPrice']

# Feature Engineering

### Clipping outliers

From the EDA shown in the previous notebooks, I know there are **two outliers** whose quantities are on a completely different scale from the rest. I am going to clip them to a range, i.e. replace them with the next-highest value (the largest quantity after these two).

In [9]:
# Scaling within a range: cap the two known outlier invoices at the largest
# quantity found among all *other* rows, then refresh their TotalPrice.
# The cap is taken from the non-outlier rows directly (instead of "third value
# of the sorted column") so it stays correct even when a frame contains only
# one - or none - of the outlier invoices.
outlier_invoices = ['581483', '541431']

for frame in (df, df_fltrd):
    is_outlier = frame['InvoiceNo'].isin(outlier_invoices)
    # Nothing to cap, or nothing to derive a cap from: leave the frame alone.
    if not is_outlier.any() or is_outlier.all():
        continue

    quantity_cap = frame.loc[~is_outlier, 'Quantity'].max()
    frame.loc[is_outlier, 'Quantity'] = quantity_cap
    frame.loc[is_outlier, 'TotalPrice'] = (
        frame.loc[is_outlier, 'Quantity'] * frame.loc[is_outlier, 'UnitPrice']
    )

### Date column

Adding new column for the date without time.

In [10]:
# Add a date-only column (time set to midnight) to both datasets
for frame in (df, df_fltrd):
    frame['InvoiceDate_noTime'] = frame['InvoiceDate'].dt.normalize()

### Removing features that will not be used

I will be using StockCode, Quantity and date for my model. 

In [11]:
# Keep only the model columns; the date-only column takes over the InvoiceDate name
stock_columns = ['StockCode', 'Quantity', 'InvoiceDate_noTime']
df_stock = df[stock_columns].rename(columns={'InvoiceDate_noTime': 'InvoiceDate'})
df_stock.head(2)

Unnamed: 0,StockCode,Quantity,InvoiceDate
0,85123A,6,2010-12-01
1,71053,6,2010-12-01


In [12]:
# Same selection for the clustered dataset, keeping the cluster Label as well
fltrd_stock_columns = ['StockCode', 'Quantity', 'InvoiceDate_noTime', 'Label']
df_fltrd_stock = df_fltrd[fltrd_stock_columns].rename(
    columns={'InvoiceDate_noTime': 'InvoiceDate'}
)
df_fltrd_stock.head(2)

Unnamed: 0,StockCode,Quantity,InvoiceDate,Label
0,85123A,6,2010-12-01,2
1,71053,6,2010-12-01,2


# Splitting dataset

In [13]:
# Which cluster labels are present?
df_fltrd_stock['Label'].unique()

array([2, 1, 0, 3])

In [14]:
# Dtype and non-null count of the cluster label column
df_fltrd_stock['Label'].info()

<class 'pandas.core.series.Series'>
RangeIndex: 348887 entries, 0 to 348886
Series name: Label
Non-Null Count   Dtype
--------------   -----
348887 non-null  int64
dtypes: int64(1)
memory usage: 2.7 MB


In [15]:
# One dataframe per cluster label (0-3), each with a fresh 0..n-1 index
df_fltrd_stock_0, df_fltrd_stock_1, df_fltrd_stock_2, df_fltrd_stock_3 = (
    df_fltrd_stock[df_fltrd_stock['Label'] == label].reset_index(drop=True)
    for label in range(4)
)

# Top selling products

In [16]:
# The five stock codes with the highest total quantity sold
quantity_per_product = df.groupby('StockCode')['Quantity'].sum()
products_top = (
    quantity_per_product
    .sort_values(ascending=False)
    .head(5)
    .index
    .tolist()
)
products_top

['84077', '22197', '85099B', '85123A', '84879']

## Preparing dataframes for model

In [17]:
def df_prep(df_2, products_top, product_i, filtering=False, clipping=False):
    """Reshape a sales dataframe into the two-column ``ds`` / ``y`` layout.

    Parameters
    ----------
    df_2 : pandas.DataFrame
        Must contain a ``StockCode`` column plus exactly one datetime column
        and exactly one value column. The input is never modified.
    products_top : list of str or None
        Stock codes to choose from; only read when ``filtering`` is true.
    product_i : int or None
        Position in ``products_top`` of the product to keep; only read when
        ``filtering`` is true.
    filtering : bool, default False
        When true, keep only the rows of ``products_top[product_i]`` and reset
        the index. When false, keep every row (and the original index).
    clipping : bool, default False
        When true, cap ``y`` at ``round(mean(y) + 3 * std(y))``.

    Returns
    -------
    pandas.DataFrame
        New dataframe with columns ``['ds', 'y']``.

    Raises
    ------
    ValueError
        If, after removing ``StockCode``, the frame does not consist of exactly
        one datetime column and one value column.
    """
    data = df_2.copy()
    if filtering:
        data = data[data.StockCode == products_top[product_i]].reset_index(drop=True)
    data = data.drop(columns='StockCode')

    # Detect the date column by kind rather than by the exact '<M8[ns]' string,
    # so tz-aware or non-nanosecond datetimes are recognised as well.
    date_cols = [name for name, dtype in data.dtypes.items()
                 if pd.api.types.is_datetime64_any_dtype(dtype)]
    value_cols = [name for name in data.columns if name not in date_cols]
    if len(date_cols) != 1 or len(value_cols) != 1:
        # Renaming several columns to the same name would silently create
        # duplicate 'ds' / 'y' columns, so fail loudly instead.
        raise ValueError(
            "df_prep expects exactly one datetime column and one value column "
            f"besides 'StockCode'; got datetime={date_cols}, other={value_cols}"
        )

    data = data.rename(columns={date_cols[0]: 'ds', value_cols[0]: 'y'})[['ds', 'y']]

    if not clipping:
        return data

    upper = data['y'].mean() + data['y'].std() * 3
    if pd.isna(upper):
        # Fewer than two rows: the std is undefined, so there is no meaningful
        # limit (casting NaN to int would yield a garbage threshold).
        return data

    upper_limit = int(round(upper))
    clipped = data.copy()
    clipped['y'] = clipped['y'].clip(upper=upper_limit)
    return clipped


In [18]:
# Best-selling product only (products_top[0]), no clipping
df_stock_top_pr = df_prep(df_stock, products_top, 0, filtering=True, clipping=False)
df_stock_top_pr.head()

Unnamed: 0,ds,y
0,2010-12-02,48
1,2010-12-02,48
2,2010-12-02,2880
3,2010-12-02,288
4,2010-12-03,48


In [19]:
# Best-selling product only (products_top[0]), with outliers clipped
df_stock_top_pr_clip = df_prep(df_stock, products_top, 0, filtering=True, clipping=True)

In [20]:
# Best-selling product per cluster (labels 0-3), with outliers clipped
(
    df_fltrd_stock_0_top_pr_clip,
    df_fltrd_stock_1_top_pr_clip,
    df_fltrd_stock_2_top_pr_clip,
    df_fltrd_stock_3_top_pr_clip,
) = (
    df_prep(cluster_df.drop(columns='Label'), products_top, 0, True, True)
    for cluster_df in (df_fltrd_stock_0, df_fltrd_stock_1, df_fltrd_stock_2, df_fltrd_stock_3)
)

In [21]:
# All products of the clustered dataset (Label left out): plain and clipped variants
fltrd_model_input = df_fltrd_stock[['StockCode', 'Quantity', 'InvoiceDate']]
df_fltrd_stock_prep = df_prep(fltrd_model_input, None, None, filtering=False, clipping=False)
df_fltrd_stock_clip = df_prep(fltrd_model_input, None, None, filtering=False, clipping=True)

In [22]:
# All products of df_stock: plain and clipped variants
df_stock_prep = df_prep(df_stock, None, None, filtering=False, clipping=False)
df_stock_clip = df_prep(df_stock, None, None, filtering=False, clipping=True)

In [23]:
# First rows of the prepared data for cluster 2
df_fltrd_stock_2_top_pr_clip.head(5)

Unnamed: 0,ds,y
0,2010-12-02,48
1,2010-12-02,288
2,2010-12-05,48
3,2010-12-07,48
4,2010-12-08,48


In [24]:
# First rows of the prepared data for cluster 0
df_fltrd_stock_0_top_pr_clip.head(5)

Unnamed: 0,ds,y
0,2010-12-03,48
1,2010-12-05,48
2,2010-12-19,3
3,2011-02-03,48
4,2011-03-08,48


# Prophet model

https://facebook.github.io/prophet/docs/quick_start.html

### general sales

In [25]:
# Model on all products of df_stock, without outlier clipping.
# prophet_model comes from the local `functions` module; the second argument
# (73) is the number of days for the test set.
forecast = prophet_model(df_stock_prep, 73)

18:09:00 - cmdstanpy - INFO - Chain [1] start processing
18:09:04 - cmdstanpy - INFO - Chain [1] done processing


Root Mean Squared Error:  13.73245010945489
Mean Absolute Error:  9.640611117829632


In [26]:
# Model on all products of df_stock, with outliers clipped (73 test days).
forecast  = prophet_model(df_stock_clip, 73)

18:09:09 - cmdstanpy - INFO - Chain [1] start processing
18:09:23 - cmdstanpy - INFO - Chain [1] done processing


Root Mean Squared Error:  13.769411439354112
Mean Absolute Error:  9.582852447202438


In [27]:
# Model on all products of the clustered dataset, without clipping (73 test days).
# NOTE(review): the recorded RMSE/MAE are identical to the df_stock_prep run,
# presumably because In [5] applies the same filters to df - confirm.
forecast = prophet_model(df_fltrd_stock_prep, 73)

18:09:29 - cmdstanpy - INFO - Chain [1] start processing
18:09:33 - cmdstanpy - INFO - Chain [1] done processing


Root Mean Squared Error:  13.73245010945489
Mean Absolute Error:  9.640611117829632


In [28]:
# Model on all products of the clustered dataset, with outliers clipped (73 test days).
# NOTE(review): the recorded RMSE/MAE are identical to the df_stock_clip run,
# presumably because In [5] applies the same filters to df - confirm.
forecast = prophet_model(df_fltrd_stock_clip, 73)

18:09:38 - cmdstanpy - INFO - Chain [1] start processing
18:09:51 - cmdstanpy - INFO - Chain [1] done processing


Root Mean Squared Error:  13.769411439354112
Mean Absolute Error:  9.582852447202438


### per product

In [29]:
# Test-set length: 365 - (365 * 0.8) = 73 days
# Call the Prophet model function for the top-selling product, passing the
# number of days to use for the test set.
forecast = prophet_model(df_stock_top_pr, 73)
# Result: not good

18:09:52 - cmdstanpy - INFO - Chain [1] start processing
18:09:52 - cmdstanpy - INFO - Chain [1] done processing


Root Mean Squared Error:  69.6008962763101
Mean Absolute Error:  47.32452217133055


In [30]:
# Same top-selling product, but on the data where outliers were clipped
forecast = prophet_model(df_stock_top_pr_clip, 73)

# Result: better, but not good yet

18:09:52 - cmdstanpy - INFO - Chain [1] start processing
18:09:52 - cmdstanpy - INFO - Chain [1] done processing


Root Mean Squared Error:  58.85478457865827
Mean Absolute Error:  37.20549873697381


In [31]:
# Testing with fewer days (40) in the test set
forecast = prophet_model(df_stock_top_pr_clip, 40)
# Result: not better, and the test sample is also very small

18:09:52 - cmdstanpy - INFO - Chain [1] start processing
18:09:52 - cmdstanpy - INFO - Chain [1] done processing


Root Mean Squared Error:  69.50450525690874
Mean Absolute Error:  42.21188221394395


### per product per cluster

In [32]:
# Top product, cluster 0, 73 test days.
# NOTE(review): in the recorded run this call raised
# "ValueError: operands could not be broadcast together with shapes (73,) (14,)",
# so forecast_p0_l0 / rmse_p0_l0 were never assigned. Presumably this cluster
# has too little data for a 73-day test set - confirm inside prophet_model.
# NOTE(review): two values are unpacked here, whereas the earlier cells bind
# the whole return value to `forecast` - verify what prophet_model returns.
forecast_p0_l0,rmse_p0_l0 = prophet_model(df_fltrd_stock_0_top_pr_clip, 73)

18:09:52 - cmdstanpy - INFO - Chain [1] start processing
18:09:52 - cmdstanpy - INFO - Chain [1] done processing


ValueError: operands could not be broadcast together with shapes (73,) (14,) 

In [33]:
# Top product, cluster 1, 50 test days (the cluster-0 cell uses 73).
forecast_p0_l1,rmse_p0_l1 = prophet_model(df_fltrd_stock_1_top_pr_clip, 50)

18:11:09 - cmdstanpy - INFO - Chain [1] start processing
18:11:09 - cmdstanpy - INFO - Chain [1] done processing


Root Mean Squared Error:  17.910549710775967
Mean Absolute Error:  13.377454782989405


In [34]:
# Top product, cluster 2, 50 test days.
forecast_p0_l2,rmse_p0_l2 = prophet_model(df_fltrd_stock_2_top_pr_clip, 50)

18:11:13 - cmdstanpy - INFO - Chain [1] start processing
18:11:13 - cmdstanpy - INFO - Chain [1] done processing


Root Mean Squared Error:  135.10979866185008
Mean Absolute Error:  88.93078818647126


In [35]:
# Top product, cluster 3, 50 test days.
# NOTE(review): in the recorded run this call raised
# "ValueError: operands could not be broadcast together with shapes (50,) (13,)",
# so forecast_p0_l3 / rmse_p0_l3 were never assigned. Presumably this cluster
# has too little data for a 50-day test set - confirm inside prophet_model.
forecast_p0_l3,rmse_p0_l3 = prophet_model(df_fltrd_stock_3_top_pr_clip, 50)

18:11:15 - cmdstanpy - INFO - Chain [1] start processing
18:11:15 - cmdstanpy - INFO - Chain [1] done processing


ValueError: operands could not be broadcast together with shapes (50,) (13,) 

In [None]:
# Share of rows (in percent, two decimals) that falls into each cluster 0-3
n_rows_total = df_fltrd_stock.shape[0]
rows_per_label = [
    df_fltrd_stock[df_fltrd_stock.Label == label].shape[0] for label in range(4)
]
label_percentages = [round((n_rows / n_rows_total) * 100, 2) for n_rows in rows_per_label]
label_percentages

In [None]:
# Weight each cluster's RMSE by that cluster's share of the rows (percent -> fraction).
# NOTE(review): in the recorded run the cluster-0 and cluster-3 forecasts raised
# ValueError, so rmse_p0_l0 and rmse_p0_l3 may be undefined when this cell runs.
prop0, prop1, prop2, prop3 = (
    cluster_rmse * label_percentages[label] * 0.01
    for label, cluster_rmse in enumerate((rmse_p0_l0, rmse_p0_l1, rmse_p0_l2, rmse_p0_l3))
)

In [None]:
# Total for the top product: sum of the per-cluster values, each already
# weighted by the proportion of its cluster. Despite the `pred_` name, the
# inputs are the per-cluster RMSE values, so this is a weighted RMSE.
pred_p0_total = sum((prop0, prop1, prop2, prop3))
pred_p0_total