In [2]:
#Install Necessary Packages
!pip install --upgrade plotly
!pip install xgboost

[1m
         .:::.     .::.       
        ....yy:    .yy.       
        :.  .yy.    y.        
             :y:   .:         
             .yy  .:          
              yy..:           
              :y:.            
              .y.             
             .:.              
        ....:.                
        :::.                  
[0;33m
• Project files and data should be stored in /project. This is shared among everyone
  in the project.
• Personal files and configuration should be stored in /home/faculty.
• Files outside /project and /home/faculty will be lost when this server is terminated.
• Create custom environments to setup your servers reproducibly.
[0m
[1m
         .:::.     .::.       
        ....yy:    .yy.       
        :.  .yy.    y.        
             :y:   .:         
             .yy  .:          
              yy..:           
              :y:.            
              .y.             
             .:.              
        ....:.                


# All Packages/Libraries

In [3]:
#Necessary Packages / libraries
import os #provides functions for interacting with the operating system
import numpy as np 
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
from numpy import asarray
from pandas import to_datetime
import plotly.express as px
import plotly.graph_objs as go
import datetime
%matplotlib inline
import plotly.express as px
#importing packaging for OrdinalEncoding to transform categorical variable
from sklearn.preprocessing import OrdinalEncoder
#importing model_selection and timeseriessplit for cross time series forecasting
from sklearn import model_selection
from sklearn.model_selection import TimeSeriesSplit
#error metrics for regression from sklearn package
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_percentage_error
#Import machine learning regressor model
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
import xgboost
from xgboost import XGBRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import RandomizedSearchCV

# To change scientific numbers to float
np.set_printoptions(formatter={'float_kind':'{:f}'.format})
# Increases the size of sns plots
sns.set(rc={'figure.figsize':(8,6)})

# Laoding Main Dataset

In [4]:
#importing main dataset
sales_order=pd.read_excel("/project/MACHINE_LEARNING_DATA_SET_Stock.xlsx")

In [5]:
#mode of warehouse_name variable in the main dataset
sales_order.warehouse_name.mode()

0    Barking
dtype: object

In [6]:
#replacing the missing value of warehouse_name
sales_order['warehouse_name'].replace(np.nan, 'Barking',inplace=True)

In [7]:
#Segmenting Products into two categories based on 75% threshold for ordered quantity and price. 
df_product_segmentation['product_segmentation']=np.where((df_product_segmentation['INVOICED']>5.578835e+03)|(df_product_segmentation['QUANTITY']>113),'high_demand_products','low_demand_products')

NameError: name 'df_product_segmentation' is not defined

In [None]:
#Dropping price and quantity columns before mergring with the main dataset to prevenet create duplicate variables. 
df_product_segmentation=df_product_segmentation.drop(columns=['INVOICED','QUANTITY'])

In [None]:
#Looking how many products each category has.
df_product_segmentation.groupby('product_segmentation').STOCK_CODE.nunique()

In [None]:
#Merging product category column with the main dataset
sales_order= pd.merge(sales_order, 
                      df_product_segmentation, 
                      on ='STOCK_CODE', 
                      how ='inner')

In [None]:
#customer count for each day
customer_count=sales_order.groupby(by=['INVOICE_DATE'], as_index=False)['NAME'].nunique()
customer_count.rename(columns={'NAME':'Customer_count'}, inplace=True)
sales_order=pd.merge(sales_order, customer_count, how='left',on='INVOICE_DATE')

# Feature engineering day,month, year from date 
sales_order['week_number_of_year'] = sales_order['INVOICE_DATE'].dt.week
sales_order['day_number_of_week'] = sales_order['INVOICE_DATE'].dt.day
sales_order['month_number_of_year'] = sales_order['INVOICE_DATE'].dt.month

In [None]:
#Selecting variables for Machine Learning Dataset by storing variables in df_model
df_model=sales_order[['INVOICE_DATE','warehouse_name','STOCK_CODE','week_number_of_year',
                      'day_number_of_week','month_number_of_year','STOCK','QUANTITY','INVOICED','Customer_count',
                      'product_segmentation','AVERAGE_COST_PRICE', 'LAST_PURCHASE_PRICE']]

In [None]:
df_model=df_model.groupby(by=['INVOICE_DATE','warehouse_name','STOCK_CODE','week_number_of_year','day_number_of_week','month_number_of_year','Customer_count','product_segmentation'], as_index=False)[['AVERAGE_COST_PRICE', 'LAST_PURCHASE_PRICE','QUANTITY','INVOICED','STOCK']].sum()

In [None]:
#Rouding the figures in these variables 
df_model.INVOICED=df_model.INVOICED.round()
df_model.QUANTITY=df_model.QUANTITY.round()
df_model.STOCK=df_model.STOCK.round()

In [None]:
df_model

## Categorical Data Transformation

In [None]:
#transforming all observation to string as the product column contains mix of text and integers
df_model['STOCK_CODE'] = df_model['STOCK_CODE'].astype(str)
ohe=OrdinalEncoder()
products=ohe.fit_transform(asarray(df_model['STOCK_CODE']).reshape(-1,1))

In [None]:
product_name=pd.DataFrame(products)

In [None]:
df_model.index=product_name.index
df_model=pd.concat([df_model,product_name], axis=1)
df_model.rename(columns={0:'product'}, inplace=True)

In [None]:
df_model.loc[(df_model.product_segmentation=='high_demand_products'),'product_segment']=1 
df_model.loc[(df_model.product_segmentation=='low_demand_products'),'product_segment']=0

In [None]:
df_warehouse=pd.get_dummies(df_model['warehouse_name'], prefix='warehouse_', drop_first=False)
df_model.index = df_warehouse.index
df_model = pd.concat([df_model, df_warehouse], axis=1)
df_model

## Generating Correlation Matrix 

In [None]:
df_corr=df_model.drop(columns=['warehouse_name','STOCK_CODE','product_segmentation', 'INVOICE_DATE'])

corr_matrix= df_corr.corr()
corr_matrix['QUANTITY'].sort_values(ascending=False)

In [None]:
plt.figure(figsize=(16,9))
plt.yticks(fontsize=10)
plt.xticks(fontsize=10)
plt.title('Correlation of Different Variables with Demand Outcome - QUANTITY', fontsize=15)
sns.heatmap(df_corr.corr(method='pearson'),annot=True,cmap='seismic');

## Data Spliting with Time Series Nested Cross - Validation 

In [None]:
df_model_cv=df_model.set_index('INVOICE_DATE')

In [None]:
df_model_cv.columns

In [None]:
df_model_cv=df_model_cv.drop(columns=['warehouse_name','STOCK_CODE','product_segmentation'])

In [None]:
df_model_cv = df_model_cv[['week_number_of_year', 'day_number_of_week', 'month_number_of_year',
        'INVOICED', 'STOCK', 'product','product_segment','Customer_count','AVERAGE_COST_PRICE', 'LAST_PURCHASE_PRICE','warehouse__Barking',
       'warehouse__Great Yarmouth', 'warehouse__Thetford',
       'warehouse__Waltham Abbey','QUANTITY']]

In [None]:
# Reference https://www.angioi.com/time-nested-cv-with-sklearn/?fbclid=IwAR0bnOugMIqE-Hhhb8cgy9Lk81d9Xp8jwYJZf7IEO3Pq3iJ5LKUe7k-J8D4
n_splits = 3 #Number of train/cv/test folds

trainTestSplit = TimeSeriesSplit(n_splits+1).split(df_model_cv)
next(trainTestSplit)


for trainCvIndices, testIndices in trainTestSplit:
    # First, we split Train + CV and Test
    X_traincv, y_traincv = df_model_cv.iloc[trainCvIndices,0:14], df_model_cv.iloc[trainCvIndices,-1]
    X_test, y_test     = df_model_cv.iloc[testIndices,0:14]   , df_model_cv.iloc[testIndices,-1]
    
    # Then, we build a list of the form [ ( [...Train Indices...], [...CV Indices...]  )]
    testLength = len(X_test)
    trainCvSplit = [(list(range(trainCvIndices[0],trainCvIndices[-testLength])),
                     list(range(trainCvIndices[-testLength],trainCvIndices[-1]+1)))]
    
    # Printing date ranges
    print("Training:"           , X_traincv.index[0].date(), "--", X_traincv.index[-testLength-1].date(),
          ", Cv:"     , X_traincv.index[-testLength].date(), "--", X_traincv.index[-1].date(),
          ", Test:"                , X_test.index[0].date(), "--", X_test.index[-1].date())

# Model Result on Default Parameter

# Linear Model Regression (Baseline Model)

In [None]:
from sklearn.linear_model import LinearRegression

model = LinearRegression()
#Fit the model
model.fit(X_traincv, y_traincv)
#Get predictions for the test data
y_pred = model.predict(X_test)

rmse = mean_squared_error(y_test, y_pred,squared=False)
mae = mean_absolute_error(y_test, y_pred)
mape=mean_absolute_percentage_error(y_test, y_pred)

# Check the score on train and test
print('RMSE: {}'.format(rmse))
print('MAE: {}'.format(mae))
print('r^2 score on train set: {}'.format(model.score(X_traincv, y_traincv)))
print('r^2 score on test set: {}'.format(model.score(X_test,y_test)))

# KNN

In [None]:
# Create a KNN regression model with default arguments
model = KNeighborsRegressor()
#Fit the model
model.fit(X_traincv, y_traincv)
#Get predictions for the test data
y_pred = model.predict(X_test)

rmse = mean_squared_error(y_test, y_pred,squared=False)
mae = mean_absolute_error(y_test, y_pred)
mape=mean_absolute_percentage_error(y_test, y_pred)

# Check the score on train and test
print('RMSE: {}'.format(rmse))
print('MAE: {}'.format(mae))
print('r^2 score on train set: {}'.format(model.score(X_traincv, y_traincv)))
print('r^2 score on test set: {}'.format(model.score(X_test,y_test)))

# Random Forest

In [None]:
model = RandomForestRegressor(random_state=42)
#Fit the model
model.fit(X_traincv, y_traincv)
#Get predictions for the test data
y_pred = model.predict(X_test)

rmse = mean_squared_error(y_test, y_pred,squared=False)
mae = mean_absolute_error(y_test, y_pred)
mape=mean_absolute_percentage_error(y_test, y_pred)

# Check the score on train and test
print('RMSE: {}'.format(rmse))
print('MAE: {}'.format(mae))
print('r^2 score on train set: {}'.format(model.score(X_traincv, y_traincv)))
print('r^2 score on test set: {}'.format(model.score(X_test,y_test)))

# Gradient Boosting Regressor

In [None]:
model = GradientBoostingRegressor(random_state=42)

#Fit the model
model.fit(X_traincv, y_traincv)
#Get predictions for the test data
y_pred = model.predict(X_test)

rmse = mean_squared_error(y_test, y_pred,squared=False)
mae = mean_absolute_error(y_test, y_pred)
mape=mean_absolute_percentage_error(y_test, y_pred)

# Check the score on train and test
print('RMSE: {}'.format(rmse))
print('MAE: {}'.format(mae))
print('r^2 score on train set: {}'.format(model.score(X_traincv, y_traincv)))
print('r^2 score on test set: {}'.format(model.score(X_test,y_test)))

# XGBOOST

In [None]:
#XGBOOST

# Create a XGB regression model with default arguments
model = XGBRegressor(random_state=42)
#Fit the model
model.fit(X_traincv, y_traincv)
#Get predictions for the test data
y_pred = model.predict(X_test)

rmse = mean_squared_error(y_test, y_pred,squared=False)
mae = mean_absolute_error(y_test, y_pred)
mape=mean_absolute_percentage_error(y_test, y_pred)

# Check the score on train and test
print('RMSE: {}'.format(rmse))
print('MAE: {}'.format(mae))
print('r^2 score on train set: {}'.format(model.score(X_traincv, y_traincv)))
print('r^2 score on test set: {}'.format(model.score(X_test,y_test)))

# Model Results with Tuned Parameter

# Linear Regression

In [None]:
model = LinearRegression(positive=False, normalize=False, n_jobs=-1,fit_intercept=True)

#Fit the model
model.fit(X_traincv, y_traincv)
#Get predictions for the test data
y_pred = model.predict(X_test)

rmse = mean_squared_error(y_test, y_pred,squared=False)
mae = mean_absolute_error(y_test, y_pred)
mape=mean_absolute_percentage_error(y_test, y_pred)

# Check the score on train and test
print('RMSE: {}'.format(rmse))
print('MAE: {}'.format(mae))
print('r^2 score on train set: {}'.format(model.score(X_traincv, y_traincv)))
print('r^2 score on test set: {}'.format(model.score(X_test,y_test)))

## KNN

In [None]:
model = KNeighborsRegressor(n_neighbors= 11, p=2, weights='distance',metric='euclidean',leaf_size=30,algorithm='brute')

#Fit the model
model.fit(X_traincv, y_traincv)
#Get predictions for the test data
y_pred = model.predict(X_test)

rmse = mean_squared_error(y_test, y_pred,squared=False)
mae = mean_absolute_error(y_test, y_pred)
mape=mean_absolute_percentage_error(y_test, y_pred)

# Check the score on train and test
print('RMSE: {}'.format(rmse))
print('MAE: {}'.format(mae))
print('r^2 score on train set: {}'.format(model.score(X_traincv, y_traincv)))
print('r^2 score on test set: {}'.format(model.score(X_test,y_test)))

# Random Forest

In [None]:
model = RandomForestRegressor(n_estimators=950,
 min_samples_split= 10,
 min_samples_leaf= 2,
 max_features='log2',
 max_depth=16,
 bootstrap=False, random_state=42)
#Fit the model
model.fit(X_traincv, y_traincv)
#Get predictions for the test data
y_pred = model.predict(X_test)

rmse = mean_squared_error(y_test, y_pred,squared=False)
mae = mean_absolute_error(y_test, y_pred)
mape=mean_absolute_percentage_error(y_test, y_pred)

# Check the score on train and test
print('RMSE: {}'.format(rmse))
print('MAE: {}'.format(mae))
print('r^2 score on train set: {}'.format(model.score(X_traincv, y_traincv)))
print('r^2 score on test set: {}'.format(model.score(X_test,y_test)))

# Gradient Boosting 

In [None]:
model = GradientBoostingRegressor(n_estimators=300,
          max_depth=6,
          min_samples_split=2,
          min_samples_leaf= 1,
          max_features='auto',
        subsample=1,
random_state=42)

#Fit the model
model.fit(X_traincv, y_traincv)
#Get predictions for the test data
y_pred = model.predict(X_test)

rmse = mean_squared_error(y_test, y_pred,squared=False)
mae = mean_absolute_error(y_test, y_pred)
mape=mean_absolute_percentage_error(y_test, y_pred)

# Check the score on train and test
print('RMSE: {}'.format(rmse))
print('MAE: {}'.format(mae))
print('r^2 score on train set: {}'.format(model.score(X_traincv, y_traincv)))
print('r^2 score on test set: {}'.format(model.score(X_test,y_test)))

# XGBOOST

In [None]:
model = XGBRegressor(n_estimators=100,
                     subsample=1,
objective= 'reg:squarederror',
reg_alpha=9,
reg_lambda=10,
min_child_weight= 35,
max_depth= 20,
gamma= 20,
eta=0.2,
colsample_bytree=0.5,
random_state=42)

#Fit the model
model.fit(X_traincv, y_traincv)
#Get predictions for the test data
y_pred = model.predict(X_test)

rmse = mean_squared_error(y_test, y_pred,squared=False)
mae = mean_absolute_error(y_test, y_pred)

# Check the score on train and test
print('RMSE: {}'.format(rmse))
print('MAE: {}'.format(mae))
print('r^2 score on train set: {}'.format(model.score(X_traincv, y_traincv)))
print('r^2 score on test set: {}'.format(model.score(X_test,y_test)))