In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import ipywidgets as widgets
from ipywidgets import interact
import pycaret
from pycaret.regression import *
from statsmodels.tsa.stattools import adfuller
from sklearn.dummy import DummyRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from IPython import display

In [4]:
# Load the processed EDA output; the first modelling experiment below is fitted on this frame.
df = pd.read_csv(filepath_or_buffer='../data/processed/Final_EDA.csv')

In [5]:
# Preview the first five rows to sanity-check the load.
df.head(n=5)

Unnamed: 0,counts,TotalPrice,Average_Price,is_holiday,Pre_wk_orders,Pre_30_orders,Post_30_orders,Sales_boost,Invoice,StockCode,Quantity,Price,Customer_Id,Hour,Month,Year,Day,Weeknumber
0,125.0,36227.9,289.8232,0.0,0.0,0.0,2403.0,0.0,489434,85048,12.0,6.95,13085.0,7.0,12.0,2009.0,1.0,49.0
1,125.0,36227.9,289.8232,0.0,0.0,0.0,2403.0,0.0,489434,79323P,12.0,6.75,13085.0,7.0,12.0,2009.0,1.0,49.0
2,125.0,36227.9,289.8232,0.0,0.0,0.0,2403.0,0.0,489434,79323W,12.0,6.75,13085.0,7.0,12.0,2009.0,1.0,49.0
3,125.0,36227.9,289.8232,0.0,0.0,0.0,2403.0,0.0,489434,22041,48.0,2.1,13085.0,7.0,12.0,2009.0,1.0,49.0
4,125.0,36227.9,289.8232,0.0,0.0,0.0,2403.0,0.0,489434,21232,24.0,1.25,13085.0,7.0,12.0,2009.0,1.0,49.0


In [6]:
# List the column labels of the loaded frame (used below to choose model features).
df.columns

Index(['counts', 'TotalPrice', 'Average_Price', 'is_holiday', 'Pre_wk_orders',
       'Pre_30_orders', 'Post_30_orders', 'Sales_boost', 'Invoice',
       'StockCode', 'Quantity', 'Price', 'Customer_Id', 'Hour', 'Month',
       'Year', 'Day', 'Weeknumber'],
      dtype='object')

In [7]:
# Inspect per-column dtypes to see which columns are numeric and which are object-typed.
df.dtypes

counts            float64
TotalPrice        float64
Average_Price     float64
is_holiday        float64
Pre_wk_orders     float64
Pre_30_orders     float64
Post_30_orders    float64
Sales_boost       float64
Invoice            object
StockCode          object
Quantity          float64
Price             float64
Customer_Id       float64
Hour              float64
Month             float64
Year              float64
Day               float64
Weeknumber        float64
dtype: object

In [9]:
# Count missing values per column before modelling.
df.isnull().sum()

counts            0
TotalPrice        0
Average_Price     0
is_holiday        0
Pre_wk_orders     0
Pre_30_orders     0
Post_30_orders    0
Sales_boost       0
Invoice           0
StockCode         0
Quantity          0
Price             0
Customer_Id       0
Hour              0
Month             0
Year              0
Day               0
Weeknumber        0
dtype: int64

In [10]:
# Result accumulator; not used by any cell visible in this part of the notebook.
all_results = []

# Configure the PyCaret regression experiment on `df`:
#   - target is 'Post_30_orders', 80/20 shuffled train/test split, 5-fold CV
#   - 'Invoice' and 'StockCode' are dropped; the listed columns are forced numeric
#   - session_id pins the random seed so the run is repeatable
# NOTE(review): the df.head() output shows several rows with identical feature
# values and the same target. With a shuffled split such rows can fall into both
# the training and validation folds, which would explain the R2 = 1.0 scores of
# the tree models below — confirm, and consider a grouped or time-based split.
s = setup(
    data=df,
    target='Post_30_orders',
    train_size=0.8,
    data_split_shuffle=True,
    fold=5,
    ignore_features=['Invoice', 'StockCode'],
    numeric_features=[
        'counts', 'TotalPrice', 'Average_Price',
        'Pre_wk_orders', 'Pre_30_orders',
        'Month', 'Year', 'Day', 'Weeknumber',
    ],
    silent=True,
    verbose=False,
    session_id=123,
    normalize=False,
)

# Cross-validate the candidate regressors and keep the top-ranked estimator.
best = compare_models()
best

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
dt,Decision Tree Regressor,0.0,0.0,0.0,1.0,0.0,0.0,0.706
et,Extra Trees Regressor,0.0,0.0,0.0007,1.0,0.0,0.0,25.402
rf,Random Forest Regressor,0.0,0.0,0.0,1.0,0.0,0.0,34.146
lightgbm,Light Gradient Boosting Machine,5.2186,46.0786,6.7861,0.9998,0.0076,0.0029,1.834
knn,K Neighbors Regressor,0.7839,232.3094,15.2028,0.9992,0.0104,0.0006,8.85
gbr,Gradient Boosting Regressor,33.6618,1898.5529,43.5494,0.9936,0.0384,0.0187,24.94
ada,AdaBoost Regressor,130.0043,23833.7808,154.37,0.92,0.1071,0.0729,15.054
lar,Least Angle Regression,343.436,237307.3465,487.1383,0.2037,0.3291,0.2889,0.1
br,Bayesian Ridge,343.4283,237307.349,487.1383,0.2037,0.3291,0.2889,0.288
lasso,Lasso Regression,343.2668,237310.7125,487.1417,0.2037,0.3291,0.2889,3.158


DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse', max_depth=None,
                      max_features=None, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, presort='deprecated',
                      random_state=123, splitter='best')

In [11]:
# Fit a random forest regressor inside the current experiment and show its per-fold scores.
# This is the estimator persisted by the save_model cell that follows.
rf = create_model(estimator='rf')

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,0.0,0.0,0.0,1.0,0.0,0.0
1,0.0,0.0,0.0,1.0,0.0,0.0
2,0.0,0.0,0.0,1.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0,0.0
4,0.0,0.0,0.0,1.0,0.0,0.0
Mean,0.0,0.0,0.0,1.0,0.0,0.0
SD,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
# Persist the preprocessing pipeline together with the fitted random forest.
save_model(model=rf, model_name='../models/orders_post30_model')

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=None,
          steps=[('dtypes',
                  DataTypes_Auto_infer(categorical_features=[],
                                       display_types=False,
                                       features_todrop=['Invoice', 'StockCode'],
                                       id_columns=[], ml_usecase='regression',
                                       numerical_features=['counts',
                                                           'TotalPrice',
                                                           'Average_Price',
                                                           'Pre_wk_orders',
                                                           'Pre_30_orders',
                                                           'Month', 'Year',
                                                           'Day', 'Weeknumber'],
                                       target='Post_30_orders',
                                       time_features=[])),
               

In [13]:
# Load the Non-UK counterpart of the processed EDA output.
df_nonuk = pd.read_csv(filepath_or_buffer='../data/processed/Final_EDA_NonUK.csv')

In [14]:
# Columns kept for the exported feature tables: the nine numeric model inputs
# followed by the regression target.
model_columns = [
    'counts', 'TotalPrice', 'Average_Price',
    'Pre_wk_orders', 'Pre_30_orders',
    'Month', 'Year', 'Day', 'Weeknumber',
    'Post_30_orders',
]

# Apply the same projection to both frames so the two exports share one schema.
final_df = df[model_columns]
final_df_nonuk = df_nonuk[model_columns]

In [16]:
# Check the (rows, columns) size of the projected feature table.
final_df.shape

(741255, 10)

In [18]:
# Keep only rows whose average price is non-negative, in both feature tables.
final_df = final_df.loc[final_df['Average_Price'] >= 0.0]
final_df_nonuk = final_df_nonuk.loc[final_df_nonuk['Average_Price'] >= 0.0]

In [19]:
# Write the filtered feature tables into the models directory, without the index column.
# Forward slashes are used on purpose: the previous '..\models\...' literals contained
# invalid escape sequences ('\m', '\F') and, outside Windows, produced a file literally
# named '..\models\Final_Features.csv' in the working directory instead of writing
# into ../models/. Forward slashes work on every platform, Windows included.
final_df.to_csv('../models/Final_Features.csv', index=False)
final_df_nonuk.to_csv('../models/Final_Features_NonUK.csv', index=False)

In [20]:
all_results = []
    
# initialize setup from pycaret.regression
# Re-run the same experiment configuration on the Non-UK data:
# target 'Post_30_orders', 80/20 shuffled split, 5-fold CV, fixed seed.
# NOTE(review): the tree models again score R2 = 1.0 here. If df_nonuk also
# contains many rows with identical features and target (not shown in this
# notebook — confirm), the shuffled split leaks them across train/validation.
s = setup(
    data=df_nonuk,
    target='Post_30_orders',
    train_size=0.8,
    data_split_shuffle=True,
    fold=5,
    ignore_features=['Invoice', 'StockCode'],
    numeric_features=[
        'counts', 'TotalPrice', 'Average_Price',
        'Pre_wk_orders', 'Pre_30_orders',
        'Month', 'Year', 'Day', 'Weeknumber',
    ],
    silent=True,
    verbose=False,
    session_id=123,
    normalize=False,
)

# Cross-validate the candidate regressors and keep the top-ranked estimator.
best = compare_models()
best

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
dt,Decision Tree Regressor,0.0,0.0,0.0025,1.0,0.0,0.0,0.076
et,Extra Trees Regressor,0.0003,0.0002,0.0139,1.0,0.0001,0.0,0.944
rf,Random Forest Regressor,0.001,0.0035,0.0553,1.0,0.0002,0.0,1.706
lightgbm,Light Gradient Boosting Machine,0.7239,0.9399,0.9694,0.9998,0.0063,0.0035,0.374
knn,K Neighbors Regressor,0.4089,21.2284,4.5988,0.9947,0.0226,0.002,0.134
gbr,Gradient Boosting Regressor,4.4531,37.5099,6.1231,0.9906,0.0395,0.0219,1.608
ada,AdaBoost Regressor,11.0162,198.6108,14.0897,0.9504,0.0912,0.0565,1.12
lar,Least Angle Regression,38.8554,3146.0977,56.0877,0.2143,0.3466,0.3256,0.036
br,Bayesian Ridge,38.8476,3146.1063,56.0878,0.2143,0.3466,0.3255,0.048
ridge,Ridge Regression,38.8554,3146.096,56.0877,0.2143,0.3466,0.3256,0.53


DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse', max_depth=None,
                      max_features=None, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, presort='deprecated',
                      random_state=123, splitter='best')

In [21]:
# Stack the Non-UK rows on top of the main frame to build one combined dataset.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and
# removed in pandas 2.0; row order and the original index labels are unchanged.
new_df = pd.concat([df_nonuk, df])
new_df.shape

(824293, 18)

In [22]:
# initialize setup from pycaret.regression
s = setup(new_df, target ='Post_30_orders' , train_size = 0.8,
              data_split_shuffle = True, fold = 5,
              ignore_features = ['Invoice','StockCode'],
              numeric_features = ['counts', 'TotalPrice', 'Average_Price', 'Pre_wk_orders','Pre_30_orders', 
               'Month','Year', 'Day', 'Weeknumber'],
              silent = True, verbose = False, session_id = 123, normalize=False)

best = compare_models()
best

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
dt,Decision Tree Regressor,0.0001,0.0019,0.0193,1.0,0.0001,0.0,0.954
et,Extra Trees Regressor,0.0002,0.0018,0.0249,1.0,0.0001,0.0,33.806
rf,Random Forest Regressor,0.0001,0.0011,0.0203,1.0,0.0001,0.0,46.024
lightgbm,Light Gradient Boosting Machine,8.2987,118.4782,10.8812,0.9998,0.0391,0.0106,2.52
knn,K Neighbors Regressor,0.8124,244.7475,15.6245,0.9996,0.0144,0.0008,4.874
gbr,Gradient Boosting Regressor,58.5551,5985.0958,77.3598,0.9898,0.1135,0.056,32.698
ada,AdaBoost Regressor,160.2473,41095.3522,202.6111,0.9303,0.1681,0.1206,18.324
lar,Least Angle Regression,388.763,292673.1869,540.9901,0.5034,0.51,0.5374,0.122
br,Bayesian Ridge,388.7539,292673.1371,540.9901,0.5034,0.51,0.5374,0.356
lasso,Lasso Regression,388.5118,292676.9,540.9935,0.5034,0.5103,0.5378,2.976


DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse', max_depth=None,
                      max_features=None, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, presort='deprecated',
                      random_state=123, splitter='best')