# Imports

In [9]:
import os
import pickle

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import __version__ as sklearn_version
from pycaret import *
from pycaret.regression import *
from sklearn.base import clone
from sklearn.dummy import DummyRegressor
from sklearn.preprocessing import OneHotEncoder,MinMaxScaler
from sklearn.model_selection import train_test_split,GridSearchCV,cross_val_score,RandomizedSearchCV,cross_validate
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor,RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error as mse, mean_absolute_error as mae

# Load Model

In [10]:
# Try to restore the pickled model from disk. A missing file is only reported
# on stdout; no exception is raised and `model` is left unassigned in that case.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file --
# only point this at model files from a trusted source.
model_path = '../models/orders_post30_model.pkl'
if not os.path.exists(model_path):
    print("Expected model not found")
else:
    with open(model_path, 'rb') as f:
        model = pickle.load(f)

In [11]:
# Load the saved model by path (no file extension given); this rebinds `model`,
# replacing whatever the pickle cell above assigned.
# NOTE(review): load_model is presumably supplied by the `pycaret.regression` star import -- confirm.
model = load_model('../models/orders_post30_model')

Transformation Pipeline and Model Successfully Loaded


In [12]:
# Display the loaded object's repr to inspect its pipeline steps.
model

Pipeline(steps=[('dtypes',
                 DataTypes_Auto_infer(display_types=False,
                                      features_todrop=['Invoice', 'StockCode'],
                                      ml_usecase='regression',
                                      numerical_features=['counts',
                                                          'TotalPrice',
                                                          'Average_Price',
                                                          'Pre_wk_orders',
                                                          'Pre_30_orders',
                                                          'Month', 'Year',
                                                          'Day', 'Weeknumber'],
                                      target='Post_30_orders')),
                ('imputer',
                 Simple_Imputer(categorical_strategy='not_available',
                                fill_...
                ('binn', 'passthrough'), ('rem_

In [13]:
# Read the feature table (predictors plus the Post_30_orders target) from CSV.
df = pd.read_csv('../models/Final_Features.csv')

# Train Test Split  

In [14]:
# List the columns available in the feature table.
df.columns

Index(['counts', 'TotalPrice', 'Average_Price', 'Pre_wk_orders',
       'Pre_30_orders', 'Month', 'Year', 'Day', 'Weeknumber',
       'Post_30_orders'],
      dtype='object')

In [15]:
# Names of the nine predictor columns. Note this is a plain list of labels,
# not the data itself.
X = [
    'counts',
    'TotalPrice',
    'Average_Price',
    'Pre_wk_orders',
    'Pre_30_orders',
    'Month',
    'Year',
    'Day',
    'Weeknumber',
]
# Target series: the Post_30_orders column.
y = df['Post_30_orders']

In [16]:
# Hold out 30% of the rows for testing. Predictors are every column except the
# target; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns=['Post_30_orders']),
    df['Post_30_orders'],
    test_size=0.3,
    random_state=50,
)


In [17]:
# (rows, columns) of the training and test predictor frames, in that order.
tuple(part.shape for part in (X_train, X_test))

((518878, 9), (222377, 9))

In [18]:
# Lengths of the training and test target series, in that order.
tuple(part.shape for part in (y_train, y_test))

((518878,), (222377,))

In [19]:
# Baseline: a regressor that always predicts the mean of the training target.
# Any real model has to beat this score to be worth keeping.
# (DummyRegressor is imported in the notebook's import cell.)
dummy_mean = DummyRegressor(strategy="mean")
# "Training" only records the mean of y_train; the features are ignored.
dummy_mean.fit(X_train, y_train)
# R2 of the constant prediction on the held-out split.
score_dummy = dummy_mean.score(X_test, y_test)
print("The R2 score of using the mean to predict Post 30 orders is:", score_dummy)

The R2 score of using the mean to predict Post 30 orders is: -9.372737133084286e-07


In [20]:
# Baseline metrics on the held-out split: R2, MSE and RMSE.
dummy_pred = dummy_mean.predict(X_test)
dummy_r2 = r2_score(y_test, dummy_pred)
dummy_mse = mse(y_test, dummy_pred)
# RMSE is the square root of the MSE computed above; reuse it rather than
# scoring the predictions a second time.
dummy_rmse = np.sqrt(dummy_mse)

PyCaret's model comparison showed that decision trees, extra trees and random forests performed best, so we start with a random forest regressor.

In [21]:
# Search grid: 5 forest sizes x 5 depth limits = 25 candidates, each scored
# with 5-fold cross-validation on the training split.
basic_param_grid = dict(
    n_estimators=[100, 300, 500, 900, 1200],
    max_depth=[3, 5, 20, 50, 100],
)
# Base estimator handed to the search; the seed fixes the forest's randomness.
basic_rf = RandomForestRegressor(random_state=50)
cv_rf = GridSearchCV(estimator=basic_rf, param_grid=basic_param_grid, cv=5)
# fit() hands back the fitted search object, so cv_rf_fit refers to the same search as cv_rf.
cv_rf_fit = cv_rf.fit(X_train, y_train)

In [22]:
# Report the winning grid-search settings, one per line.
print(
    'The optimal max_depth for the RandomForestRegressor is: {}'.format(cv_rf_fit.best_params_['max_depth']),
    'The optimal n_estimators for the RandomForestRegressor is: {}'.format(cv_rf_fit.best_params_['n_estimators']),
    sep='\n',
)

The optimal max_depth for the RandomForestRegressor is: 50
The optimal n_estimators for the RandomForestRegressor is: 100


In [23]:
# Fit the forest with the hyperparameters the grid search actually selected.
# Reading them from best_params_ (instead of re-typing numbers) keeps this cell
# in agreement with the search output even if the grid or the data changes.
basic_rf = RandomForestRegressor(**cv_rf_fit.best_params_, random_state=42)
basic_rf.fit(X_train, y_train)
# Predictions on the held-out split, used for the metrics below.
basic_rf_pred = basic_rf.predict(X_test)

In [24]:
# Held-out metrics for basic_rf: R2 and MSE from the predictions,
# RMSE as the square root of that MSE.
basic_rf_r2, basic_rf_mse = (
    r2_score(y_test, basic_rf_pred),
    mse(y_test, basic_rf_pred),
)
basic_rf_rmse = np.sqrt(basic_rf_mse)

In [25]:
# Comparison table: one row per model (baseline first), one column per metric.
rf_results = pd.DataFrame(
    dict(
        Model=['dummy_reg', 'basic_rf'],
        R2=[dummy_r2, basic_rf_r2],
        MSE=[dummy_mse, basic_rf_mse],
        RMSE=[dummy_rmse, basic_rf_rmse],
    )
)
rf_results

Unnamed: 0,Model,R2,MSE,RMSE
0,dummy_reg,-9.372737e-07,300769.322312,548.4244
1,basic_rf,1.0,0.0,0.0


In [26]:
# Repeat the hold-out evaluation over several differently seeded splits.
#
# A fresh, unfitted copy of basic_rf (same hyperparameters) is trained on each
# split's training rows. Scoring the already-fitted basic_rf here would be
# misleading: it was trained on the random_state=50 split, so most rows of any
# re-shuffled test set were part of its training data.
#
# Split-local names are used so the original X_train/X_test/y_train/y_test and
# the basic_rf_* metrics from the cells above are not overwritten.
# Note: this refits the forest once per seed, so the cell is expensive.
lst_random = [42, 47, 487, 90]
for seed in lst_random:
    X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(
        df.drop(columns='Post_30_orders'),
        df.Post_30_orders,
        test_size=0.3,
        random_state=seed,
    )
    split_rf = clone(basic_rf).fit(X_train_s, y_train_s)
    y_pred = split_rf.predict(X_test_s)
    split_r2 = r2_score(y_test_s, y_pred)
    split_mse = mse(y_test_s, y_pred)
    split_rmse = np.sqrt(split_mse)
    print("random_state - {} / r2 - {} / mse - {} / rmse - {}".format(seed, split_r2, split_mse, split_rmse))
                                                    

random_state - 42 / r2 - 1.0 / mse - 0.0 / rmse - 0.0
random_state - 47 / r2 - 1.0 / mse - 0.0 / rmse - 0.0
random_state - 487 / r2 - 1.0 / mse - 0.0 / rmse - 0.0
random_state - 90 / r2 - 1.0 / mse - 0.0 / rmse - 0.0


Drop the highly correlated columns and re-evaluate the model.

In [27]:
# Re-list the columns before choosing which ones to drop.
df.columns

Index(['counts', 'TotalPrice', 'Average_Price', 'Pre_wk_orders',
       'Pre_30_orders', 'Month', 'Year', 'Day', 'Weeknumber',
       'Post_30_orders'],
      dtype='object')

In [31]:
# New 70/30 split with Weeknumber and Month removed from the predictors
# (the target column is excluded from the predictors as well).
X_train_new, X_test_new, y_train_new, y_test_new = train_test_split(
    df.drop(columns=['Post_30_orders', 'Weeknumber', 'Month']),
    df['Post_30_orders'],
    test_size=0.3,
    random_state=75,
)

In [32]:
# (rows, columns) of the reduced training and test predictor frames.
tuple(part.shape for part in (X_train_new, X_test_new))

((518878, 7), (222377, 7))

In [35]:
# Fit a forest on the reduced feature set using the hyperparameters selected by
# the grid search (read from best_params_ rather than re-typed by hand).
# NOTE(review): the search was run on the full feature set, so these settings
# are not re-tuned for the reduced predictors.
new_rf = RandomForestRegressor(**cv_rf_fit.best_params_, random_state=95)
new_rf.fit(X_train_new, y_train_new)
# Predictions on the reduced held-out split.
new_rf_pred = new_rf.predict(X_test_new)

In [36]:
# Held-out metrics for new_rf (the forest fitted on the reduced feature set).
new_rf_mse = mse(y_test_new, new_rf_pred)
new_rf_rmse = np.sqrt(new_rf_mse)
new_rf_r2 = r2_score(y_test_new, new_rf_pred)

print(
    " r2 - {} / mse - {} / rmse - {}".format(
        new_rf_r2,
        new_rf_mse,
        new_rf_rmse,
    )
)

 r2 - 1.0 / mse - 0.0 / rmse - 0.0


In [40]:
# New 70/30 split that additionally removes counts and Pre_30_orders, leaving
# five predictors. This rebinds the *_new variables from the previous split.
X_train_new, X_test_new, y_train_new, y_test_new = train_test_split(
    df.drop(
        columns=[
            'Post_30_orders',
            'counts',
            'Weeknumber',
            'Month',
            'Pre_30_orders',
        ]
    ),
    df['Post_30_orders'],
    test_size=0.3,
    random_state=75,
)

In [43]:
# Confirm which predictor columns remain in the training frame.
X_train_new.columns

Index(['TotalPrice', 'Average_Price', 'Pre_wk_orders', 'Year', 'Day'], dtype='object')

In [44]:
# Confirm the test frame carries the same predictor columns.
X_test_new.columns

Index(['TotalPrice', 'Average_Price', 'Pre_wk_orders', 'Year', 'Day'], dtype='object')

In [41]:
# (rows, columns) of the five-feature training and test predictor frames.
tuple(part.shape for part in (X_train_new, X_test_new))

((518878, 5), (222377, 5))

In [42]:
# Fit a forest on the five remaining predictors using the hyperparameters
# selected by the grid search (read from best_params_ rather than re-typed).
# NOTE(review): the search was run on the full feature set, so these settings
# are not re-tuned for the reduced predictors.
new_rf = RandomForestRegressor(**cv_rf_fit.best_params_, random_state=105)
new_rf.fit(X_train_new, y_train_new)
new_rf_pred = new_rf.predict(X_test_new)

# Held-out metrics for new_rf: R2, MSE, and RMSE as the square root of the MSE.
new_rf_r2 = r2_score(y_test_new, new_rf_pred)
new_rf_mse = mse(y_test_new, new_rf_pred)
new_rf_rmse = np.sqrt(new_rf_mse)

print(" r2 - {} / mse - {} / rmse - {}".format(new_rf_r2,new_rf_mse,new_rf_rmse))

 r2 - 1.0 / mse - 0.0 / rmse - 0.0
