# Imports

In [2]:
import pandas as pd
import numpy as np
import os
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import __version__ as sklearn_version
from pycaret import *
from pycaret.regression import *
from sklearn.preprocessing import OneHotEncoder,MinMaxScaler
from sklearn.model_selection import train_test_split,GridSearchCV,cross_val_score,RandomizedSearchCV,cross_validate
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor,RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error as mse, mean_absolute_error as mae

# Load Model

In [10]:
model_path = '../models/orders_post30_model.pkl'
if os.path.exists(model_path):
    with open(model_path, 'rb') as f:
        model = pickle.load(f)
else:
    print("Expected model not found")

In [11]:
model = load_model('../models/orders_post30_model')

Transformation Pipeline and Model Successfully Loaded


In [12]:
model

Pipeline(steps=[('dtypes',
                 DataTypes_Auto_infer(display_types=False,
                                      features_todrop=['Invoice', 'StockCode'],
                                      ml_usecase='regression',
                                      numerical_features=['counts',
                                                          'TotalPrice',
                                                          'Average_Price',
                                                          'Pre_wk_orders',
                                                          'Pre_30_orders',
                                                          'Month', 'Year',
                                                          'Day', 'Weeknumber'],
                                      target='Post_30_orders')),
                ('imputer',
                 Simple_Imputer(categorical_strategy='not_available',
                                fill_...
                ('binn', 'passthrough'), ('rem_

In [4]:
df = pd.read_csv('../models/Final_Features.csv')

In [3]:
df_nonuk = pd.read_csv('../models/Final_Features_NonUK.csv')

# Train Test Split  

In [5]:
df.columns

Index(['counts', 'TotalPrice', 'Average_Price', 'Pre_wk_orders',
       'Pre_30_orders', 'Month', 'Year', 'Day', 'Weeknumber',
       'Post_30_orders'],
      dtype='object')

In [6]:
df.head()

Unnamed: 0,counts,TotalPrice,Average_Price,Pre_wk_orders,Pre_30_orders,Month,Year,Day,Weeknumber,Post_30_orders
0,125.0,36227.9,289.8232,0.0,0.0,12.0,2009.0,1.0,49.0,2403.0
1,125.0,36227.9,289.8232,0.0,0.0,12.0,2009.0,1.0,49.0,2403.0
2,125.0,36227.9,289.8232,0.0,0.0,12.0,2009.0,1.0,49.0,2403.0
3,125.0,36227.9,289.8232,0.0,0.0,12.0,2009.0,1.0,49.0,2403.0
4,125.0,36227.9,289.8232,0.0,0.0,12.0,2009.0,1.0,49.0,2403.0


In [7]:
X = ['counts', 'TotalPrice', 'Average_Price', 'Pre_wk_orders',
       'Pre_30_orders', 'Month', 'Year', 'Day', 'Weeknumber']
y = df.Post_30_orders       

In [8]:
X_train, X_test, y_train, y_test = train_test_split(df.drop(columns='Post_30_orders'), 
                                                    df.Post_30_orders, test_size=0.3, 
                                                    random_state=50)


In [9]:
X_train.shape, X_test.shape

((517872, 9), (221946, 9))

In [10]:
y_train.shape, y_test.shape

((517872,), (221946,))

In [29]:
from sklearn.dummy import DummyRegressor
dummy_mean = DummyRegressor()
# "Train" dummy regressor
dummy_mean.fit(X_train, y_train)
# Get R2 score
score_dummy = dummy_mean.score(X_test, y_test)
print("The R2 score of using the mean to predict Post 30 orders is:", score_dummy)

The R2 score of using the mean to predict Post 30 orders is: -3.258531529715114e-06


In [30]:
dummy_pred = dummy_mean.predict(X_test)
dummy_r2 = r2_score(y_test, dummy_pred)
dummy_mse = mse(y_test, dummy_pred)
dummy_rmse = np.sqrt(mse(y_test, dummy_pred))

From pycaret we show that the DT's, Extra Tree's and RF's performed the best. Our initial start will be to use RF regressor

In [21]:
# Define parameters to search for GridSearchCV
basic_param_grid = {'n_estimators': [100, 300, 500, 900, 1200],
              'max_depth': [3, 5, 20, 50, 100],
              }
# Instantiate RandomForestRegressor
basic_rf = RandomForestRegressor(random_state=50)
cv_rf = GridSearchCV(basic_rf, basic_param_grid, cv = 5)
cv_rf_fit = cv_rf.fit(X_train, y_train)

In [22]:
print('The optimal max_depth for the RandomForestRegressor is: {}'.format(cv_rf_fit.best_params_['max_depth']))
print('The optimal n_estimators for the RandomForestRegressor is: {}'.format(cv_rf_fit.best_params_['n_estimators']))

The optimal max_depth for the RandomForestRegressor is: 50
The optimal n_estimators for the RandomForestRegressor is: 100


In [11]:
# Instantiate RFR with optimal hyperparameters
basic_rf = RandomForestRegressor(max_depth = 50, n_estimators = 300, random_state=42)
basic_rf.fit(X_train, y_train)
basic_rf_pred = basic_rf.predict(X_test)

In [12]:
# Calculate evaluation metrics on basic_rf
basic_rf_r2 = r2_score(y_test, basic_rf_pred)
basic_rf_mse = mse(y_test, basic_rf_pred)
basic_rf_rmse = np.sqrt(basic_rf_mse)

In [33]:
# Create RF results dataframe
rf_results = pd.DataFrame({'Model':['dummy_reg', 'basic_rf'], 'R2': [dummy_r2, basic_rf_r2], 'MSE':[dummy_mse, basic_rf_mse], 'RMSE':[dummy_rmse, basic_rf_rmse]})
rf_results

Unnamed: 0,Model,R2,MSE,RMSE
0,dummy_reg,-3e-06,298991.851826,546.801474
1,basic_rf,1.0,0.0,0.0


### Non UK Countries

In [13]:
X_train_nonuk, X_test_nonuk, y_train_nonuk, y_test_nonuk = train_test_split(df_nonuk.drop(columns='Post_30_orders'), 
                                                    df_nonuk.Post_30_orders, test_size=0.7, 
                                                    random_state=50)


In [14]:
X_train.columns

Index(['counts', 'TotalPrice', 'Average_Price', 'Pre_wk_orders',
       'Pre_30_orders', 'Month', 'Year', 'Day', 'Weeknumber'],
      dtype='object')

In [15]:
X_train_nonuk.columns

Index(['counts', 'TotalPrice', 'Average_Price', 'Pre_wk_orders',
       'Pre_30_orders', 'Month', 'Year', 'Day', 'Weeknumber'],
      dtype='object')

Predicting the non uk countries post 30 days orders using UK model.

In [16]:
basic_rf_pred_nonuk = basic_rf.predict(X_test_nonuk)
# Calculate evaluation metrics on basic_rf
basic_rf_r2_nonuk = r2_score(y_test_nonuk, basic_rf_pred_nonuk)
basic_rf_mse_nonuk = mse(y_test_nonuk, basic_rf_pred_nonuk)
basic_rf_rmse_nonuk = np.sqrt(basic_rf_mse_nonuk)

print(" r2 - {} / mse - {} / rmse - {}".format(basic_rf_r2_nonuk,basic_rf_mse_nonuk,basic_rf_rmse_nonuk))

 r2 - -876.5735637467105 / mse - 3518495.057064919 / rmse - 1875.765192412131


In [17]:
y_test_nonuk

70257    322.0
68966    330.0
70743    336.0
47406    203.0
76543    377.0
         ...  
42903    203.0
38620    210.0
68150    326.0
78729    254.0
68698    328.0
Name: Post_30_orders, Length: 57718, dtype: float64

In [18]:
basic_rf_pred_nonuk

array([2370.64      , 2375.31333333, 2626.19333333, ..., 2268.26      ,
       2347.05666667, 2268.26      ])

#### Refitting the non uk model on the same RF estimators

In [19]:
non_uk_rf = RandomForestRegressor(max_depth = 50, n_estimators = 300, random_state=57)
non_uk_rf.fit(X_train_nonuk,y_train_nonuk)
y_nonuk_pred = non_uk_rf.predict(X_test_nonuk)
# Calculate evaluation metrics on basic_rf
rf_r2_nonuk = r2_score(y_test_nonuk, y_nonuk_pred)
rf_mse_nonuk = mse(y_test_nonuk, y_nonuk_pred)
rf_rmse_nonuk = np.sqrt(rf_mse_nonuk)

print(" r2 - {} / mse - {} / rmse - {}".format(rf_r2_nonuk,rf_mse_nonuk,rf_rmse_nonuk))

 r2 - 0.9999993206631594 / mse - 0.0027236956697506204 / rmse - 0.052189037831240195


### Updating the random state

In [26]:
lst_random = [42,47,487,90]     
X_train, X_test, y_train, y_test = [],[],[],[]
for i in lst_random:
    X_train, X_test, y_train, y_test = train_test_split(df.drop(columns='Post_30_orders'), 
                                                    df.Post_30_orders, test_size=0.3, 
                                                    random_state=i)
    y_pred = basic_rf.predict(X_test)
    basic_rf_r2 = r2_score(y_test, y_pred)
    basic_rf_mse = mse(y_test, y_pred)
    basic_rf_rmse = np.sqrt(basic_rf_mse)
    print("random_state - {} / r2 - {} / mse - {} / rmse - {}".format(i,basic_rf_r2,basic_rf_mse,basic_rf_rmse))
                                                    

random_state - 42 / r2 - 1.0 / mse - 0.0 / rmse - 0.0
random_state - 47 / r2 - 1.0 / mse - 0.0 / rmse - 0.0
random_state - 487 / r2 - 1.0 / mse - 0.0 / rmse - 0.0
random_state - 90 / r2 - 1.0 / mse - 0.0 / rmse - 0.0


### Further evaluation

Dropping the highly correlated columns and re-evaluate the model

In [27]:
df.columns

Index(['counts', 'TotalPrice', 'Average_Price', 'Pre_wk_orders',
       'Pre_30_orders', 'Month', 'Year', 'Day', 'Weeknumber',
       'Post_30_orders'],
      dtype='object')

In [31]:
X_train_new, X_test_new, y_train_new, y_test_new = train_test_split(df.drop(columns=['Post_30_orders','Weeknumber','Month']), 
                                                    df.Post_30_orders, test_size=0.3, 
                                                    random_state=75)

In [32]:
X_train_new.shape, X_test_new.shape

((518878, 7), (222377, 7))

In [35]:
# Instantiate RFR with optimal hyperparameters
new_rf = RandomForestRegressor(max_depth = 50, n_estimators = 300, random_state=95)
new_rf.fit(X_train_new, y_train_new)
new_rf_pred = new_rf.predict(X_test_new)

In [36]:
# Calculate evaluation metrics on basic_rf
new_rf_r2 = r2_score(y_test_new, new_rf_pred)
new_rf_mse = mse(y_test_new, new_rf_pred)
new_rf_rmse = np.sqrt(new_rf_mse)

print(" r2 - {} / mse - {} / rmse - {}".format(new_rf_r2,new_rf_mse,new_rf_rmse))

 r2 - 1.0 / mse - 0.0 / rmse - 0.0


In [56]:
X_train_new_1, X_test_new_1, y_train_new_1, y_test_new_1 = train_test_split(df.drop(columns=['Post_30_orders','Pre_wk_orders','counts','Weeknumber','Month','Pre_30_orders']), 
                                                    df.Post_30_orders, test_size=0.4, 
                                                    random_state=150)

In [57]:
X_train_new_1.columns

Index(['TotalPrice', 'Average_Price', 'Year', 'Day'], dtype='object')

In [59]:
X_test_new_1.columns

Index(['TotalPrice', 'Average_Price', 'Year', 'Day'], dtype='object')

In [58]:
X_train_new_1.shape, X_test_new_1.shape

((444753, 4), (296502, 4))

In [62]:
# Instantiate RFR with optimal hyperparameters
new_rf1 = RandomForestRegressor(max_depth = 50, n_estimators = 300, random_state=289)
new_rf1.fit(X_train_new_1, y_train_new_1)
new_rf1_pred = new_rf1.predict(X_test_new_1)

# Calculate evaluation metrics on new_rf
new_rf1_r2 = r2_score(y_test_new_1, new_rf1_pred)
new_rf1_mse = mse(y_test_new_1, new_rf1_pred)
new_rf1_rmse = np.sqrt(new_rf1_mse)

print(" r2 - {} / mse - {} / rmse - {}".format(new_rf1_r2,new_rf1_mse,new_rf1_rmse))

 r2 - 1.0 / mse - 0.0 / rmse - 0.0


In [66]:
# Instantiate GB with optimal hyperparameters
new_gb1 = GradientBoostingRegressor(n_estimators=20, learning_rate = 0.5, max_features=2, max_depth = 2, random_state = 0)
new_gb1.fit(X_train_new_1, y_train_new_1)
new_gb1_pred = new_gb1.predict(X_test_new_1)

# Calculate evaluation metrics on new_rf
new_gb1_r2 = r2_score(y_test_new_1, new_gb1_pred)
new_gb1_mse = mse(y_test_new_1, new_gb1_pred)
new_gb1_rmse = np.sqrt(new_gb1_mse)

print(" r2 - {} / mse - {} / rmse - {}".format(new_gb1_r2,new_gb1_mse,new_gb1_rmse))

 r2 - 0.4363820972761778 / mse - 168609.3213591989 / rmse - 410.6206538390378


In [50]:
y_test_new.head()

644772    2775.0
613790    2338.0
536364    1783.0
427440    1556.0
586022    2125.0
Name: Post_30_orders, dtype: float64

In [51]:
X_test_new.head()

Unnamed: 0,TotalPrice,Average_Price,Year,Day
644772,30077.27,385.606026,2011.0,18.0
613790,26194.171,390.957776,2011.0,26.0
536364,36929.94,559.544545,2011.0,30.0
427440,12367.99,334.27,2011.0,8.0
586022,26160.52,396.371515,2011.0,2.0


In [53]:
new_rf_pred

array([2775., 2338., 1783., ...,  390., 2839., 1778.])

In [54]:
y_test_new

644772    2775.0
613790    2338.0
536364    1783.0
427440    1556.0
586022    2125.0
           ...  
146825    1995.0
345131    2839.0
736025     390.0
344908    2839.0
538192    1778.0
Name: Post_30_orders, Length: 222377, dtype: float64

In [69]:
X_train_new_1, X_test_new_1, y_train_new_1, y_test_new_1 = train_test_split(df.drop(columns=['Post_30_orders','Pre_wk_orders','counts','Pre_30_orders']), 
                                                    df.counts, test_size=0.4, 
                                                    random_state=150)

In [70]:
X_test_new_1

Unnamed: 0,TotalPrice,Average_Price,Month,Year,Day,Weeknumber
102370,21891.07,304.042639,4.0,2010.0,1.0,13.0
696545,36746.61,303.690992,11.0,2011.0,16.0,46.0
385183,100736.84,1343.157867,12.0,2010.0,7.0,49.0
703147,26558.81,282.540532,11.0,2011.0,20.0,46.0
729914,17335.01,293.813729,12.0,2011.0,4.0,48.0
...,...,...,...,...,...,...
482940,23915.89,341.655571,4.0,2011.0,21.0,16.0
97174,22548.32,375.805333,3.0,2010.0,28.0,12.0
262231,33662.06,340.020808,9.0,2010.0,29.0,39.0
459670,20700.71,414.014200,3.0,2011.0,25.0,12.0


In [71]:
# Instantiate RFR with optimal hyperparameters
new_rf1 = RandomForestRegressor(max_depth = 50, n_estimators = 300, random_state=289)
new_rf1.fit(X_train_new_1, y_train_new_1)
new_rf1_pred = new_rf1.predict(X_test_new_1)

# Calculate evaluation metrics on new_rf
new_rf1_r2 = r2_score(y_test_new_1, new_rf1_pred)
new_rf1_mse = mse(y_test_new_1, new_rf1_pred)
new_rf1_rmse = np.sqrt(new_rf1_mse)

print(" r2 - {} / mse - {} / rmse - {}".format(new_rf1_r2,new_rf1_mse,new_rf1_rmse))

 r2 - 1.0 / mse - 0.0 / rmse - 0.0


In [74]:
y_test_new_1

102370     72.0
696545    121.0
385183     75.0
703147     94.0
729914     59.0
Name: counts, dtype: float64

In [75]:
new_rf1_pred

array([ 72., 121.,  75., ...,  99.,  50., 155.])

### Blending by combining the UK and Non UK model together

In [21]:
new_df = df_nonuk.append(df)
new_df.shape

(822272, 10)

In [22]:
X_train, X_test, y_train, y_test = train_test_split(new_df.drop(columns='Post_30_orders'), 
                                                    new_df.Post_30_orders, test_size=0.3, 
                                                    random_state=50)


In [23]:
from sklearn.ensemble import VotingRegressor

# Create empty lists that will storage the different weights
weights1 = []
weights2 = []
scores = []

# Create a loop to evaluate different combinations of weights
for i in np.arange(0.1,1, 0.1):
    for j in np.arange(0.1,1, 0.1):
        clf_voting = VotingRegressor(estimators = [('ukest1',basic_rf),('nonukest2',non_uk_rf)],weights=[i,j],verbose = True,n_jobs=-1)
        clf_voting.fit(X_train, y_train)
        pred = clf_voting.predict(X_test)
        score = r2_score(y_test,pred)
        scores.append(score)
        weights1.append(i)
        weights2.append(j)



In [24]:
# Save the results in a data frame

test_scores = pd.DataFrame()
test_scores['Weight1'] = weights1
test_scores['Weight2'] = weights2
test_scores['Test Score'] = scores

# Create an additional column to save the sum of all the weights

test_scores['sum_weights'] = test_scores['Weight1'].add(test_scores['Weight2'])

#We are only getting the rows that the sum of all weights were equal to one

condition = test_scores['sum_weights'] == 1

test_scores = test_scores.loc[condition]


In [25]:
test_scores

Unnamed: 0,Weight1,Weight2,Test Score,sum_weights
8,0.1,0.9,1.0,1.0
16,0.2,0.8,1.0,1.0
24,0.3,0.7,1.0,1.0
32,0.4,0.6,1.0,1.0
40,0.5,0.5,1.0,1.0
48,0.6,0.4,1.0,1.0
56,0.7,0.3,1.0,1.0
64,0.8,0.2,1.0,1.0
72,0.9,0.1,1.0,1.0


In [26]:
# Create the Voting Classifier with the most equally weighted because all models performed similarly

clf_voting = VotingRegressor(estimators = [('ukest1',basic_rf),('nonukest2',non_uk_rf)],verbose = True,n_jobs=-1, weights = [0.1, 0.9])

#Fit and predict

clf_voting.fit(X_train, y_train)
pred_voting = clf_voting.predict(X_test)

In [27]:
score = r2_score(y_test,pred_voting)
print("Score - {}".format(score))

Score - 0.9999999999995045


In [28]:
non_uk_pred_clf =  clf_voting.predict(X_test_nonuk)
nonuk_score = r2_score(non_uk_pred_clf,y_test_nonuk)
print(nonuk_score)

0.9999999993761246
