In [579]:
# Import Python packages
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import json
import sys
import cachetools
from datetime import datetime
from datetime import date
import matplotlib.pyplot as plt 
import seaborn as sns
import math

# Import Snowflake modules
from snowflake.snowpark import Session
import snowflake.snowpark.functions as F
import snowflake.snowpark.types as T
from snowflake.snowpark import Window
from snowflake.snowpark.functions import col

# Clustering & Model
from sklearn.cluster import KMeans
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold, cross_val_score, train_test_split

# Scalers
from sklearn.preprocessing import StandardScaler

# Evaluation
from sklearn.metrics import silhouette_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

# find the best combination of model hyperparameters
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import cross_val_score, cross_validate

# Getting Password,Username, Account
import getpass

In [580]:
# Get account credentials from a json file
with open("account.json") as f:
    data = json.load(f)
    username = data["username"]
    password = data["password"]
    account = data["account"]

# Specify connection parameters
connection_parameters = {
    "account": account,
    "user": username,
    "password": password,
    #"role": "ACCOUNTADMIN",
    #"warehouse": "tasty_ds_wh",
    #"database": "NGEE_ANN_POLYTECHNIC_FROSTBYTE_DATA_SHARE",
    #"schema": "analytics",
}

# Create Snowpark session
session = Session.builder.configs(connection_parameters).create()

In [581]:
order_header_df = session.table("NGEE_ANN_POLYTECHNIC_FROSTBYTE_DATA_SHARE.raw_pos.order_header")
order_detail_df = session.table("NGEE_ANN_POLYTECHNIC_FROSTBYTE_DATA_SHARE.raw_pos.order_detail")
menu_df = session.table("NGEE_ANN_POLYTECHNIC_FROSTBYTE_DATA_SHARE.raw_pos.menu")
truck_df = session.table("NGEE_ANN_POLYTECHNIC_FROSTBYTE_DATA_SHARE.raw_pos.truck")

In [582]:
order_header_df.show()

-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|"ORDER_ID"  |"TRUCK_ID"  |"LOCATION_ID"  |"CUSTOMER_ID"  |"DISCOUNT_ID"  |"SHIFT_ID"  |"SHIFT_START_TIME"  |"SHIFT_END_TIME"  |"ORDER_CHANNEL"  |"ORDER_TS"           |"SERVED_TS"  |"ORDER_CURRENCY"  |"ORDER_AMOUNT"  |"ORDER_TAX_AMOUNT"  |"ORDER_DISCOUNT_AMOUNT"  |"ORDER_TOTAL"  |
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|22477378    |97          |3713.0         |NULL           |NULL           |78881       |16:00:00            |23:00:00          |NULL             |2020-04-

In [583]:
order_detail_df.show()

--------------------------------------------------------------------------------------------------------------------------------------------------------
|"ORDER_DETAIL_ID"  |"ORDER_ID"  |"MENU_ITEM_ID"  |"DISCOUNT_ID"  |"LINE_NUMBER"  |"QUANTITY"  |"UNIT_PRICE"  |"PRICE"  |"ORDER_ITEM_DISCOUNT_AMOUNT"  |
--------------------------------------------------------------------------------------------------------------------------------------------------------
|798202495          |420427395   |131             |NULL           |0              |1           |13.0000       |13.0000  |NULL                          |
|798202496          |420427396   |133             |NULL           |0              |1           |6.0000        |6.0000   |NULL                          |
|798202497          |420427397   |133             |NULL           |0              |1           |6.0000        |6.0000   |NULL                          |
|798202498          |420427398   |133             |NULL           |0              

In [584]:
menu_df.show()

-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|"MENU_ID"  |"MENU_TYPE_ID"  |"MENU_TYPE"  |"TRUCK_BRAND_NAME"  |"MENU_ITEM_ID"  |"MENU_ITEM_NAME"        |"ITEM_CATEGORY"  |"ITEM_SUBCATEGORY"  |"COST_OF_GOODS_USD"  |"SALE_PRICE_USD"  |"MENU_ITEM_HEALTH_METRICS_OBJ"     |
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|10013      |2               |BBQ          |Smoky BBQ           |23              |Pulled Pork Sandwich    |Main             |Warm Option         |7.0000               |12.0000           |{                                  |
|           |                |             |                    |                |                      

In [585]:
#Minimum order transaction by a truck
order_header_df.group_by("TRUCK_ID").agg(F.count("ORDER_ID").alias('count')).agg(F.min('count')).show()

----------------
|"MIN(COUNT)"  |
----------------
|8850          |
----------------



In [586]:
#order_header_df.drop_duplicates('ORDER_ID').group_by('ORDER_ID').agg(F.count('ORDER_ID').alias('nnn')).agg(F.sum('NNN')).show()

In [587]:
#order_detail_df.drop_duplicates('ORDER_ID').group_by('ORDER_ID').agg(F.count('ORDER_ID').alias('nnn')).agg(F.sum('NNN')).show()

In [588]:
order_truck = order_header_df.select('ORDER_ID', 'TRUCK_ID', 'ORDER_TS')
df = order_detail_df.join(order_truck, order_detail_df.ORDER_ID == order_truck.ORDER_ID, how = "left", lsuffix = "", rsuffix = "_01")

In [589]:
df.show()

-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|"ORDER_DETAIL_ID"  |"ORDER_ID"  |"MENU_ITEM_ID"  |"DISCOUNT_ID"  |"LINE_NUMBER"  |"QUANTITY"  |"UNIT_PRICE"  |"PRICE"  |"ORDER_ITEM_DISCOUNT_AMOUNT"  |"ORDER_ID_01"  |"TRUCK_ID"  |"ORDER_TS"           |
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|798202497          |420427397   |133             |NULL           |0              |1           |6.0000        |6.0000   |NULL                          |420427397      |433         |2022-04-10 19:42:18  |
|798202503          |420427400   |131             |NULL           |0              |4           |13.0000       |52.0000  |NULL                          |420427400      |433         |202

In [590]:
df = df.with_column('ORDER_YEAR', F.year(F.col('ORDER_TS')))
df = df.with_column('ORDER_MONTH', F.month(F.col('ORDER_TS')))

In [591]:
#df.show()

In [592]:
#df.describe().show()

In [593]:
demand_df = df.group_by("TRUCK_ID", 'MENU_ITEM_ID', 'ORDER_YEAR', 'ORDER_MONTH').agg(F.sum("QUANTITY").alias('DEMAND'))
#demand_df.sort('TRUCK_ID').show()

In [594]:
unit_price = df.group_by("TRUCK_ID", 'MENU_ITEM_ID', 'ORDER_YEAR', 'ORDER_MONTH').agg(F.mean("unit_price").alias('unit_price'))
#unit_price.sort('TRUCK_ID').show()

In [595]:
final_df = demand_df.join(unit_price, on= ['TRUCK_ID', 'MENU_ITEM_ID', 'ORDER_YEAR', 'ORDER_MONTH'], lsuffix = "", rsuffix = "_01")

In [596]:
truck_df.show()

-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|"TRUCK_ID"  |"MENU_TYPE_ID"  |"PRIMARY_CITY"  |"REGION"    |"ISO_REGION"  |"COUNTRY"      |"ISO_COUNTRY_CODE"  |"FRANCHISE_FLAG"  |"YEAR"  |"MAKE"        |"MODEL"           |"EV_FLAG"  |"FRANCHISE_ID"  |"TRUCK_OPENING_DATE"  |
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|3           |3               |San Mateo       |California  |CA            |United States  |US                  |1                 |2004    |Freightliner  |MT45 Utilimaster  |0          |2               |2021-10-01            |
|4           |4               |San Mateo       |California  |CA            |United State

In [597]:
#truck_df = truck_df.filter(F.col('COUNTRY') == 'United States')

In [598]:
truck = truck_df.with_column('LAST_DATE', F.iff(F.col("TRUCK_ID") == F.col('TRUCK_ID'), "2022-10-18", '0'))
truck = truck.withColumn("DAYS_OPENED", F.datediff("day", F.col("TRUCK_OPENING_DATE"), F.col('LAST_DATE')))
#truck = truck.select('TRUCK_ID', 'PRIMARY_CITY', 'REGION', 'COUNTRY', 'DAYS_OPENED')
truck = truck.select('TRUCK_ID','EV_FLAG','DAYS_OPENED')

truck.show()

------------------------------------------
|"TRUCK_ID"  |"EV_FLAG"  |"DAYS_OPENED"  |
------------------------------------------
|3           |0          |382            |
|4           |1          |655            |
|5           |1          |200            |
|6           |0          |1205           |
|7           |0          |655            |
|8           |1          |200            |
|9           |0          |839            |
|12          |0          |565            |
|13          |0          |474            |
|14          |0          |747            |
------------------------------------------



In [599]:
final_df = final_df.join(truck, on= ['TRUCK_ID'], lsuffix = "", rsuffix = "_01")

In [600]:
menu = menu_df.drop('MENU_ITEM_HEALTH_METRICS_OBJ')
menu.show()

------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|"MENU_ID"  |"MENU_TYPE_ID"  |"MENU_TYPE"  |"TRUCK_BRAND_NAME"  |"MENU_ITEM_ID"  |"MENU_ITEM_NAME"       |"ITEM_CATEGORY"  |"ITEM_SUBCATEGORY"  |"COST_OF_GOODS_USD"  |"SALE_PRICE_USD"  |
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|10051      |7               |Vegetarian   |Plant Palace        |75              |Bottled Soda           |Beverage         |Cold Option         |0.5000               |3.0000            |
|10052      |7               |Vegetarian   |Plant Palace        |76              |Ice Tea                |Beverage         |Cold Option         |0.7500               |3.0000            |
|10053      |8               |Crepes       |Le Coin des Crêpes  |

In [601]:
menu = menu.with_column('TEMPERATURE_OPTION', F.when(F.col('ITEM_SUBCATEGORY') == 'Cold Option', 0).when(
    F.col('ITEM_SUBCATEGORY') == 'Warm Option', 1).otherwise(2))
menu = menu.select('MENU_ITEM_ID', 'MENU_TYPE_ID', 'TEMPERATURE_OPTION', 'COST_OF_GOODS_USD', 'ITEM_CATEGORY') #Add item category and ohe in pandas
menu.show()

--------------------------------------------------------------------------------------------------
|"MENU_ITEM_ID"  |"MENU_TYPE_ID"  |"TEMPERATURE_OPTION"  |"COST_OF_GOODS_USD"  |"ITEM_CATEGORY"  |
--------------------------------------------------------------------------------------------------
|10              |1               |0                     |0.6500               |Beverage         |
|11              |1               |0                     |2.5000               |Dessert          |
|12              |1               |0                     |2.5000               |Dessert          |
|13              |1               |0                     |3.0000               |Dessert          |
|14              |1               |0                     |0.5000               |Beverage         |
|15              |1               |0                     |0.5000               |Beverage         |
|16              |1               |0                     |0.7500               |Beverage         |
|17       

In [602]:
final_df = final_df.join(menu, on= ['MENU_ITEM_ID'], lsuffix = "", rsuffix = "_01")

In [603]:
final_df.show()

-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|"MENU_ITEM_ID"  |"TRUCK_ID"  |"ORDER_YEAR"  |"ORDER_MONTH"  |"DEMAND"  |"UNIT_PRICE"   |"EV_FLAG"  |"DAYS_OPENED"  |"MENU_TYPE_ID"  |"TEMPERATURE_OPTION"  |"COST_OF_GOODS_USD"  |"ITEM_CATEGORY"  |
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|112             |446         |2021          |3              |49108     |12.0000000000  |0          |655            |11              |2                     |4.5000               |Main             |
|114             |446         |2021          |3              |11042     |2.0000000000   |0          |655            |11              |0                     |0.5000               |Beverage         |
|125      

In [604]:
final_df.sort(['ORDER_YEAR', 'ORDER_MONTH'], ascending=[False, False]).show()

-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|"MENU_ITEM_ID"  |"TRUCK_ID"  |"ORDER_YEAR"  |"ORDER_MONTH"  |"DEMAND"  |"UNIT_PRICE"   |"EV_FLAG"  |"DAYS_OPENED"  |"MENU_TYPE_ID"  |"TEMPERATURE_OPTION"  |"COST_OF_GOODS_USD"  |"ITEM_CATEGORY"  |
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|61              |396         |2022          |11             |1504      |5.0000000000   |1          |1021           |6               |2                     |1.5000               |Main             |
|36              |393         |2022          |11             |309       |3.0000000000   |0          |382            |3               |0                     |0.7500               |Beverage         |
|75       

In [605]:
import feature_engine
from feature_engine.encoding import OneHotEncoder

final_df = final_df.to_pandas()
ohe_enc = OneHotEncoder(
    top_categories=None,
    drop_last=False,
    variables= ['ITEM_CATEGORY'])  # to return k-1, false to return k


ohe_enc.fit(final_df)
final_df = ohe_enc.transform(final_df)

In [606]:
final_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60291 entries, 0 to 60290
Data columns (total 15 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   MENU_ITEM_ID            60291 non-null  int16  
 1   TRUCK_ID                60291 non-null  int16  
 2   ORDER_YEAR              60291 non-null  int16  
 3   ORDER_MONTH             60291 non-null  int8   
 4   DEMAND                  60291 non-null  int64  
 5   UNIT_PRICE              60291 non-null  object 
 6   EV_FLAG                 60291 non-null  int8   
 7   DAYS_OPENED             60291 non-null  int32  
 8   MENU_TYPE_ID            60291 non-null  int8   
 9   TEMPERATURE_OPTION      60291 non-null  int8   
 10  COST_OF_GOODS_USD       60291 non-null  float64
 11  ITEM_CATEGORY_Main      60291 non-null  int32  
 12  ITEM_CATEGORY_Beverage  60291 non-null  int32  
 13  ITEM_CATEGORY_Dessert   60291 non-null  int32  
 14  ITEM_CATEGORY_Snack     60291 non-null

In [607]:
final_df.corr()

  final_df.corr()


Unnamed: 0,MENU_ITEM_ID,TRUCK_ID,ORDER_YEAR,ORDER_MONTH,DEMAND,EV_FLAG,DAYS_OPENED,MENU_TYPE_ID,TEMPERATURE_OPTION,COST_OF_GOODS_USD,ITEM_CATEGORY_Main,ITEM_CATEGORY_Beverage,ITEM_CATEGORY_Dessert,ITEM_CATEGORY_Snack
MENU_ITEM_ID,1.0,0.029032,0.016474,0.003368,0.132419,-0.126577,-0.032044,0.998925,0.035485,0.06011,0.070062,0.117394,-0.346218,-0.05989
TRUCK_ID,0.029032,1.0,0.025548,-0.003171,0.239371,0.075438,-0.044308,0.029083,-0.006986,0.004133,-0.004954,0.003132,0.008007,-0.004291
ORDER_YEAR,0.016474,0.025548,1.0,-0.161856,-0.011683,0.308498,-0.540929,0.016692,-0.004068,-0.005345,0.000978,0.00364,-0.000222,-0.010105
ORDER_MONTH,0.003368,-0.003171,-0.161856,1.0,-0.068573,-0.007347,-0.032374,0.003372,0.00058,0.00123,-0.000358,0.00033,-0.001694,0.001816
DEMAND,0.132419,0.239371,-0.011683,-0.068573,1.0,0.023086,-0.014647,0.151374,0.574723,0.501136,0.55756,-0.590446,-0.036196,0.118869
EV_FLAG,-0.126577,0.075438,0.308498,-0.007347,0.023086,1.0,-0.554058,-0.126992,-0.012098,-0.020064,0.008221,-0.015059,0.026077,-0.01162
DAYS_OPENED,-0.032044,-0.044308,-0.540929,-0.032374,-0.014647,-0.554058,1.0,-0.032443,0.006993,0.008885,-0.001551,-0.006809,0.00148,0.017174
MENU_TYPE_ID,0.998925,0.029083,0.016692,0.003372,0.151374,-0.126992,-0.032443,1.0,0.061084,0.08406,0.095795,0.095413,-0.34931,-0.064737
TEMPERATURE_OPTION,0.035485,-0.006986,-0.004068,0.00058,0.574723,-0.012098,0.006993,0.061084,1.0,0.801614,0.8475,-0.776153,-0.205802,0.065245
COST_OF_GOODS_USD,0.06011,0.004133,-0.005345,0.00123,0.501136,-0.020064,0.008885,0.08406,0.801614,1.0,0.816159,-0.756692,-0.108054,-0.00997


In [608]:
# Scaling

#drop target
#target = final_df['DEMAND']
#final_df.drop('DEMAND', axis = 1, inplace = True)

#scaler = StandardScaler()
#scaler.fit(final_df)

#final_scaled = scaler.transform(final_df)
#final_scaled = pd.DataFrame(final_scaled, columns=final_df.columns)

#assign target variable back to scaled data frame
#final_scaled = final_scaled.assign(DEMAND = target)
#final_scaled.head()
final_scaled = final_df
final_scaled['UNIT_PRICE'] = final_scaled['UNIT_PRICE'].astype(float)
drop = final_scaled[((final_scaled["ORDER_YEAR"] == 2022) & (final_scaled['ORDER_MONTH'] == 11))]
merge = pd.merge(drop, final_scaled,how='outer', indicator=True)
final_scaled = merge[merge['_merge'] == 'right_only'].drop('_merge', axis = 1)
final_scaled.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 57291 entries, 3000 to 60290
Data columns (total 15 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   MENU_ITEM_ID            57291 non-null  int16  
 1   TRUCK_ID                57291 non-null  int16  
 2   ORDER_YEAR              57291 non-null  int16  
 3   ORDER_MONTH             57291 non-null  int8   
 4   DEMAND                  57291 non-null  int64  
 5   UNIT_PRICE              57291 non-null  float64
 6   EV_FLAG                 57291 non-null  int8   
 7   DAYS_OPENED             57291 non-null  int32  
 8   MENU_TYPE_ID            57291 non-null  int8   
 9   TEMPERATURE_OPTION      57291 non-null  int8   
 10  COST_OF_GOODS_USD       57291 non-null  float64
 11  ITEM_CATEGORY_Main      57291 non-null  int32  
 12  ITEM_CATEGORY_Beverage  57291 non-null  int32  
 13  ITEM_CATEGORY_Dessert   57291 non-null  int32  
 14  ITEM_CATEGORY_Snack     57291 non-n

In [609]:
final_scaled.head()

Unnamed: 0,MENU_ITEM_ID,TRUCK_ID,ORDER_YEAR,ORDER_MONTH,DEMAND,UNIT_PRICE,EV_FLAG,DAYS_OPENED,MENU_TYPE_ID,TEMPERATURE_OPTION,COST_OF_GOODS_USD,ITEM_CATEGORY_Main,ITEM_CATEGORY_Beverage,ITEM_CATEGORY_Dessert,ITEM_CATEGORY_Snack
3000,131,433,2022,4,21003,13.0,0,565,13,2,6.0,1,0,0,0
3001,134,433,2022,4,4629,2.0,0,565,13,0,0.5,0,1,0,0
3002,23,422,2022,4,10738,12.0,1,290,2,1,7.0,1,0,0,0
3003,51,425,2022,4,21286,17.25,1,290,5,2,8.0,1,0,0,0
3004,53,425,2022,4,20971,17.25,1,290,5,2,7.0,1,0,0,0


In [610]:
holdout = final_scaled[(final_scaled["ORDER_YEAR"] == 2022) & (final_scaled['ORDER_MONTH'] >= 8)]
x_holdout = holdout.drop('DEMAND',axis=1)
y_holdout = holdout['DEMAND']
holdout.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8976 entries, 3037 to 60238
Data columns (total 15 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   MENU_ITEM_ID            8976 non-null   int16  
 1   TRUCK_ID                8976 non-null   int16  
 2   ORDER_YEAR              8976 non-null   int16  
 3   ORDER_MONTH             8976 non-null   int8   
 4   DEMAND                  8976 non-null   int64  
 5   UNIT_PRICE              8976 non-null   float64
 6   EV_FLAG                 8976 non-null   int8   
 7   DAYS_OPENED             8976 non-null   int32  
 8   MENU_TYPE_ID            8976 non-null   int8   
 9   TEMPERATURE_OPTION      8976 non-null   int8   
 10  COST_OF_GOODS_USD       8976 non-null   float64
 11  ITEM_CATEGORY_Main      8976 non-null   int32  
 12  ITEM_CATEGORY_Beverage  8976 non-null   int32  
 13  ITEM_CATEGORY_Dessert   8976 non-null   int32  
 14  ITEM_CATEGORY_Snack     8976 non-nul

In [611]:
merged = pd.merge(holdout, final_scaled,how='outer', indicator=True)
final_scaled = merged[merged['_merge'] == 'right_only'].drop('_merge', axis = 1)
final_scaled.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 48315 entries, 8976 to 57290
Data columns (total 15 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   MENU_ITEM_ID            48315 non-null  int16  
 1   TRUCK_ID                48315 non-null  int16  
 2   ORDER_YEAR              48315 non-null  int16  
 3   ORDER_MONTH             48315 non-null  int8   
 4   DEMAND                  48315 non-null  int64  
 5   UNIT_PRICE              48315 non-null  float64
 6   EV_FLAG                 48315 non-null  int8   
 7   DAYS_OPENED             48315 non-null  int32  
 8   MENU_TYPE_ID            48315 non-null  int8   
 9   TEMPERATURE_OPTION      48315 non-null  int8   
 10  COST_OF_GOODS_USD       48315 non-null  float64
 11  ITEM_CATEGORY_Main      48315 non-null  int32  
 12  ITEM_CATEGORY_Beverage  48315 non-null  int32  
 13  ITEM_CATEGORY_Dessert   48315 non-null  int32  
 14  ITEM_CATEGORY_Snack     48315 non-n

In [612]:
# Train Test Split
# Define Model Inputs (X) and Output (y)
X = final_scaled.drop('DEMAND',axis=1)
y = final_scaled["DEMAND"]

# Split both Inputs (X) and Ouput (y) into training set (70%) and testing set (30%)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

In [613]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from math import sqrt

# Create the model "lr"
lr = LinearRegression()

# Fit the model to the training set
lr.fit(X_train, y_train)

# Make predictions on the training and testing sets
train_predictions = lr.predict(X_train)
test_predictions = lr.predict(X_test)

# Calculate RMSE for the training and testing sets
train_rmse = sqrt(mean_squared_error(y_train, train_predictions))
test_rmse = sqrt(mean_squared_error(y_test, test_predictions))

# Calculate R-squared values for the training and testing sets
train_r2 = r2_score(y_train, train_predictions)
test_r2 = r2_score(y_test, test_predictions)

# Print the results
print('Train RMSE:', train_rmse)
print('Test RMSE:', test_rmse)
print('Train R-squared:', train_r2)
print('Test R-squared:', test_r2)


Train RMSE: 10819.089566587065
Test RMSE: 10923.294480236722
Train R-squared: 0.5362171038256829
Test R-squared: 0.5367334206291553


In [614]:
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor(n_estimators = 10, random_state = 0)
rf.fit(X_train, y_train)

# Make predictions on the training and testing sets
train_predictions = rf.predict(X_train)
test_predictions = rf.predict(X_test)

# Calculate RMSE for the training and testing sets
train_rmse = sqrt(mean_squared_error(y_train, train_predictions))
test_rmse = sqrt(mean_squared_error(y_test, test_predictions))

# Calculate R-squared values for the training and testing sets
train_r2 = r2_score(y_train, train_predictions)
test_r2 = r2_score(y_test, test_predictions)

# Print the results
print('Train RMSE:', train_rmse)
print('Test RMSE:', test_rmse)
print('Train R-squared:', train_r2)
print('Test R-squared:', test_r2)

Train RMSE: 124.61734782314329
Test RMSE: 276.2199803239853
Train R-squared: 0.9999384695198331
Test R-squared: 0.9997037667065813


In [615]:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(rf, X_train, y_train, cv = 5)
scores.mean()

0.9995526291883529

In [616]:
import xgboost as xg

# create an xgboost regression model
xgb = xg.XGBRegressor()

# Fitting the model
xgb.fit(X_train, y_train)
  
# Make predictions on the training and testing sets
train_predictions = xgb.predict(X_train)
test_predictions = xgb.predict(X_test)

# Calculate RMSE for the training and testing sets
train_rmse = sqrt(mean_squared_error(y_train, train_predictions))
test_rmse = sqrt(mean_squared_error(y_test, test_predictions))

# Calculate R-squared values for the training and testing sets
train_r2 = r2_score(y_train, train_predictions)
test_r2 = r2_score(y_test, test_predictions)

# Print the results
print('Train RMSE:', train_rmse)
print('Test RMSE:', test_rmse)
print('Train R-squared:', train_r2)
print('Test R-squared:', test_r2)

  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


Train RMSE: 352.72726028713834
Test RMSE: 392.3411399542947
Train R-squared: 0.9995070403799713
Test R-squared: 0.999402343859219


### Holdout 

In [617]:
# Make predictions on the training and testing sets
ho_predictions = rf.predict(x_holdout)

# Calculate RMSE for the training and testing sets
ho_rmse = sqrt(mean_squared_error(y_holdout, ho_predictions))

# Calculate R-squared values for the training and testing sets
ho_r2 = r2_score(y_holdout, ho_predictions)

# Print the results
print('Train RMSE:', ho_rmse)
print('Train R-squared:', ho_r2)

Train RMSE: 644.0397181000709
Train R-squared: 0.9984080793267814


In [618]:
ho_predictions

array([10497.2, 10364.1, 46297.6, ..., 11355.9, 11244. , 11316.2])

In [619]:
y_holdout

3037     10380
3038     10318
3039     46249
3040      7696
3043      6518
         ...  
60234    36416
60235    51120
60236    11486
60237    11577
60238    11493
Name: DEMAND, Length: 8976, dtype: int64