In [53]:
# Import Python packages
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import json
import sys
import cachetools
from datetime import datetime
from datetime import date
import matplotlib.pyplot as plt 
import seaborn as sns
import math

# Import Snowflake modules
from snowflake.snowpark import Session
import snowflake.snowpark.functions as F
import snowflake.snowpark.types as T
from snowflake.snowpark import Window
from snowflake.snowpark.functions import col

# Clustering & Model
from sklearn.cluster import KMeans
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold, cross_val_score, train_test_split

# Scalers
from sklearn.preprocessing import StandardScaler

# Evaluation
from sklearn.metrics import silhouette_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

# find the best combination of model hyperparameters
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import cross_val_score, cross_validate

# Getting Password,Username, Account
import getpass

In [54]:
# Get account credentials from a json file
with open("account.json") as f:
    data = json.load(f)
    username = data["username"]
    password = data["password"]
    account = data["account"]

# Specify connection parameters
connection_parameters = {
    "account": account,
    "user": username,
    "password": password,
    #"role": "ACCOUNTADMIN",
    #"warehouse": "tasty_ds_wh",
    #"database": "NGEE_ANN_POLYTECHNIC_FROSTBYTE_DATA_SHARE",
    #"schema": "analytics",
}

# Create Snowpark session
session = Session.builder.configs(connection_parameters).create()

In [55]:
order_header_df = session.table("NGEE_ANN_POLYTECHNIC_FROSTBYTE_DATA_SHARE.raw_pos.order_header")
order_detail_df = session.table("NGEE_ANN_POLYTECHNIC_FROSTBYTE_DATA_SHARE.raw_pos.order_detail")
menu_df = session.table("NGEE_ANN_POLYTECHNIC_FROSTBYTE_DATA_SHARE.raw_pos.menu")
truck_df = session.table("NGEE_ANN_POLYTECHNIC_FROSTBYTE_DATA_SHARE.raw_pos.truck")

In [56]:
order_header_df.show()

-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|"ORDER_ID"  |"TRUCK_ID"  |"LOCATION_ID"  |"CUSTOMER_ID"  |"DISCOUNT_ID"  |"SHIFT_ID"  |"SHIFT_START_TIME"  |"SHIFT_END_TIME"  |"ORDER_CHANNEL"  |"ORDER_TS"           |"SERVED_TS"  |"ORDER_CURRENCY"  |"ORDER_AMOUNT"  |"ORDER_TAX_AMOUNT"  |"ORDER_DISCOUNT_AMOUNT"  |"ORDER_TOTAL"  |
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|82686170    |239         |6185.0         |NULL           |NULL           |232187      |15:30:00            |22:30:00          |NULL             |2021-04-

In [57]:
order_detail_df.show()

--------------------------------------------------------------------------------------------------------------------------------------------------------
|"ORDER_DETAIL_ID"  |"ORDER_ID"  |"MENU_ITEM_ID"  |"DISCOUNT_ID"  |"LINE_NUMBER"  |"QUANTITY"  |"UNIT_PRICE"  |"PRICE"  |"ORDER_ITEM_DISCOUNT_AMOUNT"  |
--------------------------------------------------------------------------------------------------------------------------------------------------------
|798202495          |420427395   |131             |NULL           |0              |1           |13.0000       |13.0000  |NULL                          |
|798202496          |420427396   |133             |NULL           |0              |1           |6.0000        |6.0000   |NULL                          |
|798202497          |420427397   |133             |NULL           |0              |1           |6.0000        |6.0000   |NULL                          |
|798202498          |420427398   |133             |NULL           |0              

In [58]:
menu_df.show()

-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|"MENU_ID"  |"MENU_TYPE_ID"  |"MENU_TYPE"  |"TRUCK_BRAND_NAME"  |"MENU_ITEM_ID"  |"MENU_ITEM_NAME"        |"ITEM_CATEGORY"  |"ITEM_SUBCATEGORY"  |"COST_OF_GOODS_USD"  |"SALE_PRICE_USD"  |"MENU_ITEM_HEALTH_METRICS_OBJ"     |
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|10013      |2               |BBQ          |Smoky BBQ           |23              |Pulled Pork Sandwich    |Main             |Warm Option         |7.0000               |12.0000           |{                                  |
|           |                |             |                    |                |                      

In [59]:
#Minimum order transaction by a truck
order_header_df.group_by("TRUCK_ID").agg(F.count("ORDER_ID").alias('count')).agg(F.min('count')).show()

----------------
|"MIN(COUNT)"  |
----------------
|8850          |
----------------



In [60]:
#order_header_df.drop_duplicates('ORDER_ID').group_by('ORDER_ID').agg(F.count('ORDER_ID').alias('nnn')).agg(F.sum('NNN')).show()

In [61]:
#order_detail_df.drop_duplicates('ORDER_ID').group_by('ORDER_ID').agg(F.count('ORDER_ID').alias('nnn')).agg(F.sum('NNN')).show()

In [62]:
order_truck = order_header_df.select('ORDER_ID', 'TRUCK_ID', 'ORDER_TS')
df = order_detail_df.join(order_truck, order_detail_df.ORDER_ID == order_truck.ORDER_ID, how = "left", lsuffix = "", rsuffix = "_01")

In [63]:
df.show()

-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|"ORDER_DETAIL_ID"  |"ORDER_ID"  |"MENU_ITEM_ID"  |"DISCOUNT_ID"  |"LINE_NUMBER"  |"QUANTITY"  |"UNIT_PRICE"  |"PRICE"  |"ORDER_ITEM_DISCOUNT_AMOUNT"  |"ORDER_ID_01"  |"TRUCK_ID"  |"ORDER_TS"           |
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|798202497          |420427397   |133             |NULL           |0              |1           |6.0000        |6.0000   |NULL                          |420427397      |433         |2022-04-10 19:42:18  |
|798202503          |420427400   |131             |NULL           |0              |4           |13.0000       |52.0000  |NULL                          |420427400      |433         |202

In [64]:
df = df.with_column('ORDER_YEAR', F.year(F.col('ORDER_TS')))
df = df.with_column('ORDER_MONTH', F.month(F.col('ORDER_TS')))

In [65]:
#df.show()

In [66]:
#df.describe().show()

In [67]:
demand_df = df.group_by("TRUCK_ID", 'MENU_ITEM_ID', 'ORDER_YEAR', 'ORDER_MONTH').agg(F.sum("QUANTITY").alias('DEMAND'))
#demand_df.sort('TRUCK_ID').show()

In [68]:
unit_price = df.group_by("TRUCK_ID", 'MENU_ITEM_ID', 'ORDER_YEAR', 'ORDER_MONTH').agg(F.mean("unit_price").alias('unit_price'))
#unit_price.sort('TRUCK_ID').show()

In [69]:
final_df = demand_df.join(unit_price, on= ['TRUCK_ID', 'MENU_ITEM_ID', 'ORDER_YEAR', 'ORDER_MONTH'], lsuffix = "", rsuffix = "_01")

In [70]:
truck_df.show()

-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|"TRUCK_ID"  |"MENU_TYPE_ID"  |"PRIMARY_CITY"  |"REGION"    |"ISO_REGION"  |"COUNTRY"      |"ISO_COUNTRY_CODE"  |"FRANCHISE_FLAG"  |"YEAR"  |"MAKE"        |"MODEL"           |"EV_FLAG"  |"FRANCHISE_ID"  |"TRUCK_OPENING_DATE"  |
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|3           |3               |San Mateo       |California  |CA            |United States  |US                  |1                 |2004    |Freightliner  |MT45 Utilimaster  |0          |2               |2021-10-01            |
|4           |4               |San Mateo       |California  |CA            |United State

In [71]:
#truck_df = truck_df.filter(F.col('COUNTRY') == 'United States')

In [72]:
truck = truck_df.with_column('LAST_DATE', F.iff(F.col("TRUCK_ID") == F.col('TRUCK_ID'), "2022-10-18", '0'))
truck = truck.withColumn("DAYS_OPENED", F.datediff("day", F.col("TRUCK_OPENING_DATE"), F.col('LAST_DATE')))
#truck = truck.select('TRUCK_ID', 'PRIMARY_CITY', 'REGION', 'COUNTRY', 'DAYS_OPENED')
truck = truck.select('TRUCK_ID','EV_FLAG','DAYS_OPENED')

truck.show()

------------------------------------------
|"TRUCK_ID"  |"EV_FLAG"  |"DAYS_OPENED"  |
------------------------------------------
|3           |0          |382            |
|4           |1          |655            |
|5           |1          |200            |
|6           |0          |1205           |
|7           |0          |655            |
|8           |1          |200            |
|9           |0          |839            |
|12          |0          |565            |
|13          |0          |474            |
|14          |0          |747            |
------------------------------------------



In [73]:
final_df = final_df.join(truck, on= ['TRUCK_ID'], lsuffix = "", rsuffix = "_01")

In [74]:
menu = menu_df.drop('MENU_ITEM_HEALTH_METRICS_OBJ')
menu.show()

-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|"MENU_ID"  |"MENU_TYPE_ID"  |"MENU_TYPE"   |"TRUCK_BRAND_NAME"  |"MENU_ITEM_ID"  |"MENU_ITEM_NAME"       |"ITEM_CATEGORY"  |"ITEM_SUBCATEGORY"  |"COST_OF_GOODS_USD"  |"SALE_PRICE_USD"  |
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|10026      |3               |Tacos         |Guac n' Roll        |37              |Chicken Burrito        |Main             |Warm Option         |3.2500               |12.5000           |
|10027      |3               |Tacos         |Guac n' Roll        |38              |Lean Burrito Bowl      |Main             |Warm Option         |3.5000               |12.5000           |
|10028      |3               |Tacos         |Guac n' Roll   

In [75]:
menu = menu.with_column('TEMPERATURE_OPTION', F.when(F.col('ITEM_SUBCATEGORY') == 'Cold Option', 0).when(
    F.col('ITEM_SUBCATEGORY') == 'Warm Option', 1).otherwise(2))
menu = menu.select('MENU_ITEM_ID', 'MENU_TYPE_ID', 'TEMPERATURE_OPTION', 'COST_OF_GOODS_USD', 'ITEM_CATEGORY') #Add item category and ohe in pandas
menu.show()

--------------------------------------------------------------------------------------------------
|"MENU_ITEM_ID"  |"MENU_TYPE_ID"  |"TEMPERATURE_OPTION"  |"COST_OF_GOODS_USD"  |"ITEM_CATEGORY"  |
--------------------------------------------------------------------------------------------------
|23              |2               |1                     |7.0000               |Main             |
|24              |2               |0                     |0.5000               |Beverage         |
|25              |2               |0                     |0.5000               |Beverage         |
|26              |2               |0                     |0.7500               |Beverage         |
|27              |2               |0                     |2.2500               |Snack            |
|28              |2               |2                     |11.2500              |Main             |
|29              |2               |1                     |1.2500               |Snack            |
|31       

In [76]:
final_df = final_df.join(menu, on= ['MENU_ITEM_ID'], lsuffix = "", rsuffix = "_01")

In [77]:
#final_df.show()

In [78]:
#final_df.sort(['ORDER_YEAR', 'ORDER_MONTH'], ascending=[False, False]).show()

In [79]:
import feature_engine
from feature_engine.encoding import OneHotEncoder

final_df = final_df.to_pandas()
ohe_enc = OneHotEncoder(
    top_categories=None,
    drop_last=False,
    variables= ['ITEM_CATEGORY'])  # to return k-1, false to return k


ohe_enc.fit(final_df)
final_df = ohe_enc.transform(final_df)

In [80]:
final_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60291 entries, 0 to 60290
Data columns (total 15 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   MENU_ITEM_ID            60291 non-null  int16  
 1   TRUCK_ID                60291 non-null  int16  
 2   ORDER_YEAR              60291 non-null  int16  
 3   ORDER_MONTH             60291 non-null  int8   
 4   DEMAND                  60291 non-null  int64  
 5   UNIT_PRICE              60291 non-null  object 
 6   EV_FLAG                 60291 non-null  int8   
 7   DAYS_OPENED             60291 non-null  int32  
 8   MENU_TYPE_ID            60291 non-null  int8   
 9   TEMPERATURE_OPTION      60291 non-null  int8   
 10  COST_OF_GOODS_USD       60291 non-null  float64
 11  ITEM_CATEGORY_Beverage  60291 non-null  int32  
 12  ITEM_CATEGORY_Main      60291 non-null  int32  
 13  ITEM_CATEGORY_Dessert   60291 non-null  int32  
 14  ITEM_CATEGORY_Snack     60291 non-null

In [81]:
final_df.corr()

  final_df.corr()


Unnamed: 0,MENU_ITEM_ID,TRUCK_ID,ORDER_YEAR,ORDER_MONTH,DEMAND,EV_FLAG,DAYS_OPENED,MENU_TYPE_ID,TEMPERATURE_OPTION,COST_OF_GOODS_USD,ITEM_CATEGORY_Beverage,ITEM_CATEGORY_Main,ITEM_CATEGORY_Dessert,ITEM_CATEGORY_Snack
MENU_ITEM_ID,1.0,0.029032,0.016474,0.003368,0.132419,-0.126577,-0.032044,0.998925,0.035485,0.06011,0.117394,0.070062,-0.346218,-0.05989
TRUCK_ID,0.029032,1.0,0.025548,-0.003171,0.239371,0.075438,-0.044308,0.029083,-0.006986,0.004133,0.003132,-0.004954,0.008007,-0.004291
ORDER_YEAR,0.016474,0.025548,1.0,-0.161856,-0.011683,0.308498,-0.540929,0.016692,-0.004068,-0.005345,0.00364,0.000978,-0.000222,-0.010105
ORDER_MONTH,0.003368,-0.003171,-0.161856,1.0,-0.068573,-0.007347,-0.032374,0.003372,0.00058,0.00123,0.00033,-0.000358,-0.001694,0.001816
DEMAND,0.132419,0.239371,-0.011683,-0.068573,1.0,0.023086,-0.014647,0.151374,0.574723,0.501136,-0.590446,0.55756,-0.036196,0.118869
EV_FLAG,-0.126577,0.075438,0.308498,-0.007347,0.023086,1.0,-0.554058,-0.126992,-0.012098,-0.020064,-0.015059,0.008221,0.026077,-0.01162
DAYS_OPENED,-0.032044,-0.044308,-0.540929,-0.032374,-0.014647,-0.554058,1.0,-0.032443,0.006993,0.008885,-0.006809,-0.001551,0.00148,0.017174
MENU_TYPE_ID,0.998925,0.029083,0.016692,0.003372,0.151374,-0.126992,-0.032443,1.0,0.061084,0.08406,0.095413,0.095795,-0.34931,-0.064737
TEMPERATURE_OPTION,0.035485,-0.006986,-0.004068,0.00058,0.574723,-0.012098,0.006993,0.061084,1.0,0.801614,-0.776153,0.8475,-0.205802,0.065245
COST_OF_GOODS_USD,0.06011,0.004133,-0.005345,0.00123,0.501136,-0.020064,0.008885,0.08406,0.801614,1.0,-0.756692,0.816159,-0.108054,-0.00997


In [82]:
# Scaling

#drop target
#target = final_df['DEMAND']
#final_df.drop('DEMAND', axis = 1, inplace = True)

#scaler = StandardScaler()
#scaler.fit(final_df)

#final_scaled = scaler.transform(final_df)
#final_scaled = pd.DataFrame(final_scaled, columns=final_df.columns)

#assign target variable back to scaled data frame
#final_scaled = final_scaled.assign(DEMAND = target)
#final_scaled.head()
final_scaled = final_df
final_scaled['UNIT_PRICE'] = final_scaled['UNIT_PRICE'].astype(float)
drop = final_scaled[((final_scaled["ORDER_YEAR"] == 2022) & (final_scaled['ORDER_MONTH'] == 11))]
merge = pd.merge(drop, final_scaled,how='outer', indicator=True)
final_scaled = merge[merge['_merge'] == 'right_only'].drop('_merge', axis = 1)
final_scaled.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 57291 entries, 3000 to 60290
Data columns (total 15 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   MENU_ITEM_ID            57291 non-null  int16  
 1   TRUCK_ID                57291 non-null  int16  
 2   ORDER_YEAR              57291 non-null  int16  
 3   ORDER_MONTH             57291 non-null  int8   
 4   DEMAND                  57291 non-null  int64  
 5   UNIT_PRICE              57291 non-null  float64
 6   EV_FLAG                 57291 non-null  int8   
 7   DAYS_OPENED             57291 non-null  int32  
 8   MENU_TYPE_ID            57291 non-null  int8   
 9   TEMPERATURE_OPTION      57291 non-null  int8   
 10  COST_OF_GOODS_USD       57291 non-null  float64
 11  ITEM_CATEGORY_Beverage  57291 non-null  int32  
 12  ITEM_CATEGORY_Main      57291 non-null  int32  
 13  ITEM_CATEGORY_Dessert   57291 non-null  int32  
 14  ITEM_CATEGORY_Snack     57291 non-n

In [83]:
final_scaled.head()

Unnamed: 0,MENU_ITEM_ID,TRUCK_ID,ORDER_YEAR,ORDER_MONTH,DEMAND,UNIT_PRICE,EV_FLAG,DAYS_OPENED,MENU_TYPE_ID,TEMPERATURE_OPTION,COST_OF_GOODS_USD,ITEM_CATEGORY_Beverage,ITEM_CATEGORY_Main,ITEM_CATEGORY_Dessert,ITEM_CATEGORY_Snack
3000,116,116,2021,8,7280,3.0,0,1021,11,0,0.75,1,0,0,0
3001,86,113,2021,8,7394,3.0,0,1386,8,0,0.75,1,0,0,0
3002,126,297,2021,6,10869,3.0,0,839,12,0,0.75,1,0,0,0
3003,145,299,2021,6,10695,3.0,0,655,14,0,0.5,1,0,0,0
3004,112,296,2021,6,47669,12.0,0,747,11,2,4.5,0,1,0,0


In [84]:
holdout = final_scaled[(final_scaled["ORDER_YEAR"] == 2022) & (final_scaled['ORDER_MONTH'] >= 8)]
x_holdout = holdout.drop('DEMAND',axis=1)
y_holdout = holdout['DEMAND']
holdout.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8976 entries, 3029 to 60288
Data columns (total 15 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   MENU_ITEM_ID            8976 non-null   int16  
 1   TRUCK_ID                8976 non-null   int16  
 2   ORDER_YEAR              8976 non-null   int16  
 3   ORDER_MONTH             8976 non-null   int8   
 4   DEMAND                  8976 non-null   int64  
 5   UNIT_PRICE              8976 non-null   float64
 6   EV_FLAG                 8976 non-null   int8   
 7   DAYS_OPENED             8976 non-null   int32  
 8   MENU_TYPE_ID            8976 non-null   int8   
 9   TEMPERATURE_OPTION      8976 non-null   int8   
 10  COST_OF_GOODS_USD       8976 non-null   float64
 11  ITEM_CATEGORY_Beverage  8976 non-null   int32  
 12  ITEM_CATEGORY_Main      8976 non-null   int32  
 13  ITEM_CATEGORY_Dessert   8976 non-null   int32  
 14  ITEM_CATEGORY_Snack     8976 non-nul

In [85]:
merged = pd.merge(holdout, final_scaled,how='outer', indicator=True)
final_scaled = merged[merged['_merge'] == 'right_only'].drop('_merge', axis = 1)
final_scaled.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 48315 entries, 8976 to 57290
Data columns (total 15 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   MENU_ITEM_ID            48315 non-null  int16  
 1   TRUCK_ID                48315 non-null  int16  
 2   ORDER_YEAR              48315 non-null  int16  
 3   ORDER_MONTH             48315 non-null  int8   
 4   DEMAND                  48315 non-null  int64  
 5   UNIT_PRICE              48315 non-null  float64
 6   EV_FLAG                 48315 non-null  int8   
 7   DAYS_OPENED             48315 non-null  int32  
 8   MENU_TYPE_ID            48315 non-null  int8   
 9   TEMPERATURE_OPTION      48315 non-null  int8   
 10  COST_OF_GOODS_USD       48315 non-null  float64
 11  ITEM_CATEGORY_Beverage  48315 non-null  int32  
 12  ITEM_CATEGORY_Main      48315 non-null  int32  
 13  ITEM_CATEGORY_Dessert   48315 non-null  int32  
 14  ITEM_CATEGORY_Snack     48315 non-n

In [86]:
# Train Test Split
# Define Model Inputs (X) and Output (y)
X = final_scaled.drop('DEMAND',axis=1)
y = final_scaled["DEMAND"]

# Split both Inputs (X) and Ouput (y) into training set (70%) and testing set (30%)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

In [87]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from math import sqrt

# Create the model "lr"
lr = LinearRegression()

# Fit the model to the training set
lr.fit(X_train, y_train)

# Make predictions on the training and testing sets
train_predictions = lr.predict(X_train)
test_predictions = lr.predict(X_test)

# Calculate RMSE for the training and testing sets
train_rmse = sqrt(mean_squared_error(y_train, train_predictions))
test_rmse = sqrt(mean_squared_error(y_test, test_predictions))

# Calculate R-squared values for the training and testing sets
train_r2 = r2_score(y_train, train_predictions)
test_r2 = r2_score(y_test, test_predictions)

# Print the results
print('Train RMSE:', train_rmse)
print('Test RMSE:', test_rmse)
print('Train R-squared:', train_r2)
print('Test R-squared:', test_r2)


Train RMSE: 10841.934883513983
Test RMSE: 10871.012743488347
Train R-squared: 0.5381119711422004
Test R-squared: 0.5322648893680376


In [88]:
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor(n_estimators = 10, random_state = 0)
rf.fit(X_train, y_train)

# Make predictions on the training and testing sets
train_predictions = rf.predict(X_train)
test_predictions = rf.predict(X_test)

# Calculate RMSE for the training and testing sets
train_rmse = sqrt(mean_squared_error(y_train, train_predictions))
test_rmse = sqrt(mean_squared_error(y_test, test_predictions))

# Calculate R-squared values for the training and testing sets
train_r2 = r2_score(y_train, train_predictions)
test_r2 = r2_score(y_test, test_predictions)

# Print the results
print('Train RMSE:', train_rmse)
print('Test RMSE:', test_rmse)
print('Train R-squared:', train_r2)
print('Test R-squared:', test_r2)

Train RMSE: 122.80028517607703
Test RMSE: 279.8780111498858
Train R-squared: 0.9999407454276689
Test R-squared: 0.9996899744261462


In [89]:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(rf, X_train, y_train, cv = 5)
scores.mean()

0.9996193758231662

In [90]:
import xgboost as xg

# create an xgboost regression model
xgb = xg.XGBRegressor()

# Fitting the model
xgb.fit(X_train, y_train)
  
# Make predictions on the training and testing sets
train_predictions = xgb.predict(X_train)
test_predictions = xgb.predict(X_test)

# Calculate RMSE for the training and testing sets
train_rmse = sqrt(mean_squared_error(y_train, train_predictions))
test_rmse = sqrt(mean_squared_error(y_test, test_predictions))

# Calculate R-squared values for the training and testing sets
train_r2 = r2_score(y_train, train_predictions)
test_r2 = r2_score(y_test, test_predictions)

# Print the results
print('Train RMSE:', train_rmse)
print('Test RMSE:', test_rmse)
print('Train R-squared:', train_r2)
print('Test R-squared:', test_r2)

  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


Train RMSE: 344.64219987970546
Test RMSE: 386.3716076637079
Train R-squared: 0.9995332761172356
Test R-squared: 0.9994091593437655


### Holdout 

In [91]:
# Make predictions on the training and testing sets
ho_predictions = rf.predict(x_holdout)

# Calculate RMSE for the training and testing sets
ho_rmse = sqrt(mean_squared_error(y_holdout, ho_predictions))

# Calculate R-squared values for the training and testing sets
ho_r2 = r2_score(y_holdout, ho_predictions)

# Print the results
print('Train RMSE:', ho_rmse)
print('Train R-squared:', ho_r2)

Train RMSE: 630.244995797796
Train R-squared: 0.9984755438655901


In [92]:
import pickle
filename = 'inventory_model.sav'
pickle.dump(rf, open(filename, 'wb'))

### Additional

In [100]:
dd_df = session.table("NGEE_ANN_POLYTECHNIC_FROSTBYTE_DATA_SHARE.raw_supply_chain.distribution_detail")
dh_df = session.table("NGEE_ANN_POLYTECHNIC_FROSTBYTE_DATA_SHARE.raw_supply_chain.distribution_header")
eod_df = session.table("NGEE_ANN_POLYTECHNIC_FROSTBYTE_DATA_SHARE.raw_supply_chain.eod_stock_assignment")
reci_df = session.table("NGEE_ANN_POLYTECHNIC_FROSTBYTE_DATA_SHARE.raw_supply_chain.RECIPE")

In [94]:
dd_df.show()

------------------------------------------------------------------------------------------------------------------------------------------------------
|"DH_DETAIL_ID"  |"DH_ID"  |"LINE_ITEM_ID"  |"ITEM_ID"  |"QUANTITY"      |"EXPIRATION_DATE"  |"PO_ID"  |"CREATED_DATE"              |"UPDATED_DATE"  |
------------------------------------------------------------------------------------------------------------------------------------------------------
|698170          |54436    |9               |22         |2087.000000000  |2022-07-06         |26724    |2023-04-13 22:18:29.306000  |NULL            |
|698171          |54456    |3               |3          |22.000000000    |2023-05-31         |26589    |2023-04-13 22:18:29.306000  |NULL            |
|698172          |54548    |3               |21         |28.000000000    |2022-07-06         |26777    |2023-04-13 22:18:29.306000  |NULL            |
|698173          |54315    |8               |37         |1899.000000000  |2022-07-06         |

In [95]:
dh_df.show()

-------------------------------------------------------------------------------------------------------------
|"DH_ID"  |"TRUCK_ID"  |"WAREHOUSE_ID"  |"DISTRIBUTION_DATE"  |"CREATED_DATE"              |"UPDATED_DATE"  |
-------------------------------------------------------------------------------------------------------------
|44678    |23          |2               |2023-02-27           |2023-04-10 15:24:34.910000  |NULL            |
|45051    |195         |13              |2023-02-27           |2023-04-10 15:24:34.910000  |NULL            |
|45013    |300         |20              |2023-02-27           |2023-04-10 15:24:34.910000  |NULL            |
|44798    |112         |8               |2023-02-27           |2023-04-10 15:24:34.910000  |NULL            |
|44799    |3           |1               |2023-02-27           |2023-04-10 15:24:34.910000  |NULL            |
|44760    |262         |18              |2023-02-27           |2023-04-10 15:24:34.910000  |NULL            |
|44761    

In [96]:
eod_df.show()

--------------------------------------------------------------------------------------------------------
|"ASSIGNMENT_ID"  |"TRUCK_ID"  |"ITEM_ID"  |"PO_ID"  |"QUANTITY"  |"EXPIRATION_DATE"  |"CREATED_DATE"  |
--------------------------------------------------------------------------------------------------------
|196177           |149         |1          |420      |0.05        |2023-10-05         |2022-11-21      |
|4338188          |324         |1          |20386    |1.34        |2023-12-26         |2023-02-09      |
|4431933          |60          |1          |20700    |4.78        |2023-12-29         |2023-02-20      |
|5580458          |390         |2          |27116    |0.45        |2023-06-14         |2022-07-24      |
|600409           |305         |1          |1792     |5.22        |2023-11-21         |2023-01-06      |
|4630723          |145         |3          |22407    |2.15        |2024-01-23         |2023-03-15      |
|600428           |313         |1          |1792     |4

In [97]:
d_df = dd_df.join(dh_df, on= ['DH_ID'], lsuffix = "", rsuffix = "_01")
d_df = d_df.with_column('DISTRIBUTION_YEAR', F.year(F.col('DISTRIBUTION_DATE')))
d_df = d_df.with_column('DISTRIBUTION_MONTH', F.month(F.col('DISTRIBUTION_DATE')))
d_df = d_df.group_by("TRUCK_ID","ITEM_ID",'DISTRIBUTION_YEAR', 'DISTRIBUTION_MONTH').agg(F.sum("QUANTITY").alias('Supply'))

In [99]:
# Get account credentials from a json file
with open("account.json") as f:
    data = json.load(f)
    username = data["username"]
    password = data["password"]
    account = data["account"]

# Specify connection parameters
connection_parameters = {
    "account": account,
    "user": username,
    "password": password,
    "role": "ACCOUNTADMIN",
    #"warehouse": "tasty_ds_wh",
    "database": "FROSTBYTE_TASTY_BYTES",
    "schema": "analytics",
}

# Create Snowpark session
sessionWrite = Session.builder.configs(connection_parameters).create()

final_scaled_sf = sessionWrite.create_dataframe(final_scaled)
# Save X data
final_scaled_sf.write.mode("overwrite").save_as_table("frostbyte_tasty_bytes.analytics.inventory_management")

# Save y data
d_df.write.mode("overwrite").save_as_table("frostbyte_tasty_bytes.analytics.monthly_supply")

  success, nchunks, nrows, ci_output = write_pandas(


In [105]:

demand_item = final_scaled_sf.join(reci_df, on= ['menu_item_id'], lsuffix = "", rsuffix = "_01")

In [108]:
demand_item = demand_item.with_column('Demand_item', F.col('UNIT_QUANTITY') * F.col('DEMAND'))
demand_item = demand_item.group_by("TRUCK_ID","ITEM_ID",'ORDER_YEAR', 'ORDER_MONTH').agg(F.sum("DEMAND_ITEM").alias('DEMAND_ITEM'))

In [111]:
demand_item.write.mode("overwrite").save_as_table("frostbyte_tasty_bytes.analytics.monthly_demand")

In [112]:
demand_item.show()

---------------------------------------------------------------------------
|"TRUCK_ID"  |"ITEM_ID"  |"ORDER_YEAR"  |"ORDER_MONTH"  |"DEMAND_ITEM"    |
---------------------------------------------------------------------------
|116         |3          |2021          |8              |364.000000000    |
|113         |3          |2021          |8              |369.700000000    |
|297         |3          |2021          |6              |543.450000000    |
|299         |2          |2021          |6              |534.750000000    |
|296         |70         |2021          |6              |14355.300000000  |
|296         |41         |2021          |6              |9532.800000000   |
|296         |22         |2021          |6              |4766.400000000   |
|296         |74         |2021          |6              |4766.900000000   |
|296         |61         |2021          |6              |14355.300000000  |
|293         |2          |2021          |6              |550.000000000    |
------------

In [114]:
d_df.sort("DISTRIBUTION_YEAR").show()

----------------------------------------------------------------------------------------
|"TRUCK_ID"  |"ITEM_ID"  |"DISTRIBUTION_YEAR"  |"DISTRIBUTION_MONTH"  |"SUPPLY"        |
----------------------------------------------------------------------------------------
|430         |66         |2022                 |12                    |834.000000000   |
|375         |40         |2022                 |12                    |1445.000000000  |
|87          |38         |2022                 |12                    |4481.000000000  |
|427         |60         |2022                 |12                    |390.000000000   |
|254         |2          |2022                 |12                    |458.000000000   |
|6           |56         |2022                 |12                    |65.000000000    |
|413         |56         |2022                 |12                    |774.000000000   |
|128         |61         |2022                 |12                    |499.000000000   |
|287         |2      