In [13]:
# Import Python packages
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import json
import sys
import cachetools
from datetime import datetime
from datetime import date
import matplotlib.pyplot as plt 
import seaborn as sns
import math

# Import Snowflake modules
from snowflake.snowpark import Session
import snowflake.snowpark.functions as F
import snowflake.snowpark.types as T
from snowflake.snowpark import Window
from snowflake.snowpark.functions import col

# Clustering & Model
from sklearn.cluster import KMeans
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold, cross_val_score, train_test_split

# Scalers
from sklearn.preprocessing import StandardScaler

# Evaluation
from sklearn.metrics import silhouette_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

# find the best combination of model hyperparameters
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import cross_val_score, cross_validate

# Getting Password,Username, Account
import getpass

In [14]:
# Get account credentials from a json file
with open("account.json") as f:
    data = json.load(f)
    username = data["username"]
    password = data["password"]
    account = data["account"]

# Specify connection parameters
connection_parameters = {
    "account": account,
    "user": username,
    "password": password,
    #"role": "ACCOUNTADMIN",
    #"warehouse": "tasty_ds_wh",
    #"database": "NGEE_ANN_POLYTECHNIC_FROSTBYTE_DATA_SHARE",
    #"schema": "analytics",
}

# Create Snowpark session
session = Session.builder.configs(connection_parameters).create()

In [15]:
order_header_df = session.table("NGEE_ANN_POLYTECHNIC_FROSTBYTE_DATA_SHARE.raw_pos.order_header")
order_detail_df = session.table("NGEE_ANN_POLYTECHNIC_FROSTBYTE_DATA_SHARE.raw_pos.order_detail")
menu_df = session.table("NGEE_ANN_POLYTECHNIC_FROSTBYTE_DATA_SHARE.raw_pos.menu")
truck_df = session.table("NGEE_ANN_POLYTECHNIC_FROSTBYTE_DATA_SHARE.raw_pos.truck")

In [16]:
order_header_df.show()

-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|"ORDER_ID"  |"TRUCK_ID"  |"LOCATION_ID"  |"CUSTOMER_ID"  |"DISCOUNT_ID"  |"SHIFT_ID"  |"SHIFT_START_TIME"  |"SHIFT_END_TIME"  |"ORDER_CHANNEL"  |"ORDER_TS"           |"SERVED_TS"  |"ORDER_CURRENCY"  |"ORDER_AMOUNT"  |"ORDER_TAX_AMOUNT"  |"ORDER_DISCOUNT_AMOUNT"  |"ORDER_TOTAL"  |
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|22477378    |97          |3713.0         |NULL           |NULL           |78881       |16:00:00            |23:00:00          |NULL             |2020-04-

In [17]:
order_detail_df.show()

--------------------------------------------------------------------------------------------------------------------------------------------------------
|"ORDER_DETAIL_ID"  |"ORDER_ID"  |"MENU_ITEM_ID"  |"DISCOUNT_ID"  |"LINE_NUMBER"  |"QUANTITY"  |"UNIT_PRICE"  |"PRICE"  |"ORDER_ITEM_DISCOUNT_AMOUNT"  |
--------------------------------------------------------------------------------------------------------------------------------------------------------
|798202495          |420427395   |131             |NULL           |0              |1           |13.0000       |13.0000  |NULL                          |
|798202496          |420427396   |133             |NULL           |0              |1           |6.0000        |6.0000   |NULL                          |
|798202497          |420427397   |133             |NULL           |0              |1           |6.0000        |6.0000   |NULL                          |
|798202498          |420427398   |133             |NULL           |0              

In [18]:
menu_df.show()

---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|"MENU_ID"  |"MENU_TYPE_ID"  |"MENU_TYPE"  |"TRUCK_BRAND_NAME"  |"MENU_ITEM_ID"  |"MENU_ITEM_NAME"    |"ITEM_CATEGORY"  |"ITEM_SUBCATEGORY"  |"COST_OF_GOODS_USD"  |"SALE_PRICE_USD"  |"MENU_ITEM_HEALTH_METRICS_OBJ"     |
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|10001      |1               |Ice Cream    |Freezing Point      |10              |Lemonade            |Beverage         |Cold Option         |0.6500               |3.5000            |{                                  |
|           |                |             |                    |                |                    |                 

In [19]:
#Minimum order transaction by a truck
order_header_df.group_by("TRUCK_ID").agg(F.count("ORDER_ID").alias('count')).agg(F.min('count')).show()

----------------
|"MIN(COUNT)"  |
----------------
|8850          |
----------------



In [20]:
#order_header_df.drop_duplicates('ORDER_ID').group_by('ORDER_ID').agg(F.count('ORDER_ID').alias('nnn')).agg(F.sum('NNN')).show()

In [21]:
#order_detail_df.drop_duplicates('ORDER_ID').group_by('ORDER_ID').agg(F.count('ORDER_ID').alias('nnn')).agg(F.sum('NNN')).show()

In [22]:
order_truck = order_header_df.select('ORDER_ID', 'TRUCK_ID', 'ORDER_TS')
df = order_detail_df.join(order_truck, order_detail_df.ORDER_ID == order_truck.ORDER_ID, how = "left", lsuffix = "", rsuffix = "_01")

In [23]:
df.show()

-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|"ORDER_DETAIL_ID"  |"ORDER_ID"  |"MENU_ITEM_ID"  |"DISCOUNT_ID"  |"LINE_NUMBER"  |"QUANTITY"  |"UNIT_PRICE"  |"PRICE"  |"ORDER_ITEM_DISCOUNT_AMOUNT"  |"ORDER_ID_01"  |"TRUCK_ID"  |"ORDER_TS"           |
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|798202497          |420427397   |133             |NULL           |0              |1           |6.0000        |6.0000   |NULL                          |420427397      |433         |2022-04-10 19:42:18  |
|798202503          |420427400   |131             |NULL           |0              |4           |13.0000       |52.0000  |NULL                          |420427400      |433         |202

In [24]:
df = df.with_column('ORDER_YEAR', F.year(F.col('ORDER_TS')))
df = df.with_column('ORDER_MONTH', F.month(F.col('ORDER_TS')))
df = df.with_column('ORDER_DAY', F.dayofmonth(F.col('ORDER_TS')))

In [25]:
#df.show()

In [26]:
#df.describe().show()

In [27]:
demand_df = df.group_by("TRUCK_ID", 'MENU_ITEM_ID', 'ORDER_YEAR', 'ORDER_MONTH', 'ORDER_DAY').agg(F.sum("QUANTITY").alias('DEMAND'))
#demand_df.sort('TRUCK_ID').show()

In [28]:
unit_price = df.group_by("TRUCK_ID", 'MENU_ITEM_ID', 'ORDER_YEAR', 'ORDER_MONTH', 'ORDER_DAY').agg(F.mean("unit_price").alias('unit_price'))
#unit_price.sort('TRUCK_ID').show()

In [29]:
final_df = demand_df.join(unit_price, on= ['TRUCK_ID', 'MENU_ITEM_ID', 'ORDER_YEAR', 'ORDER_MONTH', 'ORDER_DAY'], lsuffix = "", rsuffix = "_01")

In [30]:
truck_df.show()

-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|"TRUCK_ID"  |"MENU_TYPE_ID"  |"PRIMARY_CITY"  |"REGION"    |"ISO_REGION"  |"COUNTRY"      |"ISO_COUNTRY_CODE"  |"FRANCHISE_FLAG"  |"YEAR"  |"MAKE"        |"MODEL"           |"EV_FLAG"  |"FRANCHISE_ID"  |"TRUCK_OPENING_DATE"  |
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|3           |3               |San Mateo       |California  |CA            |United States  |US                  |1                 |2004    |Freightliner  |MT45 Utilimaster  |0          |2               |2021-10-01            |
|4           |4               |San Mateo       |California  |CA            |United State

In [31]:
#truck_df = truck_df.filter(F.col('COUNTRY') == 'United States')

In [32]:
truck = truck_df.with_column('LAST_DATE', F.iff(F.col("TRUCK_ID") == F.col('TRUCK_ID'), "2022-10-18", '0'))
truck = truck.withColumn("DAYS_OPENED", F.datediff("day", F.col("TRUCK_OPENING_DATE"), F.col('LAST_DATE')))
#truck = truck.select('TRUCK_ID', 'PRIMARY_CITY', 'REGION', 'COUNTRY', 'DAYS_OPENED')
truck = truck.select('TRUCK_ID','EV_FLAG','DAYS_OPENED')

truck.show()

------------------------------------------
|"TRUCK_ID"  |"EV_FLAG"  |"DAYS_OPENED"  |
------------------------------------------
|3           |0          |382            |
|4           |1          |655            |
|5           |1          |200            |
|6           |0          |1205           |
|7           |0          |655            |
|8           |1          |200            |
|9           |0          |839            |
|12          |0          |565            |
|13          |0          |474            |
|14          |0          |747            |
------------------------------------------



In [33]:
final_df = final_df.join(truck, on= ['TRUCK_ID'], lsuffix = "", rsuffix = "_01")

In [34]:
menu = menu_df.drop('MENU_ITEM_HEALTH_METRICS_OBJ')
menu.show()

----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|"MENU_ID"  |"MENU_TYPE_ID"  |"MENU_TYPE"  |"TRUCK_BRAND_NAME"  |"MENU_ITEM_ID"  |"MENU_ITEM_NAME"           |"ITEM_CATEGORY"  |"ITEM_SUBCATEGORY"  |"COST_OF_GOODS_USD"  |"SALE_PRICE_USD"  |
----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|10088      |13              |Chinese      |Peking Truck        |136             |Ice Tea                    |Beverage         |Cold Option         |0.7500               |3.0000            |
|10089      |14              |Indian       |Nani's Kitchen      |141             |Lean Chicken Tikka Masala  |Main             |Hot Option          |10.0000              |17.0000           |
|10090      |14              |Indian       |N

In [35]:
menu = menu.with_column('TEMPERATURE_OPTION', F.when(F.col('ITEM_SUBCATEGORY') == 'Cold Option', 0).when(
    F.col('ITEM_SUBCATEGORY') == 'Warm Option', 1).otherwise(2))
menu = menu.select('MENU_ITEM_ID', 'MENU_TYPE_ID', 'TEMPERATURE_OPTION', 'COST_OF_GOODS_USD', 'ITEM_CATEGORY') #Add item category and ohe in pandas
menu.show()

--------------------------------------------------------------------------------------------------
|"MENU_ITEM_ID"  |"MENU_TYPE_ID"  |"TEMPERATURE_OPTION"  |"COST_OF_GOODS_USD"  |"ITEM_CATEGORY"  |
--------------------------------------------------------------------------------------------------
|54              |5               |0                     |0.5000               |Beverage         |
|55              |5               |0                     |0.5000               |Beverage         |
|56              |5               |0                     |0.7500               |Beverage         |
|61              |6               |2                     |1.5000               |Main             |
|62              |6               |2                     |2.4000               |Main             |
|63              |6               |2                     |2.6000               |Main             |
|64              |6               |0                     |0.5000               |Beverage         |
|65       

In [36]:
final_df = final_df.join(menu, on= ['MENU_ITEM_ID'], lsuffix = "", rsuffix = "_01")

In [37]:
#final_df.show()

In [38]:
#final_df.sort(['ORDER_YEAR', 'ORDER_MONTH'], ascending=[False, False]).show()

In [39]:
import feature_engine
from feature_engine.encoding import OneHotEncoder

final_df = final_df.to_pandas()
ohe_enc = OneHotEncoder(
    top_categories=None,
    drop_last=False,
    variables= ['ITEM_CATEGORY'])  # to return k-1, false to return k


ohe_enc.fit(final_df)
final_df = ohe_enc.transform(final_df)

In [40]:
final_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1744744 entries, 0 to 1744743
Data columns (total 16 columns):
 #   Column                  Dtype  
---  ------                  -----  
 0   MENU_ITEM_ID            int16  
 1   TRUCK_ID                int16  
 2   ORDER_YEAR              int16  
 3   ORDER_MONTH             int8   
 4   ORDER_DAY               int8   
 5   DEMAND                  int64  
 6   UNIT_PRICE              object 
 7   EV_FLAG                 int8   
 8   DAYS_OPENED             int32  
 9   MENU_TYPE_ID            int8   
 10  TEMPERATURE_OPTION      int8   
 11  COST_OF_GOODS_USD       float64
 12  ITEM_CATEGORY_Main      int32  
 13  ITEM_CATEGORY_Beverage  int32  
 14  ITEM_CATEGORY_Dessert   int32  
 15  ITEM_CATEGORY_Snack     int32  
dtypes: float64(1), int16(3), int32(5), int64(1), int8(5), object(1)
memory usage: 91.5+ MB


In [41]:
final_df.corr()

  final_df.corr()


Unnamed: 0,MENU_ITEM_ID,TRUCK_ID,ORDER_YEAR,ORDER_MONTH,ORDER_DAY,DEMAND,EV_FLAG,DAYS_OPENED,MENU_TYPE_ID,TEMPERATURE_OPTION,COST_OF_GOODS_USD,ITEM_CATEGORY_Main,ITEM_CATEGORY_Beverage,ITEM_CATEGORY_Dessert,ITEM_CATEGORY_Snack
MENU_ITEM_ID,1.0,0.028822,0.017816,0.004362,0.000118,0.137943,-0.127218,-0.035291,0.998926,0.035217,0.059955,0.069602,0.11747,-0.345721,-0.060066
TRUCK_ID,0.028822,1.0,0.025549,-0.004239,-0.000364,0.252452,0.074173,-0.044074,0.028875,-0.007379,0.004212,-0.005346,0.003138,0.008973,-0.004416
ORDER_YEAR,0.017816,0.025549,1.0,-0.229731,-0.005063,0.03623,0.309228,-0.545806,0.018038,-0.003929,-0.005019,0.001168,0.003783,-0.000898,-0.010138
ORDER_MONTH,0.004362,-0.004239,-0.229731,1.0,0.008547,-0.007377,-0.035662,0.000941,0.004355,0.001051,0.002253,-0.000146,0.000282,-0.002883,0.002676
ORDER_DAY,0.000118,-0.000364,-0.005063,0.008547,1.0,-0.000248,-0.001154,0.001257,0.000118,8.9e-05,6.7e-05,3.7e-05,1.1e-05,-0.000157,5.7e-05
DEMAND,0.137943,0.252452,0.03623,-0.007377,-0.000248,1.0,0.04628,-0.043923,0.157803,0.601486,0.523795,0.583752,-0.618409,-0.037051,0.123866
EV_FLAG,-0.127218,0.074173,0.309228,-0.035662,-0.001154,0.04628,1.0,-0.544327,-0.127633,-0.011775,-0.019977,0.008565,-0.015115,0.025461,-0.011567
DAYS_OPENED,-0.035291,-0.044074,-0.545806,0.000941,0.001257,-0.043923,-0.544327,1.0,-0.035691,0.006602,0.007808,-0.002041,-0.007092,0.003333,0.016966
MENU_TYPE_ID,0.998926,0.028875,0.018038,0.004355,0.000118,0.157803,-0.127633,-0.035691,1.0,0.060816,0.083902,0.095344,0.095486,-0.348804,-0.064939
TEMPERATURE_OPTION,0.035217,-0.007379,-0.003929,0.001051,8.9e-05,0.601486,-0.011775,0.006602,0.060816,1.0,0.801789,0.847621,-0.776413,-0.20551,0.064798


In [42]:
# Scaling

#drop target
#target = final_df['DEMAND']
#final_df.drop('DEMAND', axis = 1, inplace = True)

#scaler = StandardScaler()
#scaler.fit(final_df)

#final_scaled = scaler.transform(final_df)
#final_scaled = pd.DataFrame(final_scaled, columns=final_df.columns)

#assign target variable back to scaled data frame
#final_scaled = final_scaled.assign(DEMAND = target)
#final_scaled.head()
final_scaled = final_df
final_scaled['UNIT_PRICE'] = final_scaled['UNIT_PRICE'].astype(float)
drop = final_scaled[((final_scaled["ORDER_YEAR"] == 2022) & (final_scaled['ORDER_MONTH'] == 11))]
merge = pd.merge(drop, final_scaled,how='outer', indicator=True)
final_scaled = merge[merge['_merge'] == 'right_only'].drop('_merge', axis = 1)
final_scaled.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 1741744 entries, 3000 to 1744743
Data columns (total 16 columns):
 #   Column                  Dtype  
---  ------                  -----  
 0   MENU_ITEM_ID            int16  
 1   TRUCK_ID                int16  
 2   ORDER_YEAR              int16  
 3   ORDER_MONTH             int8   
 4   ORDER_DAY               int8   
 5   DEMAND                  int64  
 6   UNIT_PRICE              float64
 7   EV_FLAG                 int8   
 8   DAYS_OPENED             int32  
 9   MENU_TYPE_ID            int8   
 10  TEMPERATURE_OPTION      int8   
 11  COST_OF_GOODS_USD       float64
 12  ITEM_CATEGORY_Main      int32  
 13  ITEM_CATEGORY_Beverage  int32  
 14  ITEM_CATEGORY_Dessert   int32  
 15  ITEM_CATEGORY_Snack     int32  
dtypes: float64(2), int16(3), int32(5), int64(1), int8(5)
memory usage: 104.6 MB


In [43]:
final_scaled.head()

Unnamed: 0,MENU_ITEM_ID,TRUCK_ID,ORDER_YEAR,ORDER_MONTH,ORDER_DAY,DEMAND,UNIT_PRICE,EV_FLAG,DAYS_OPENED,MENU_TYPE_ID,TEMPERATURE_OPTION,COST_OF_GOODS_USD,ITEM_CATEGORY_Main,ITEM_CATEGORY_Beverage,ITEM_CATEGORY_Dessert,ITEM_CATEGORY_Snack
3000,52,425,2022,4,11,581,17.25,1,290,5,2,7.0,1,0,0,0
3001,76,427,2022,4,11,146,3.0,0,1021,7,0,0.75,0,1,0,0
3002,82,428,2022,4,11,566,15.0,0,565,8,2,6.0,1,0,0,0
3003,96,429,2022,4,11,129,3.0,1,290,9,0,0.75,0,1,0,0
3004,131,283,2022,6,1,1545,13.0,0,747,13,2,6.0,1,0,0,0


In [44]:
holdout = final_scaled[(final_scaled["ORDER_YEAR"] == 2022) & (final_scaled['ORDER_MONTH'] >= 8)]
x_holdout = holdout.drop('DEMAND',axis=1)
y_holdout = holdout['DEMAND']
holdout.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 275268 entries, 3147 to 1744729
Data columns (total 16 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   MENU_ITEM_ID            275268 non-null  int16  
 1   TRUCK_ID                275268 non-null  int16  
 2   ORDER_YEAR              275268 non-null  int16  
 3   ORDER_MONTH             275268 non-null  int8   
 4   ORDER_DAY               275268 non-null  int8   
 5   DEMAND                  275268 non-null  int64  
 6   UNIT_PRICE              275268 non-null  float64
 7   EV_FLAG                 275268 non-null  int8   
 8   DAYS_OPENED             275268 non-null  int32  
 9   MENU_TYPE_ID            275268 non-null  int8   
 10  TEMPERATURE_OPTION      275268 non-null  int8   
 11  COST_OF_GOODS_USD       275268 non-null  float64
 12  ITEM_CATEGORY_Main      275268 non-null  int32  
 13  ITEM_CATEGORY_Beverage  275268 non-null  int32  
 14  ITEM_CATEGORY_De

In [45]:
merged = pd.merge(holdout, final_scaled,how='outer', indicator=True)
final_scaled = merged[merged['_merge'] == 'right_only'].drop('_merge', axis = 1)
final_scaled.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1466476 entries, 275268 to 1741743
Data columns (total 16 columns):
 #   Column                  Non-Null Count    Dtype  
---  ------                  --------------    -----  
 0   MENU_ITEM_ID            1466476 non-null  int16  
 1   TRUCK_ID                1466476 non-null  int16  
 2   ORDER_YEAR              1466476 non-null  int16  
 3   ORDER_MONTH             1466476 non-null  int8   
 4   ORDER_DAY               1466476 non-null  int8   
 5   DEMAND                  1466476 non-null  int64  
 6   UNIT_PRICE              1466476 non-null  float64
 7   EV_FLAG                 1466476 non-null  int8   
 8   DAYS_OPENED             1466476 non-null  int32  
 9   MENU_TYPE_ID            1466476 non-null  int8   
 10  TEMPERATURE_OPTION      1466476 non-null  int8   
 11  COST_OF_GOODS_USD       1466476 non-null  float64
 12  ITEM_CATEGORY_Main      1466476 non-null  int32  
 13  ITEM_CATEGORY_Beverage  1466476 non-null  int32  
 1

In [46]:
# Train Test Split
# Define Model Inputs (X) and Output (y)
X = final_scaled.drop('DEMAND',axis=1)
y = final_scaled["DEMAND"]

# Split both Inputs (X) and Ouput (y) into training set (70%) and testing set (30%)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

In [47]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from math import sqrt

# Create the model "lr"
lr = LinearRegression()

# Fit the model to the training set
lr.fit(X_train, y_train)

# Make predictions on the training and testing sets
train_predictions = lr.predict(X_train)
test_predictions = lr.predict(X_test)

# Calculate RMSE for the training and testing sets
train_rmse = sqrt(mean_squared_error(y_train, train_predictions))
test_rmse = sqrt(mean_squared_error(y_test, test_predictions))

# Calculate R-squared values for the training and testing sets
train_r2 = r2_score(y_train, train_predictions)
test_r2 = r2_score(y_test, test_predictions)

# Print the results
print('Train RMSE:', train_rmse)
print('Test RMSE:', test_rmse)
print('Train R-squared:', train_r2)
print('Test R-squared:', test_r2)


Train RMSE: 362.0862209041853
Test RMSE: 361.7573131297499
Train R-squared: 0.5307418838247802
Test R-squared: 0.5297188374868678


In [48]:
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor(n_estimators = 10, random_state = 0)
rf.fit(X_train, y_train)

# Make predictions on the training and testing sets
train_predictions = rf.predict(X_train)
test_predictions = rf.predict(X_test)

# Calculate RMSE for the training and testing sets
train_rmse = sqrt(mean_squared_error(y_train, train_predictions))
test_rmse = sqrt(mean_squared_error(y_test, test_predictions))

# Calculate R-squared values for the training and testing sets
train_r2 = r2_score(y_train, train_predictions)
test_r2 = r2_score(y_test, test_predictions)

# Print the results
print('Train RMSE:', train_rmse)
print('Test RMSE:', test_rmse)
print('Train R-squared:', train_r2)
print('Test R-squared:', test_r2)

Train RMSE: 18.801883883711778
Test RMSE: 44.16343453557039
Train R-squared: 0.9987347086793764
Test R-squared: 0.992991119983407


In [49]:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(rf, X_train, y_train, cv = 5)
scores.mean()

0.9920526061817314

In [50]:
import xgboost as xg

# create an xgboost regression model
xgb = xg.XGBRegressor()

# Fitting the model
xgb.fit(X_train, y_train)
  
# Make predictions on the training and testing sets
train_predictions = xgb.predict(X_train)
test_predictions = xgb.predict(X_test)

# Calculate RMSE for the training and testing sets
train_rmse = sqrt(mean_squared_error(y_train, train_predictions))
test_rmse = sqrt(mean_squared_error(y_test, test_predictions))

# Calculate R-squared values for the training and testing sets
train_r2 = r2_score(y_train, train_predictions)
test_r2 = r2_score(y_test, test_predictions)

# Print the results
print('Train RMSE:', train_rmse)
print('Test RMSE:', test_rmse)
print('Train R-squared:', train_r2)
print('Test R-squared:', test_r2)

  from pandas import MultiIndex, Int64Index
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


Train RMSE: 59.247932563692224
Test RMSE: 59.58716602519269
Train R-squared: 0.9874358186233528
Test R-squared: 0.9872406542529708


### Holdout 

In [51]:
# Make predictions on the training and testing sets
ho_predictions = rf.predict(x_holdout)

# Calculate RMSE for the training and testing sets
ho_rmse = sqrt(mean_squared_error(y_holdout, ho_predictions))

# Calculate R-squared values for the training and testing sets
ho_r2 = r2_score(y_holdout, ho_predictions)

# Print the results
print('Holdout RMSE:', ho_rmse)
print('Holdout R-squared:', ho_r2)

Holdout RMSE: 77.12349617990647
Holdout R-squared: 0.9787962591714067


In [52]:
import pickle
filename = 'inventory_model.sav'
pickle.dump(rf, open(filename, 'wb'))

### Additional

In [53]:
dd_df = session.table("NGEE_ANN_POLYTECHNIC_FROSTBYTE_DATA_SHARE.raw_supply_chain.distribution_detail")
dh_df = session.table("NGEE_ANN_POLYTECHNIC_FROSTBYTE_DATA_SHARE.raw_supply_chain.distribution_header")
eod_df = session.table("NGEE_ANN_POLYTECHNIC_FROSTBYTE_DATA_SHARE.raw_supply_chain.eod_stock_assignment")
reci_df = session.table("NGEE_ANN_POLYTECHNIC_FROSTBYTE_DATA_SHARE.raw_supply_chain.RECIPE")

In [54]:
dd_df.show()

------------------------------------------------------------------------------------------------------------------------------------------------------
|"DH_DETAIL_ID"  |"DH_ID"  |"LINE_ITEM_ID"  |"ITEM_ID"  |"QUANTITY"      |"EXPIRATION_DATE"  |"PO_ID"  |"CREATED_DATE"              |"UPDATED_DATE"  |
------------------------------------------------------------------------------------------------------------------------------------------------------
|698170          |54436    |9               |22         |2087.000000000  |2022-07-06         |26724    |2023-04-13 22:18:29.306000  |NULL            |
|698171          |54456    |3               |3          |22.000000000    |2023-05-31         |26589    |2023-04-13 22:18:29.306000  |NULL            |
|698172          |54548    |3               |21         |28.000000000    |2022-07-06         |26777    |2023-04-13 22:18:29.306000  |NULL            |
|698173          |54315    |8               |37         |1899.000000000  |2022-07-06         |

In [55]:
dh_df.show()

-------------------------------------------------------------------------------------------------------------
|"DH_ID"  |"TRUCK_ID"  |"WAREHOUSE_ID"  |"DISTRIBUTION_DATE"  |"CREATED_DATE"              |"UPDATED_DATE"  |
-------------------------------------------------------------------------------------------------------------
|44678    |23          |2               |2023-02-27           |2023-04-10 15:24:34.910000  |NULL            |
|45051    |195         |13              |2023-02-27           |2023-04-10 15:24:34.910000  |NULL            |
|45013    |300         |20              |2023-02-27           |2023-04-10 15:24:34.910000  |NULL            |
|44798    |112         |8               |2023-02-27           |2023-04-10 15:24:34.910000  |NULL            |
|44799    |3           |1               |2023-02-27           |2023-04-10 15:24:34.910000  |NULL            |
|44760    |262         |18              |2023-02-27           |2023-04-10 15:24:34.910000  |NULL            |
|44761    

In [56]:
eod_df.show()

--------------------------------------------------------------------------------------------------------
|"ASSIGNMENT_ID"  |"TRUCK_ID"  |"ITEM_ID"  |"PO_ID"  |"QUANTITY"  |"EXPIRATION_DATE"  |"CREATED_DATE"  |
--------------------------------------------------------------------------------------------------------
|196177           |149         |1          |420      |0.05        |2023-10-05         |2022-11-21      |
|4338188          |324         |1          |20386    |1.34        |2023-12-26         |2023-02-09      |
|4431933          |60          |1          |20700    |4.78        |2023-12-29         |2023-02-20      |
|5580458          |390         |2          |27116    |0.45        |2023-06-14         |2022-07-24      |
|600409           |305         |1          |1792     |5.22        |2023-11-21         |2023-01-06      |
|4630723          |145         |3          |22407    |2.15        |2024-01-23         |2023-03-15      |
|600428           |313         |1          |1792     |4

In [57]:
d_df = dd_df.join(dh_df, on= ['DH_ID'], lsuffix = "", rsuffix = "_01")
d_df = d_df.with_column('DISTRIBUTION_YEAR', F.year(F.col('DISTRIBUTION_DATE')))
d_df = d_df.with_column('DISTRIBUTION_MONTH', F.month(F.col('DISTRIBUTION_DATE')))
d_df = d_df.with_column('DISTRIBUTION_DAY', F.dayofmonth(F.col('DISTRIBUTION_DATE')))
d_df = d_df.group_by("TRUCK_ID","ITEM_ID",'DISTRIBUTION_YEAR', 'DISTRIBUTION_MONTH', 'DISTRIBUTION_DAY').agg(F.sum("QUANTITY").alias('Supply'))

In [58]:
# Get account credentials from a json file
with open("account.json") as f:
    data = json.load(f)
    username = data["username"]
    password = data["password"]
    account = data["account"]

# Specify connection parameters
connection_parameters = {
    "account": account,
    "user": username,
    "password": password,
    "role": "ACCOUNTADMIN",
    #"warehouse": "tasty_ds_wh",
    "database": "FROSTBYTE_TASTY_BYTES",
    "schema": "analytics",
}

# Create Snowpark session
sessionWrite = Session.builder.configs(connection_parameters).create()

final_scaled_sf = sessionWrite.create_dataframe(final_scaled)
# Save X data
final_scaled_sf.write.mode("overwrite").save_as_table("frostbyte_tasty_bytes.analytics.inventory_management")

# Save y data
d_df.write.mode("overwrite").save_as_table("frostbyte_tasty_bytes.analytics.monthly_supply")

  success, nchunks, nrows, ci_output = write_pandas(


In [61]:

demand_item = final_scaled_sf.join(reci_df, on= ['menu_item_id'], lsuffix = "", rsuffix = "_01")

In [62]:
demand_item = demand_item.with_column('Demand_item', F.col('UNIT_QUANTITY') * F.col('DEMAND'))
demand_item = demand_item.group_by("TRUCK_ID","ITEM_ID",'ORDER_YEAR', 'ORDER_MONTH', 'ORDER_DAY').agg(F.sum("DEMAND_ITEM").alias('DEMAND_ITEM'))

In [63]:
demand_item.write.mode("overwrite").save_as_table("frostbyte_tasty_bytes.analytics.monthly_demand")

In [64]:
demand_item.show()

---------------------------------------------------------------------------------------
|"TRUCK_ID"  |"ITEM_ID"  |"ORDER_YEAR"  |"ORDER_MONTH"  |"ORDER_DAY"  |"DEMAND_ITEM"  |
---------------------------------------------------------------------------------------
|425         |46         |2022          |4              |11           |44.950000000   |
|425         |48         |2022          |4              |11           |30.250000000   |
|425         |51         |2022          |4              |11           |179.800000000  |
|425         |66         |2022          |4              |11           |5.810000000    |
|427         |3          |2022          |4              |11           |7.300000000    |
|428         |47         |2022          |4              |11           |33.040000000   |
|428         |40         |2022          |4              |11           |58.460000000   |
|428         |60         |2022          |4              |11           |14.150000000   |
|428         |27         |2022  

In [65]:
d_df.sort("DISTRIBUTION_YEAR").show()

------------------------------------------------------------------------------------------------------------
|"TRUCK_ID"  |"ITEM_ID"  |"DISTRIBUTION_YEAR"  |"DISTRIBUTION_MONTH"  |"DISTRIBUTION_DAY"  |"SUPPLY"       |
------------------------------------------------------------------------------------------------------------
|430         |66         |2022                 |12                    |12                  |184.000000000  |
|413         |56         |2022                 |12                    |12                  |176.000000000  |
|388         |2          |2022                 |12                    |12                  |37.000000000   |
|385         |73         |2022                 |12                    |12                  |420.000000000  |
|296         |26         |2022                 |12                    |12                  |432.000000000  |
|375         |40         |2022                 |12                    |12                  |339.000000000  |
|129         |37   