In [101]:
# Import Python packages
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import json
import sys
import cachetools
from datetime import datetime
from datetime import date
import matplotlib.pyplot as plt 
import seaborn as sns
import math

# Import Snowflake modules
from snowflake.snowpark import Session
import snowflake.snowpark.functions as F
import snowflake.snowpark.types as T
from snowflake.snowpark import Window
from snowflake.snowpark.functions import col

# Clustering & Model
from sklearn.cluster import KMeans
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold, cross_val_score, train_test_split

# Scalers
from sklearn.preprocessing import StandardScaler

# Evaluation
from sklearn.metrics import silhouette_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

# find the best combination of model hyperparameters
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import cross_val_score, cross_validate

# Getting Password,Username, Account
import getpass

In [102]:
# Get account credentials from a json file
with open("account.json") as f:
    data = json.load(f)
    username = data["username"]
    password = data["password"]
    account = data["account"]

# Specify connection parameters
connection_parameters = {
    "account": account,
    "user": username,
    "password": password,
    #"role": "ACCOUNTADMIN",
    #"warehouse": "tasty_ds_wh",
    #"database": "NGEE_ANN_POLYTECHNIC_FROSTBYTE_DATA_SHARE",
    #"schema": "analytics",
}

# Create Snowpark session
session = Session.builder.configs(connection_parameters).create()

In [103]:
order_header_df = session.table("NGEE_ANN_POLYTECHNIC_FROSTBYTE_DATA_SHARE.raw_pos.order_header")
order_detail_df = session.table("NGEE_ANN_POLYTECHNIC_FROSTBYTE_DATA_SHARE.raw_pos.order_detail")
menu_df = session.table("NGEE_ANN_POLYTECHNIC_FROSTBYTE_DATA_SHARE.raw_pos.menu")
truck_df = session.table("NGEE_ANN_POLYTECHNIC_FROSTBYTE_DATA_SHARE.raw_pos.truck")

In [104]:
order_header_df.show()

-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|"ORDER_ID"  |"TRUCK_ID"  |"LOCATION_ID"  |"CUSTOMER_ID"  |"DISCOUNT_ID"  |"SHIFT_ID"  |"SHIFT_START_TIME"  |"SHIFT_END_TIME"  |"ORDER_CHANNEL"  |"ORDER_TS"           |"SERVED_TS"  |"ORDER_CURRENCY"  |"ORDER_AMOUNT"  |"ORDER_TAX_AMOUNT"  |"ORDER_DISCOUNT_AMOUNT"  |"ORDER_TOTAL"  |
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|90240744    |239         |8822.0         |NULL           |NULL           |242333      |15:00:00            |22:00:00          |NULL             |2022-07-

In [105]:
order_detail_df.show()

--------------------------------------------------------------------------------------------------------------------------------------------------------
|"ORDER_DETAIL_ID"  |"ORDER_ID"  |"MENU_ITEM_ID"  |"DISCOUNT_ID"  |"LINE_NUMBER"  |"QUANTITY"  |"UNIT_PRICE"  |"PRICE"  |"ORDER_ITEM_DISCOUNT_AMOUNT"  |
--------------------------------------------------------------------------------------------------------------------------------------------------------
|798202495          |420427395   |131             |NULL           |0              |1           |13.0000       |13.0000  |NULL                          |
|798202496          |420427396   |133             |NULL           |0              |1           |6.0000        |6.0000   |NULL                          |
|798202497          |420427397   |133             |NULL           |0              |1           |6.0000        |6.0000   |NULL                          |
|798202498          |420427398   |133             |NULL           |0              

In [106]:
menu_df.show()

----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|"MENU_ID"  |"MENU_TYPE_ID"  |"MENU_TYPE"  |"TRUCK_BRAND_NAME"  |"MENU_ITEM_ID"  |"MENU_ITEM_NAME"           |"ITEM_CATEGORY"  |"ITEM_SUBCATEGORY"  |"COST_OF_GOODS_USD"  |"SALE_PRICE_USD"  |"MENU_ITEM_HEALTH_METRICS_OBJ"     |
----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|10088      |13              |Chinese      |Peking Truck        |136             |Ice Tea                    |Beverage         |Cold Option         |0.7500               |3.0000            |{                                  |
|           |                |             |                    |                |          

In [107]:
#Minimum order transaction by a truck
order_header_df.group_by("TRUCK_ID").agg(F.count("ORDER_ID").alias('count')).agg(F.min('count')).show()

----------------
|"MIN(COUNT)"  |
----------------
|8850          |
----------------



In [108]:
#order_header_df.drop_duplicates('ORDER_ID').group_by('ORDER_ID').agg(F.count('ORDER_ID').alias('nnn')).agg(F.sum('NNN')).show()

In [109]:
#order_detail_df.drop_duplicates('ORDER_ID').group_by('ORDER_ID').agg(F.count('ORDER_ID').alias('nnn')).agg(F.sum('NNN')).show()

In [110]:
order_truck = order_header_df.select('ORDER_ID', 'TRUCK_ID', 'ORDER_TS')
df = order_detail_df.join(order_truck, order_detail_df.ORDER_ID == order_truck.ORDER_ID, how = "left", lsuffix = "", rsuffix = "_01")

In [111]:
df.show()

-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|"ORDER_DETAIL_ID"  |"ORDER_ID"  |"MENU_ITEM_ID"  |"DISCOUNT_ID"  |"LINE_NUMBER"  |"QUANTITY"  |"UNIT_PRICE"  |"PRICE"  |"ORDER_ITEM_DISCOUNT_AMOUNT"  |"ORDER_ID_01"  |"TRUCK_ID"  |"ORDER_TS"           |
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|798202503          |420427400   |131             |NULL           |0              |4           |13.0000       |52.0000  |NULL                          |420427400      |433         |2022-04-10 19:44:21  |
|798202504          |420427400   |132             |NULL           |1              |4           |11.0000       |44.0000  |NULL                          |420427400      |433         |202

In [112]:
df.agg(F.max("ORDER_TS")).show()

-----------------------
|"MAX(ORDER_TS)"      |
-----------------------
|2022-11-01 22:59:59  |
-----------------------



In [113]:
#df = df.with_column('ORDER_YEAR', F.year(F.col('ORDER_TS')))
#df = df.with_column('ORDER_MONTH', F.month(F.col('ORDER_TS')))
#df = df.with_column('ORDER_DAY', F.dayofmonth(F.col('ORDER_TS')))
df = df.with_column('LAST_DATE', F.iff(F.col('ORDER_TS') == F.col('ORDER_TS'), '2022-11-01', "0"))
df = df.with_column('DAYS_AGO', F.datediff('day', F.col('ORDER_TS'), F.col('LAST_DATE')))

In [114]:
df.show()

--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|"ORDER_DETAIL_ID"  |"ORDER_ID"  |"MENU_ITEM_ID"  |"DISCOUNT_ID"  |"LINE_NUMBER"  |"QUANTITY"  |"UNIT_PRICE"  |"PRICE"  |"ORDER_ITEM_DISCOUNT_AMOUNT"  |"ORDER_ID_01"  |"TRUCK_ID"  |"ORDER_TS"           |"LAST_DATE"  |"DAYS_AGO"  |
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|349467443          |128946761   |121             |NULL           |0              |2           |20.0000       |40.0000  |NULL                          |128946761      |282         |2022-06-01 20:56:26  |2022-11-01   |153         |
|349467444          |128946761   |122             |NULL           |1        

In [115]:
#df.describe().show()

In [116]:
#demand_df = df.group_by("TRUCK_ID", 'MENU_ITEM_ID', 'ORDER_TS').agg(F.sum("QUANTITY").alias('DEMAND')) fix order_ts also hv other codes
demand_df = df.group_by("TRUCK_ID", 'MENU_ITEM_ID', 'DAYS_AGO').agg(F.sum("QUANTITY").alias('DEMAND'))
demand_df.sort('TRUCK_ID').show()

-------------------------------------------------------
|"TRUCK_ID"  |"MENU_ITEM_ID"  |"DAYS_AGO"  |"DEMAND"  |
-------------------------------------------------------
|1           |18              |216         |39        |
|1           |17              |209         |22        |
|1           |19              |206         |38        |
|1           |13              |202         |32        |
|1           |12              |205         |42        |
|1           |10              |209         |10        |
|1           |17              |214         |31        |
|1           |12              |200         |42        |
|1           |15              |204         |12        |
|1           |18              |202         |26        |
-------------------------------------------------------



In [117]:
unit_price = df.group_by("TRUCK_ID", 'MENU_ITEM_ID', 'DAYS_AGO').agg(F.mean("unit_price").alias('unit_price'))
#unit_price = df.group_by("TRUCK_ID", 'MENU_ITEM_ID', 'ORDER_TS').agg(F.mean("unit_price").alias('unit_price'))
#unit_price.sort('TRUCK_ID').show()

In [118]:
final_df = demand_df.join(unit_price, on= ['TRUCK_ID', 'MENU_ITEM_ID', 'DAYS_AGO'], lsuffix = "", rsuffix = "_01")
#final_df = demand_df.join(unit_price, on= ['TRUCK_ID', 'MENU_ITEM_ID', 'ORDER_TS'], lsuffix = "", rsuffix = "_01")

In [119]:
final_df = final_df.withColumn("date", F.iff(F.col('TRUCK_ID') == F.col('TRUCK_ID'), '2022-11-01', '0'))
final_df = final_df.withColumn("date", F.to_date("date"))
final_df = final_df.with_column('ORDER_TS', F.date_sub(F.col('DATE'), F.col('DAYS_AGO')))
final_df = final_df.with_column('ORDER_YEAR', F.year(F.col('ORDER_TS')))
final_df = final_df.with_column('ORDER_MONTH', F.month(F.col('ORDER_TS')))
final_df = final_df.with_column('ORDER_DAY', F.dayofmonth(F.col('ORDER_TS')))

In [120]:
final_df = final_df.drop(['DATE', 'ORDER_TS'])

In [121]:
final_df.show()

--------------------------------------------------------------------------------------------------------------------
|"TRUCK_ID"  |"MENU_ITEM_ID"  |"DAYS_AGO"  |"DEMAND"  |"UNIT_PRICE"   |"ORDER_YEAR"  |"ORDER_MONTH"  |"ORDER_DAY"  |
--------------------------------------------------------------------------------------------------------------------
|88          |134             |193         |322       |2.0000000000   |2022          |4              |22           |
|88          |136             |193         |318       |3.0000000000   |2022          |4              |22           |
|89          |141             |193         |1493      |17.0000000000  |2022          |4              |22           |
|90          |153             |193         |1393      |11.0000000000  |2022          |4              |22           |
|76          |11              |192         |723       |6.0000000000   |2022          |4              |23           |
|76          |14              |192         |252       |2.0000000

In [122]:
truck_df.show()

-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|"TRUCK_ID"  |"MENU_TYPE_ID"  |"PRIMARY_CITY"  |"REGION"    |"ISO_REGION"  |"COUNTRY"      |"ISO_COUNTRY_CODE"  |"FRANCHISE_FLAG"  |"YEAR"  |"MAKE"        |"MODEL"           |"EV_FLAG"  |"FRANCHISE_ID"  |"TRUCK_OPENING_DATE"  |
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|3           |3               |San Mateo       |California  |CA            |United States  |US                  |1                 |2004    |Freightliner  |MT45 Utilimaster  |0          |2               |2021-10-01            |
|4           |4               |San Mateo       |California  |CA            |United State

In [123]:
#truck_df = truck_df.filter(F.col('COUNTRY') == 'United States')

In [124]:
truck = truck_df.with_column('LAST_DATE', F.iff(F.col("TRUCK_ID") == F.col('TRUCK_ID'), "2022-10-18", '0'))
truck = truck.withColumn("DAYS_OPENED", F.datediff("day", F.col("TRUCK_OPENING_DATE"), F.col('LAST_DATE')))
#truck = truck.select('TRUCK_ID', 'PRIMARY_CITY', 'REGION', 'COUNTRY', 'DAYS_OPENED')
truck = truck.select('TRUCK_ID','EV_FLAG','DAYS_OPENED')

truck.show()

------------------------------------------
|"TRUCK_ID"  |"EV_FLAG"  |"DAYS_OPENED"  |
------------------------------------------
|3           |0          |382            |
|4           |1          |655            |
|5           |1          |200            |
|6           |0          |1205           |
|7           |0          |655            |
|8           |1          |200            |
|9           |0          |839            |
|12          |0          |565            |
|13          |0          |474            |
|14          |0          |747            |
------------------------------------------



In [125]:
final_df = final_df.join(truck, on= ['TRUCK_ID'], lsuffix = "", rsuffix = "_01")

In [126]:
menu = menu_df.drop('MENU_ITEM_HEALTH_METRICS_OBJ')
menu.show()

----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|"MENU_ID"  |"MENU_TYPE_ID"  |"MENU_TYPE"     |"TRUCK_BRAND_NAME"  |"MENU_ITEM_ID"  |"MENU_ITEM_NAME"  |"ITEM_CATEGORY"  |"ITEM_SUBCATEGORY"  |"COST_OF_GOODS_USD"  |"SALE_PRICE_USD"  |
----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|10038      |5               |Ramen           |Kitakata Ramen Bar  |54              |Bottled Water     |Beverage         |Cold Option         |0.5000               |2.0000            |
|10039      |5               |Ramen           |Kitakata Ramen Bar  |55              |Bottled Soda      |Beverage         |Cold Option         |0.5000               |3.0000            |
|10040      |5               |Ramen           |Kitakata Ramen Bar  |56     

In [127]:
menu = menu.with_column('TEMPERATURE_OPTION', F.when(F.col('ITEM_SUBCATEGORY') == 'Cold Option', 0).when(
    F.col('ITEM_SUBCATEGORY') == 'Warm Option', 1).otherwise(2))
menu = menu.select('MENU_ITEM_ID', 'MENU_TYPE_ID', 'TEMPERATURE_OPTION', 'COST_OF_GOODS_USD', 'ITEM_CATEGORY') #Add item category and ohe in pandas
menu.show()

--------------------------------------------------------------------------------------------------
|"MENU_ITEM_ID"  |"MENU_TYPE_ID"  |"TEMPERATURE_OPTION"  |"COST_OF_GOODS_USD"  |"ITEM_CATEGORY"  |
--------------------------------------------------------------------------------------------------
|10              |1               |0                     |0.6500               |Beverage         |
|11              |1               |0                     |2.5000               |Dessert          |
|12              |1               |0                     |2.5000               |Dessert          |
|13              |1               |0                     |3.0000               |Dessert          |
|14              |1               |0                     |0.5000               |Beverage         |
|15              |1               |0                     |0.5000               |Beverage         |
|16              |1               |0                     |0.7500               |Beverage         |
|17       

In [128]:
final_df = final_df.join(menu, on= ['MENU_ITEM_ID'], lsuffix = "", rsuffix = "_01")

In [129]:
#final_df.show()

In [130]:
#final_df.sort(['ORDER_YEAR', 'ORDER_MONTH'], ascending=[False, False]).show()

In [131]:
import feature_engine
from feature_engine.encoding import OneHotEncoder

final_df = final_df.to_pandas()
ohe_enc = OneHotEncoder(
    top_categories=None,
    drop_last=False,
    variables= ['ITEM_CATEGORY'])  # to return k-1, false to return k


ohe_enc.fit(final_df)
final_df = ohe_enc.transform(final_df)

In [132]:
final_df = final_df.drop('DAYS_AGO', axis=1)
final_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1744744 entries, 0 to 1744743
Data columns (total 16 columns):
 #   Column                  Dtype  
---  ------                  -----  
 0   MENU_ITEM_ID            int16  
 1   TRUCK_ID                int16  
 2   DEMAND                  int64  
 3   UNIT_PRICE              object 
 4   ORDER_YEAR              int16  
 5   ORDER_MONTH             int8   
 6   ORDER_DAY               int8   
 7   EV_FLAG                 int8   
 8   DAYS_OPENED             int32  
 9   MENU_TYPE_ID            int8   
 10  TEMPERATURE_OPTION      int8   
 11  COST_OF_GOODS_USD       float64
 12  ITEM_CATEGORY_Beverage  int32  
 13  ITEM_CATEGORY_Main      int32  
 14  ITEM_CATEGORY_Snack     int32  
 15  ITEM_CATEGORY_Dessert   int32  
dtypes: float64(1), int16(3), int32(5), int64(1), int8(5), object(1)
memory usage: 91.5+ MB


In [133]:
final_df.corr()

  final_df.corr()


Unnamed: 0,MENU_ITEM_ID,TRUCK_ID,DEMAND,ORDER_YEAR,ORDER_MONTH,ORDER_DAY,EV_FLAG,DAYS_OPENED,MENU_TYPE_ID,TEMPERATURE_OPTION,COST_OF_GOODS_USD,ITEM_CATEGORY_Beverage,ITEM_CATEGORY_Main,ITEM_CATEGORY_Snack,ITEM_CATEGORY_Dessert
MENU_ITEM_ID,1.0,0.028822,0.137943,0.017816,0.004362,0.000118,-0.127218,-0.035291,0.998926,0.035217,0.059955,0.11747,0.069602,-0.060066,-0.345721
TRUCK_ID,0.028822,1.0,0.252452,0.025549,-0.004239,-0.000364,0.074173,-0.044074,0.028875,-0.007379,0.004212,0.003138,-0.005346,-0.004416,0.008973
DEMAND,0.137943,0.252452,1.0,0.03623,-0.007377,-0.000248,0.04628,-0.043923,0.157803,0.601486,0.523795,-0.618409,0.583752,0.123866,-0.037051
ORDER_YEAR,0.017816,0.025549,0.03623,1.0,-0.229731,-0.005063,0.309228,-0.545806,0.018038,-0.003929,-0.005019,0.003783,0.001168,-0.010138,-0.000898
ORDER_MONTH,0.004362,-0.004239,-0.007377,-0.229731,1.0,0.008547,-0.035662,0.000941,0.004355,0.001051,0.002253,0.000282,-0.000146,0.002676,-0.002883
ORDER_DAY,0.000118,-0.000364,-0.000248,-0.005063,0.008547,1.0,-0.001154,0.001257,0.000118,8.9e-05,6.7e-05,1.1e-05,3.7e-05,5.7e-05,-0.000157
EV_FLAG,-0.127218,0.074173,0.04628,0.309228,-0.035662,-0.001154,1.0,-0.544327,-0.127633,-0.011775,-0.019977,-0.015115,0.008565,-0.011567,0.025461
DAYS_OPENED,-0.035291,-0.044074,-0.043923,-0.545806,0.000941,0.001257,-0.544327,1.0,-0.035691,0.006602,0.007808,-0.007092,-0.002041,0.016966,0.003333
MENU_TYPE_ID,0.998926,0.028875,0.157803,0.018038,0.004355,0.000118,-0.127633,-0.035691,1.0,0.060816,0.083902,0.095486,0.095344,-0.064939,-0.348804
TEMPERATURE_OPTION,0.035217,-0.007379,0.601486,-0.003929,0.001051,8.9e-05,-0.011775,0.006602,0.060816,1.0,0.801789,-0.776413,0.847621,0.064798,-0.20551


In [134]:
# Scaling

#drop target
#target = final_df['DEMAND']
#final_df.drop('DEMAND', axis = 1, inplace = True)

#scaler = StandardScaler()
#scaler.fit(final_df)

#final_scaled = scaler.transform(final_df)
#final_scaled = pd.DataFrame(final_scaled, columns=final_df.columns)

#assign target variable back to scaled data frame
#final_scaled = final_scaled.assign(DEMAND = target)
#final_scaled.head()
final_scaled = final_df
final_scaled['UNIT_PRICE'] = final_scaled['UNIT_PRICE'].astype(float)
drop = final_scaled[((final_scaled["ORDER_YEAR"] == 2022) & (final_scaled['ORDER_MONTH'] == 11))]
merge = pd.merge(drop, final_scaled,how='outer', indicator=True)
final_scaled = merge[merge['_merge'] == 'right_only'].drop('_merge', axis = 1)
final_scaled.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 1741744 entries, 3000 to 1744743
Data columns (total 16 columns):
 #   Column                  Dtype  
---  ------                  -----  
 0   MENU_ITEM_ID            int16  
 1   TRUCK_ID                int16  
 2   DEMAND                  int64  
 3   UNIT_PRICE              float64
 4   ORDER_YEAR              int16  
 5   ORDER_MONTH             int8   
 6   ORDER_DAY               int8   
 7   EV_FLAG                 int8   
 8   DAYS_OPENED             int32  
 9   MENU_TYPE_ID            int8   
 10  TEMPERATURE_OPTION      int8   
 11  COST_OF_GOODS_USD       float64
 12  ITEM_CATEGORY_Beverage  int32  
 13  ITEM_CATEGORY_Main      int32  
 14  ITEM_CATEGORY_Snack     int32  
 15  ITEM_CATEGORY_Dessert   int32  
dtypes: float64(2), int16(3), int32(5), int64(1), int8(5)
memory usage: 104.6 MB


In [135]:
final_scaled.head()

Unnamed: 0,MENU_ITEM_ID,TRUCK_ID,DEMAND,UNIT_PRICE,ORDER_YEAR,ORDER_MONTH,ORDER_DAY,EV_FLAG,DAYS_OPENED,MENU_TYPE_ID,TEMPERATURE_OPTION,COST_OF_GOODS_USD,ITEM_CATEGORY_Beverage,ITEM_CATEGORY_Main,ITEM_CATEGORY_Snack,ITEM_CATEGORY_Dessert
3000,75,397,293,3.0,2022,5,5,1,565,7,0,0.5,1,0,0,0
3001,41,394,1565,8.0,2022,5,5,0,747,4,2,3.0,0,1,0,0
3002,131,58,488,13.0,2022,8,30,1,290,13,2,6.0,0,1,0,0
3003,27,47,188,6.0,2022,8,31,0,839,2,0,2.25,0,0,1,0
3004,37,48,185,12.5,2022,8,31,0,839,3,1,3.25,0,1,0,0


In [136]:
holdout = final_scaled[(final_scaled["ORDER_YEAR"] == 2022) & (final_scaled['ORDER_MONTH'] >= 8)]
#holdout = holdout.drop(['ORDER_YEAR', 'ORDER_MONTH', 'ORDER_DAY'], axis = 1)
x_holdout = holdout.drop('DEMAND',axis=1)
y_holdout = holdout['DEMAND']
holdout.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 275268 entries, 3002 to 1744743
Data columns (total 16 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   MENU_ITEM_ID            275268 non-null  int16  
 1   TRUCK_ID                275268 non-null  int16  
 2   DEMAND                  275268 non-null  int64  
 3   UNIT_PRICE              275268 non-null  float64
 4   ORDER_YEAR              275268 non-null  int16  
 5   ORDER_MONTH             275268 non-null  int8   
 6   ORDER_DAY               275268 non-null  int8   
 7   EV_FLAG                 275268 non-null  int8   
 8   DAYS_OPENED             275268 non-null  int32  
 9   MENU_TYPE_ID            275268 non-null  int8   
 10  TEMPERATURE_OPTION      275268 non-null  int8   
 11  COST_OF_GOODS_USD       275268 non-null  float64
 12  ITEM_CATEGORY_Beverage  275268 non-null  int32  
 13  ITEM_CATEGORY_Main      275268 non-null  int32  
 14  ITEM_CATEGORY_Sn

In [137]:
merged = pd.merge(holdout, final_scaled,how='outer', indicator=True)
final_scaled = merged[merged['_merge'] == 'right_only'].drop('_merge', axis = 1)
final_scaled.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1466476 entries, 275268 to 1741743
Data columns (total 16 columns):
 #   Column                  Non-Null Count    Dtype  
---  ------                  --------------    -----  
 0   MENU_ITEM_ID            1466476 non-null  int16  
 1   TRUCK_ID                1466476 non-null  int16  
 2   DEMAND                  1466476 non-null  int64  
 3   UNIT_PRICE              1466476 non-null  float64
 4   ORDER_YEAR              1466476 non-null  int16  
 5   ORDER_MONTH             1466476 non-null  int8   
 6   ORDER_DAY               1466476 non-null  int8   
 7   EV_FLAG                 1466476 non-null  int8   
 8   DAYS_OPENED             1466476 non-null  int32  
 9   MENU_TYPE_ID            1466476 non-null  int8   
 10  TEMPERATURE_OPTION      1466476 non-null  int8   
 11  COST_OF_GOODS_USD       1466476 non-null  float64
 12  ITEM_CATEGORY_Beverage  1466476 non-null  int32  
 13  ITEM_CATEGORY_Main      1466476 non-null  int32  
 1

In [138]:
#final_scaled = final_scaled.drop(['ORDER_YEAR', 'ORDER_MONTH', 'ORDER_DAY'], axis = 1)

In [139]:
final_scaled.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1466476 entries, 275268 to 1741743
Data columns (total 16 columns):
 #   Column                  Non-Null Count    Dtype  
---  ------                  --------------    -----  
 0   MENU_ITEM_ID            1466476 non-null  int16  
 1   TRUCK_ID                1466476 non-null  int16  
 2   DEMAND                  1466476 non-null  int64  
 3   UNIT_PRICE              1466476 non-null  float64
 4   ORDER_YEAR              1466476 non-null  int16  
 5   ORDER_MONTH             1466476 non-null  int8   
 6   ORDER_DAY               1466476 non-null  int8   
 7   EV_FLAG                 1466476 non-null  int8   
 8   DAYS_OPENED             1466476 non-null  int32  
 9   MENU_TYPE_ID            1466476 non-null  int8   
 10  TEMPERATURE_OPTION      1466476 non-null  int8   
 11  COST_OF_GOODS_USD       1466476 non-null  float64
 12  ITEM_CATEGORY_Beverage  1466476 non-null  int32  
 13  ITEM_CATEGORY_Main      1466476 non-null  int32  
 1

In [140]:
# Train Test Split
# Define Model Inputs (X) and Output (y)
X = final_scaled.drop('DEMAND',axis=1)
y = final_scaled["DEMAND"]

# Split both Inputs (X) and Ouput (y) into training set (70%) and testing set (30%)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

In [141]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from math import sqrt

# Create the model "lr"
lr = LinearRegression()

# Fit the model to the training set
lr.fit(X_train, y_train)

# Make predictions on the training and testing sets
train_predictions = lr.predict(X_train)
test_predictions = lr.predict(X_test)

# Calculate RMSE for the training and testing sets
train_rmse = sqrt(mean_squared_error(y_train, train_predictions))
test_rmse = sqrt(mean_squared_error(y_test, test_predictions))

# Calculate R-squared values for the training and testing sets
train_r2 = r2_score(y_train, train_predictions)
test_r2 = r2_score(y_test, test_predictions)

# Print the results
print('Train RMSE:', train_rmse)
print('Test RMSE:', test_rmse)
print('Train R-squared:', train_r2)
print('Test R-squared:', test_r2)


Train RMSE: 361.8957247553983
Test RMSE: 362.20276553334384
Train R-squared: 0.5303387957667102
Test R-squared: 0.5306602178043438


In [142]:
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor(n_estimators = 10, random_state = 0)
rf.fit(X_train, y_train)

# Make predictions on the training and testing sets
train_predictions = rf.predict(X_train)
test_predictions = rf.predict(X_test)

# Calculate RMSE for the training and testing sets
train_rmse = sqrt(mean_squared_error(y_train, train_predictions))
test_rmse = sqrt(mean_squared_error(y_test, test_predictions))

# Calculate R-squared values for the training and testing sets
train_r2 = r2_score(y_train, train_predictions)
test_r2 = r2_score(y_test, test_predictions)

# Print the results
print('Train RMSE:', train_rmse)
print('Test RMSE:', test_rmse)
print('Train R-squared:', train_r2)
print('Test R-squared:', test_r2)

Train RMSE: 18.752278323873046
Test RMSE: 44.08076242937327
Train R-squared: 0.9987389687108248
Test R-squared: 0.993048443792843


In [143]:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(rf, X_train, y_train, cv = 5)
scores.mean()

0.9920360492669605

In [144]:
import xgboost as xg

# create an xgboost regression model
xgb = xg.XGBRegressor()

# Fitting the model
xgb.fit(X_train, y_train)
  
# Make predictions on the training and testing sets
train_predictions = xgb.predict(X_train)
test_predictions = xgb.predict(X_test)

# Calculate RMSE for the training and testing sets
train_rmse = sqrt(mean_squared_error(y_train, train_predictions))
test_rmse = sqrt(mean_squared_error(y_test, test_predictions))

# Calculate R-squared values for the training and testing sets
train_r2 = r2_score(y_train, train_predictions)
test_r2 = r2_score(y_test, test_predictions)

# Print the results
print('Train RMSE:', train_rmse)
print('Test RMSE:', test_rmse)
print('Train R-squared:', train_r2)
print('Test R-squared:', test_r2)

  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


Train RMSE: 59.181209035958375
Test RMSE: 59.61661253840157
Train R-squared: 0.9874401211974049
Test R-squared: 0.987284939398157


### Holdout 

In [145]:
# Make predictions on the training and testing sets
ho_predictions = rf.predict(x_holdout)

# Calculate RMSE for the training and testing sets
ho_rmse = sqrt(mean_squared_error(y_holdout, ho_predictions))

# Calculate R-squared values for the training and testing sets
ho_r2 = r2_score(y_holdout, ho_predictions)

# Print the results
print('Holdout RMSE:', ho_rmse)
print('Holdout R-squared:', ho_r2)

Holdout RMSE: 76.78367661488934
Holdout R-squared: 0.9789827022689281


In [146]:
import pickle
filename = 'inventory_model.sav'
pickle.dump(rf, open(filename, 'wb'))

### Additional

In [147]:
dd_df = session.table("NGEE_ANN_POLYTECHNIC_FROSTBYTE_DATA_SHARE.raw_supply_chain.distribution_detail")
dh_df = session.table("NGEE_ANN_POLYTECHNIC_FROSTBYTE_DATA_SHARE.raw_supply_chain.distribution_header")
eod_df = session.table("NGEE_ANN_POLYTECHNIC_FROSTBYTE_DATA_SHARE.raw_supply_chain.eod_stock_assignment")
reci_df = session.table("NGEE_ANN_POLYTECHNIC_FROSTBYTE_DATA_SHARE.raw_supply_chain.RECIPE")

In [148]:
dd_df.show()

------------------------------------------------------------------------------------------------------------------------------------------------------
|"DH_DETAIL_ID"  |"DH_ID"  |"LINE_ITEM_ID"  |"ITEM_ID"  |"QUANTITY"      |"EXPIRATION_DATE"  |"PO_ID"  |"CREATED_DATE"              |"UPDATED_DATE"  |
------------------------------------------------------------------------------------------------------------------------------------------------------
|698170          |54436    |9               |22         |2087.000000000  |2022-07-06         |26724    |2023-04-13 22:18:29.306000  |NULL            |
|698171          |54456    |3               |3          |22.000000000    |2023-05-31         |26589    |2023-04-13 22:18:29.306000  |NULL            |
|698172          |54548    |3               |21         |28.000000000    |2022-07-06         |26777    |2023-04-13 22:18:29.306000  |NULL            |
|698173          |54315    |8               |37         |1899.000000000  |2022-07-06         |

In [149]:
dh_df.show()

-------------------------------------------------------------------------------------------------------------
|"DH_ID"  |"TRUCK_ID"  |"WAREHOUSE_ID"  |"DISTRIBUTION_DATE"  |"CREATED_DATE"              |"UPDATED_DATE"  |
-------------------------------------------------------------------------------------------------------------
|43542    |174         |12              |2023-02-13           |2023-04-10 15:14:18.801000  |NULL            |
|43543    |398         |27              |2023-02-13           |2023-04-10 15:14:18.801000  |NULL            |
|43544    |389         |26              |2023-02-13           |2023-04-10 15:14:18.801000  |NULL            |
|43415    |224         |15              |2023-02-13           |2023-04-10 15:14:18.801000  |NULL            |
|43416    |180         |12              |2023-02-13           |2023-04-10 15:14:18.801000  |NULL            |
|43417    |409         |28              |2023-02-13           |2023-04-10 15:14:18.801000  |NULL            |
|43418    

In [150]:
eod_df.show()

--------------------------------------------------------------------------------------------------------
|"ASSIGNMENT_ID"  |"TRUCK_ID"  |"ITEM_ID"  |"PO_ID"  |"QUANTITY"  |"EXPIRATION_DATE"  |"CREATED_DATE"  |
--------------------------------------------------------------------------------------------------------
|4252658          |396         |3          |2310     |6.80        |2023-12-13         |2023-01-30      |
|5525900          |219         |1          |27018    |10.47       |2023-06-07         |2022-07-18      |
|6813586          |161         |2          |32886    |6.35        |2023-08-31         |2022-10-09      |
|5713374          |380         |2          |28441    |0.05        |2023-07-11         |2022-08-08      |
|6997186          |373         |1          |33572    |5.53        |2023-09-17         |2022-10-30      |
|5795694          |25          |73         |28722    |45.60       |2022-08-29         |2022-08-17      |
|6874732          |29          |3          |33102    |4

In [151]:
d_df = dd_df.join(dh_df, on= ['DH_ID'], lsuffix = "", rsuffix = "_01")
d_df = d_df.with_column('DISTRIBUTION_YEAR', F.year(F.col('DISTRIBUTION_DATE')))
d_df = d_df.with_column('DISTRIBUTION_MONTH', F.month(F.col('DISTRIBUTION_DATE')))
d_df = d_df.with_column('DISTRIBUTION_DAY', F.dayofmonth(F.col('DISTRIBUTION_DATE')))
d_df = d_df.group_by("TRUCK_ID","ITEM_ID",'DISTRIBUTION_YEAR', 'DISTRIBUTION_MONTH', 'DISTRIBUTION_DAY').agg(F.sum("QUANTITY").alias('Supply'))

In [152]:
# Get account credentials from a json file
with open("account.json") as f:
    data = json.load(f)
    username = data["username"]
    password = data["password"]
    account = data["account"]

# Specify connection parameters
connection_parameters = {
    "account": account,
    "user": username,
    "password": password,
    "role": "ACCOUNTADMIN",
    #"warehouse": "tasty_ds_wh",
    "database": "FROSTBYTE_TASTY_BYTES",
    "schema": "analytics",
}

# Create Snowpark session
sessionWrite = Session.builder.configs(connection_parameters).create()

final_scaled_sf = sessionWrite.create_dataframe(final_scaled)
# Save X data
final_scaled_sf.write.mode("overwrite").save_as_table("frostbyte_tasty_bytes.analytics.inventory_management")

# Save y data
d_df.write.mode("overwrite").save_as_table("frostbyte_tasty_bytes.analytics.monthly_supply")

  success, nchunks, nrows, ci_output = write_pandas(


In [153]:

demand_item = final_scaled_sf.join(reci_df, on= ['menu_item_id'], lsuffix = "", rsuffix = "_01")

In [154]:
demand_item = demand_item.with_column('Demand_item', F.col('UNIT_QUANTITY') * F.col('DEMAND'))
demand_item = demand_item.group_by("TRUCK_ID","ITEM_ID",'ORDER_YEAR', 'ORDER_MONTH', 'ORDER_DAY').agg(F.sum("DEMAND_ITEM").alias('DEMAND_ITEM'))

In [155]:
demand_item.write.mode("overwrite").save_as_table("frostbyte_tasty_bytes.analytics.monthly_demand")

In [156]:
demand_item.show()

---------------------------------------------------------------------------------------
|"TRUCK_ID"  |"ITEM_ID"  |"ORDER_YEAR"  |"ORDER_MONTH"  |"ORDER_DAY"  |"DEMAND_ITEM"  |
---------------------------------------------------------------------------------------
|397         |2          |2022          |5              |5            |14.650000000   |
|394         |45         |2022          |5              |5            |116.700000000  |
|394         |40         |2022          |5              |5            |466.800000000  |
|394         |41         |2022          |5              |5            |466.800000000  |
|392         |22         |2022          |5              |6            |39.150000000   |
|392         |23         |2022          |5              |6            |39.150000000   |
|392         |24         |2022          |5              |6            |39.150000000   |
|392         |25         |2022          |5              |6            |5.872500000    |
|394         |1          |2022  

In [157]:
d_df.sort("DISTRIBUTION_YEAR").show()

-------------------------------------------------------------------------------------------------------------
|"TRUCK_ID"  |"ITEM_ID"  |"DISTRIBUTION_YEAR"  |"DISTRIBUTION_MONTH"  |"DISTRIBUTION_DAY"  |"SUPPLY"        |
-------------------------------------------------------------------------------------------------------------
|4           |40         |2022                 |6                     |27                  |165.000000000   |
|176         |3          |2022                 |6                     |27                  |22.000000000    |
|426         |54         |2022                 |6                     |27                  |287.000000000   |
|282         |78         |2022                 |6                     |27                  |661.000000000   |
|133         |1          |2022                 |6                     |27                  |101.000000000   |
|404         |37         |2022                 |6                     |27                  |1899.000000000  |
|240      