In [59]:
# Import Python packages
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import json
import sys
import cachetools
from datetime import datetime
from datetime import date
import matplotlib.pyplot as plt 
import seaborn as sns
import math

# Import Snowflake modules
from snowflake.snowpark import Session
import snowflake.snowpark.functions as F
import snowflake.snowpark.types as T
from snowflake.snowpark import Window
from snowflake.snowpark.functions import col

# Clustering & Model
from sklearn.cluster import KMeans
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold, cross_val_score, train_test_split

# Scalers
from sklearn.preprocessing import StandardScaler

# Evaluation
from sklearn.metrics import silhouette_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

# find the best combination of model hyperparameters
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import cross_val_score, cross_validate

# Getting Password,Username, Account
import getpass

In [60]:
# Get account credentials from a json file
with open("account.json") as f:
    data = json.load(f)
    username = data["username"]
    password = data["password"]
    account = data["account"]

# Specify connection parameters
connection_parameters = {
    "account": account,
    "user": username,
    "password": password,
    #"role": "ACCOUNTADMIN",
    #"warehouse": "tasty_ds_wh",
    #"database": "NGEE_ANN_POLYTECHNIC_FROSTBYTE_DATA_SHARE",
    #"schema": "analytics",
}

# Create Snowpark session
session = Session.builder.configs(connection_parameters).create()

In [61]:
order_header_df = session.table("NGEE_ANN_POLYTECHNIC_FROSTBYTE_DATA_SHARE.raw_pos.order_header")
order_detail_df = session.table("NGEE_ANN_POLYTECHNIC_FROSTBYTE_DATA_SHARE.raw_pos.order_detail")
menu_df = session.table("NGEE_ANN_POLYTECHNIC_FROSTBYTE_DATA_SHARE.raw_pos.menu")
truck_df = session.table("NGEE_ANN_POLYTECHNIC_FROSTBYTE_DATA_SHARE.raw_pos.truck")

In [62]:
order_header_df.show()

-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|"ORDER_ID"  |"TRUCK_ID"  |"LOCATION_ID"  |"CUSTOMER_ID"  |"DISCOUNT_ID"  |"SHIFT_ID"  |"SHIFT_START_TIME"  |"SHIFT_END_TIME"  |"ORDER_CHANNEL"  |"ORDER_TS"           |"SERVED_TS"  |"ORDER_CURRENCY"  |"ORDER_AMOUNT"  |"ORDER_TAX_AMOUNT"  |"ORDER_DISCOUNT_AMOUNT"  |"ORDER_TOTAL"  |
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|22477378    |97          |3713.0         |NULL           |NULL           |78881       |16:00:00            |23:00:00          |NULL             |2020-04-

In [63]:
order_detail_df.show()

--------------------------------------------------------------------------------------------------------------------------------------------------------
|"ORDER_DETAIL_ID"  |"ORDER_ID"  |"MENU_ITEM_ID"  |"DISCOUNT_ID"  |"LINE_NUMBER"  |"QUANTITY"  |"UNIT_PRICE"  |"PRICE"  |"ORDER_ITEM_DISCOUNT_AMOUNT"  |
--------------------------------------------------------------------------------------------------------------------------------------------------------
|349467439          |128946760   |121             |NULL           |0              |1           |20.0000       |20.0000  |NULL                          |
|349467440          |128946760   |122             |NULL           |1              |2           |12.0000       |24.0000  |NULL                          |
|349467441          |128946760   |123             |NULL           |2              |1           |11.0000       |11.0000  |NULL                          |
|349467442          |128946760   |125             |NULL           |3              

In [64]:
menu_df.show()

---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|"MENU_ID"  |"MENU_TYPE_ID"  |"MENU_TYPE"  |"TRUCK_BRAND_NAME"  |"MENU_ITEM_ID"  |"MENU_ITEM_NAME"    |"ITEM_CATEGORY"  |"ITEM_SUBCATEGORY"  |"COST_OF_GOODS_USD"  |"SALE_PRICE_USD"  |"MENU_ITEM_HEALTH_METRICS_OBJ"     |
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|10001      |1               |Ice Cream    |Freezing Point      |10              |Lemonade            |Beverage         |Cold Option         |0.6500               |3.5000            |{                                  |
|           |                |             |                    |                |                    |                 

In [65]:
#Minimum order transaction by a truck
order_header_df.group_by("TRUCK_ID").agg(F.count("ORDER_ID").alias('count')).agg(F.min('count')).show()

----------------
|"MIN(COUNT)"  |
----------------
|8850          |
----------------



In [66]:
#order_header_df.drop_duplicates('ORDER_ID').group_by('ORDER_ID').agg(F.count('ORDER_ID').alias('nnn')).agg(F.sum('NNN')).show()

In [67]:
#order_detail_df.drop_duplicates('ORDER_ID').group_by('ORDER_ID').agg(F.count('ORDER_ID').alias('nnn')).agg(F.sum('NNN')).show()

In [68]:
order_truck = order_header_df.select('ORDER_ID', 'TRUCK_ID')
df = order_detail_df.join(order_truck, order_detail_df.ORDER_ID == order_truck.ORDER_ID, how = "left", lsuffix = "", rsuffix = "_01")

In [69]:
df.show()

-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|"ORDER_DETAIL_ID"  |"ORDER_ID"  |"MENU_ITEM_ID"  |"DISCOUNT_ID"  |"LINE_NUMBER"  |"QUANTITY"  |"UNIT_PRICE"  |"PRICE"  |"ORDER_ITEM_DISCOUNT_AMOUNT"  |"ORDER_ID_01"  |"TRUCK_ID"  |
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|349467439          |128946760   |121             |NULL           |0              |1           |20.0000       |20.0000  |NULL                          |128946760      |282         |
|349467440          |128946760   |122             |NULL           |1              |2           |12.0000       |24.0000  |NULL                          |128946760      |282         |
|349467441          |128946760   |123             |NULL           |2              |1      

In [70]:
#df.describe().show()

In [71]:
demand_df = df.group_by("TRUCK_ID", 'MENU_ITEM_ID').agg(F.sum("QUANTITY").alias('DEMAND'))
demand_df.sort('TRUCK_ID').show()

------------------------------------------
|"TRUCK_ID"  |"MENU_ITEM_ID"  |"DEMAND"  |
------------------------------------------
|1           |12              |46767     |
|1           |19              |46755     |
|1           |14              |15749     |
|1           |16              |15926     |
|1           |15              |15651     |
|1           |13              |46438     |
|1           |17              |46561     |
|1           |11              |46912     |
|1           |18              |46580     |
|1           |10              |15686     |
------------------------------------------



In [72]:
unit_price = df.group_by("TRUCK_ID", 'MENU_ITEM_ID').agg(F.mean("unit_price").alias('unit_price'))
unit_price.sort('TRUCK_ID').show()

----------------------------------------------
|"TRUCK_ID"  |"MENU_ITEM_ID"  |"UNIT_PRICE"  |
----------------------------------------------
|1           |19              |3.0000000000  |
|1           |18              |5.0000000000  |
|1           |15              |3.0000000000  |
|1           |12              |6.0000000000  |
|1           |17              |4.0000000000  |
|1           |16              |3.0000000000  |
|1           |13              |7.0000000000  |
|1           |14              |2.0000000000  |
|1           |10              |3.5000000000  |
|1           |11              |6.0000000000  |
----------------------------------------------



In [73]:
final_df = demand_df.join(unit_price, on= ['TRUCK_ID', 'MENU_ITEM_ID'], lsuffix = "", rsuffix = "_01")

In [74]:
truck_df.show()

-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|"TRUCK_ID"  |"MENU_TYPE_ID"  |"PRIMARY_CITY"  |"REGION"    |"ISO_REGION"  |"COUNTRY"      |"ISO_COUNTRY_CODE"  |"FRANCHISE_FLAG"  |"YEAR"  |"MAKE"        |"MODEL"           |"EV_FLAG"  |"FRANCHISE_ID"  |"TRUCK_OPENING_DATE"  |
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|3           |3               |San Mateo       |California  |CA            |United States  |US                  |1                 |2004    |Freightliner  |MT45 Utilimaster  |0          |2               |2021-10-01            |
|4           |4               |San Mateo       |California  |CA            |United State

In [75]:
truck_df = truck_df.filter(F.col('COUNTRY') == 'United States')

In [76]:
truck = truck_df.with_column('LAST_DATE', F.iff(F.col("TRUCK_ID") == F.col('TRUCK_ID'), "2022-10-18", '0'))
truck = truck.withColumn("DAYS_OPENED", F.datediff("day", F.col("TRUCK_OPENING_DATE"), F.col('LAST_DATE')))
#truck = truck.select('TRUCK_ID', 'PRIMARY_CITY', 'REGION', 'COUNTRY', 'DAYS_OPENED')
truck = truck.select('TRUCK_ID','EV_FLAG','DAYS_OPENED')

truck.show()

------------------------------------------
|"TRUCK_ID"  |"EV_FLAG"  |"DAYS_OPENED"  |
------------------------------------------
|3           |0          |382            |
|4           |1          |655            |
|5           |1          |200            |
|6           |0          |1205           |
|7           |0          |655            |
|8           |1          |200            |
|9           |0          |839            |
|12          |0          |565            |
|13          |0          |474            |
|14          |0          |747            |
------------------------------------------



In [77]:
final_df = final_df.join(truck, on= ['TRUCK_ID'], lsuffix = "", rsuffix = "_01")

In [78]:
menu = menu_df.drop('MENU_ITEM_HEALTH_METRICS_OBJ')
menu.show()

---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|"MENU_ID"  |"MENU_TYPE_ID"  |"MENU_TYPE"  |"TRUCK_BRAND_NAME"  |"MENU_ITEM_ID"  |"MENU_ITEM_NAME"    |"ITEM_CATEGORY"  |"ITEM_SUBCATEGORY"  |"COST_OF_GOODS_USD"  |"SALE_PRICE_USD"  |
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|10001      |1               |Ice Cream    |Freezing Point      |10              |Lemonade            |Beverage         |Cold Option         |0.6500               |3.5000            |
|10002      |1               |Ice Cream    |Freezing Point      |11              |Sugar Cone          |Dessert          |Cold Option         |2.5000               |6.0000            |
|10003      |1               |Ice Cream    |Freezing Point      |12             

In [79]:
menu = menu.with_column('TEMPERATURE_OPTION', F.when(F.col('ITEM_SUBCATEGORY') == 'Cold Option', 0).when(
    F.col('ITEM_SUBCATEGORY') == 'Warm Option', 1).otherwise(2))
menu = menu.select('MENU_ITEM_ID', 'MENU_TYPE_ID', 'TEMPERATURE_OPTION', 'COST_OF_GOODS_USD', 'ITEM_CATEGORY') #Add item category and ohe in pandas
menu.show()

--------------------------------------------------------------------------------------------------
|"MENU_ITEM_ID"  |"MENU_TYPE_ID"  |"TEMPERATURE_OPTION"  |"COST_OF_GOODS_USD"  |"ITEM_CATEGORY"  |
--------------------------------------------------------------------------------------------------
|23              |2               |1                     |7.0000               |Main             |
|24              |2               |0                     |0.5000               |Beverage         |
|25              |2               |0                     |0.5000               |Beverage         |
|26              |2               |0                     |0.7500               |Beverage         |
|27              |2               |0                     |2.2500               |Snack            |
|28              |2               |2                     |11.2500              |Main             |
|29              |2               |1                     |1.2500               |Snack            |
|31       

In [80]:
final_df = final_df.join(menu, on= ['MENU_ITEM_ID'], lsuffix = "", rsuffix = "_01")

In [81]:
#final_df.show()

----------------------------------------------------------------------------------------------------------------------------------------------------------------------
|"MENU_ITEM_ID"  |"TRUCK_ID"  |"DEMAND"  |"UNIT_PRICE"   |"EV_FLAG"  |"DAYS_OPENED"  |"MENU_TYPE_ID"  |"TEMPERATURE_OPTION"  |"COST_OF_GOODS_USD"  |"ITEM_CATEGORY"  |
----------------------------------------------------------------------------------------------------------------------------------------------------------------------
|83              |23          |434588    |9.0000000000   |0          |1021           |8               |1                     |4.0000               |Snack            |
|125             |27          |82642     |3.0000000000   |0          |839            |12              |0                     |0.5000               |Beverage         |
|141             |29          |334308    |17.0000000000  |0          |747            |14              |2                     |10.0000              |Main             

In [82]:
import feature_engine
from feature_engine.encoding import OneHotEncoder

final_df = final_df.to_pandas()
ohe_enc = OneHotEncoder(
    top_categories=None,
    drop_last=False,
    variables= ['ITEM_CATEGORY'])  # to return k-1, false to return k


ohe_enc.fit(final_df)
final_df = ohe_enc.transform(final_df)

In [83]:
final_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 13 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   MENU_ITEM_ID            500 non-null    int16  
 1   TRUCK_ID                500 non-null    int16  
 2   DEMAND                  500 non-null    int64  
 3   UNIT_PRICE              500 non-null    object 
 4   EV_FLAG                 500 non-null    int8   
 5   DAYS_OPENED             500 non-null    int32  
 6   MENU_TYPE_ID            500 non-null    int8   
 7   TEMPERATURE_OPTION      500 non-null    int8   
 8   COST_OF_GOODS_USD       500 non-null    float64
 9   ITEM_CATEGORY_Beverage  500 non-null    int32  
 10  ITEM_CATEGORY_Main      500 non-null    int32  
 11  ITEM_CATEGORY_Dessert   500 non-null    int32  
 12  ITEM_CATEGORY_Snack     500 non-null    int32  
dtypes: float64(1), int16(2), int32(5), int64(1), int8(3), object(1)
memory usage: 25.0+ KB


In [84]:
final_df.corr()

  final_df.corr()


Unnamed: 0,MENU_ITEM_ID,TRUCK_ID,DEMAND,EV_FLAG,DAYS_OPENED,MENU_TYPE_ID,TEMPERATURE_OPTION,COST_OF_GOODS_USD,ITEM_CATEGORY_Beverage,ITEM_CATEGORY_Main,ITEM_CATEGORY_Dessert,ITEM_CATEGORY_Snack
MENU_ITEM_ID,1.0,0.207132,0.103248,-0.082996,-0.157421,0.998917,0.041305,0.063378,0.115938,0.078794,-0.355758,-0.056457
TRUCK_ID,0.207132,1.0,0.499577,0.089223,-0.215916,0.207357,0.01388,0.018124,0.019486,0.021639,-0.07445,-0.01259
DEMAND,0.103248,0.499577,1.0,-0.187664,0.184416,0.115501,0.356991,0.312639,-0.394885,0.364829,0.001352,0.072821
EV_FLAG,-0.082996,0.089223,-0.187664,1.0,-0.583604,-0.081807,0.029858,0.000904,0.00462,-0.00155,-0.014543,0.008804
DAYS_OPENED,-0.157421,-0.215916,0.184416,-0.583604,1.0,-0.159253,-0.051118,-0.043103,-0.030914,-0.025148,0.119111,-0.001971
MENU_TYPE_ID,0.998917,0.207357,0.115501,-0.081807,-0.159253,1.0,0.066939,0.087406,0.093975,0.104354,-0.359041,-0.060716
TEMPERATURE_OPTION,0.041305,0.01388,0.356991,0.029858,-0.051118,0.066939,1.0,0.798124,-0.771667,0.845334,-0.211232,0.074593
COST_OF_GOODS_USD,0.063378,0.018124,0.312639,0.000904,-0.043103,0.087406,0.798124,1.0,-0.756706,0.816459,-0.110024,-0.004311
ITEM_CATEGORY_Beverage,0.115938,0.019486,-0.394885,0.00462,-0.030914,0.093975,-0.771667,-0.756706,1.0,-0.801639,-0.233181,-0.211741
ITEM_CATEGORY_Main,0.078794,0.021639,0.364829,-0.00155,-0.025148,0.104354,0.845334,0.816459,-0.801639,1.0,-0.219436,-0.19926


In [85]:
# Scaling

#drop target
target = final_df['DEMAND']
final_df.drop('DEMAND', axis = 1, inplace = True)

scaler = StandardScaler()
scaler.fit(final_df)

final_scaled = scaler.transform(final_df)
final_scaled = pd.DataFrame(final_scaled, columns=final_df.columns)

#assign target variable back to scaled data frame
final_scaled = final_scaled.assign(DEMAND = target)
final_scaled.head()


Unnamed: 0,MENU_ITEM_ID,TRUCK_ID,UNIT_PRICE,EV_FLAG,DAYS_OPENED,MENU_TYPE_ID,TEMPERATURE_OPTION,COST_OF_GOODS_USD,ITEM_CATEGORY_Beverage,ITEM_CATEGORY_Main,ITEM_CATEGORY_Dessert,ITEM_CATEGORY_Snack,DEMAND
0,-0.264038,0.627637,-0.80861,-0.537271,1.236948,-0.309134,-0.83608,-0.767249,1.083473,-0.868554,-0.252646,-0.229416,91268
1,-0.353771,0.627637,-0.227921,-0.537271,1.236948,-0.309134,1.486364,-0.241736,-0.922958,1.151339,-0.252646,-0.229416,405959
2,0.565989,0.8121,0.546332,-0.537271,1.236948,0.58046,0.325142,0.586345,-0.922958,1.151339,-0.252646,-0.229416,403005
3,0.633288,0.8121,-0.80861,-0.537271,1.236948,0.58046,-0.83608,-0.767249,1.083473,-0.868554,-0.252646,-0.229416,90961
4,-1.430563,0.397058,-1.002173,-0.537271,0.057779,-1.421127,-0.83608,-0.846872,1.083473,-0.868554,-0.252646,-0.229416,46751


In [86]:
# Train Test Split
# Define Model Inputs (X) and Output (y)
X = final_scaled.drop('DEMAND',axis=1)
y = final_scaled["DEMAND"]

# Split both Inputs (X) and Ouput (y) into training set (70%) and testing set (30%)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

In [87]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from math import sqrt

# Create the model "lr"
lr = LinearRegression()

# Fit the model to the training set
lr.fit(X_train, y_train)

# Make predictions on the training and testing sets
train_predictions = lr.predict(X_train)
test_predictions = lr.predict(X_test)

# Calculate RMSE for the training and testing sets
train_rmse = sqrt(mean_squared_error(y_train, train_predictions))
test_rmse = sqrt(mean_squared_error(y_test, test_predictions))

# Calculate R-squared values for the training and testing sets
train_r2 = r2_score(y_train, train_predictions)
test_r2 = r2_score(y_test, test_predictions)

# Print the results
print('Train RMSE:', train_rmse)
print('Test RMSE:', test_rmse)
print('Train R-squared:', train_r2)
print('Test R-squared:', test_r2)


Train RMSE: 197798.71193729364
Test RMSE: 189608.83256161635
Train R-squared: 0.523208898463682
Test R-squared: 0.4924263782750632
