In [2]:
# Import Python packages
# Standard library
import json
import sys
from datetime import datetime
from datetime import date
import math
from math import sqrt

# Data handling and plotting
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import matplotlib.pyplot as plt
import seaborn as sns
import cachetools

# Import Snowflake modules
from snowflake.snowpark import Session
import snowflake.snowpark.functions as F
import snowflake.snowpark.types as T
from snowflake.snowpark import Window
from snowflake.snowpark.functions import col

# Clustering & Model
from sklearn.cluster import KMeans
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold, cross_val_score, train_test_split

# Scalers
from sklearn.preprocessing import StandardScaler

# Evaluation
from sklearn.metrics import silhouette_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

# find the best combination of model hyperparameters
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import cross_val_score, cross_validate

# Getting Password,Username, Account
import getpass

In [3]:
# Load the Snowflake login details from the local JSON credentials file
with open("account.json") as f:
    data = json.load(f)

username, password, account = data["username"], data["password"], data["account"]

# Connection settings passed to Snowpark. Only the login triple is supplied;
# the optional entries below are left disabled.
connection_parameters = {
    "account": account,
    "user": username,
    "password": password,
    #"role": "ACCOUNTADMIN",
    #"warehouse": "tasty_ds_wh",
    #"database": "NGEE_ANN_POLYTECHNIC_FROSTBYTE_DATA_SHARE",
    #"schema": "analytics",
}

# Open the Snowpark session that every later cell queries through
session = (
    Session.builder
    .configs(connection_parameters)
    .create()
)

In [4]:
# Create one DataFrame per raw POS table, addressed by fully qualified name
order_header_df, order_detail_df, menu_df = (
    session.table(table_name)
    for table_name in (
        "NGEE_ANN_POLYTECHNIC_FROSTBYTE_DATA_SHARE.raw_pos.order_header",
        "NGEE_ANN_POLYTECHNIC_FROSTBYTE_DATA_SHARE.raw_pos.order_detail",
        "NGEE_ANN_POLYTECHNIC_FROSTBYTE_DATA_SHARE.raw_pos.menu",
    )
)

In [5]:
# Print a sample of rows from the order header table to inspect its columns
order_header_df.show()

-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|"ORDER_ID"  |"TRUCK_ID"  |"LOCATION_ID"  |"CUSTOMER_ID"  |"DISCOUNT_ID"  |"SHIFT_ID"  |"SHIFT_START_TIME"  |"SHIFT_END_TIME"  |"ORDER_CHANNEL"  |"ORDER_TS"           |"SERVED_TS"  |"ORDER_CURRENCY"  |"ORDER_AMOUNT"  |"ORDER_TAX_AMOUNT"  |"ORDER_DISCOUNT_AMOUNT"  |"ORDER_TOTAL"  |
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|82686170    |239         |6185.0         |NULL           |NULL           |232187      |15:30:00            |22:30:00          |NULL             |2021-04-

In [6]:
# Print a sample of rows from the order detail table to inspect its columns
order_detail_df.show()

--------------------------------------------------------------------------------------------------------------------------------------------------------
|"ORDER_DETAIL_ID"  |"ORDER_ID"  |"MENU_ITEM_ID"  |"DISCOUNT_ID"  |"LINE_NUMBER"  |"QUANTITY"  |"UNIT_PRICE"  |"PRICE"  |"ORDER_ITEM_DISCOUNT_AMOUNT"  |
--------------------------------------------------------------------------------------------------------------------------------------------------------
|702836225          |385518728   |51              |NULL           |0              |2           |17.2500       |34.5000  |NULL                          |
|702836226          |385518728   |53              |NULL           |1              |1           |17.2500       |17.2500  |NULL                          |
|702836227          |385518729   |51              |NULL           |0              |1           |17.2500       |17.2500  |NULL                          |
|702836228          |385518729   |52              |NULL           |1              

In [7]:
# Print a sample of rows from the menu table to inspect its columns
menu_df.show()

---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|"MENU_ID"  |"MENU_TYPE_ID"  |"MENU_TYPE"  |"TRUCK_BRAND_NAME"  |"MENU_ITEM_ID"  |"MENU_ITEM_NAME"    |"ITEM_CATEGORY"  |"ITEM_SUBCATEGORY"  |"COST_OF_GOODS_USD"  |"SALE_PRICE_USD"  |"MENU_ITEM_HEALTH_METRICS_OBJ"     |
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|10001      |1               |Ice Cream    |Freezing Point      |10              |Lemonade            |Beverage         |Cold Option         |0.6500               |3.5000            |{                                  |
|           |                |             |                    |                |                    |                 

In [8]:
# Smallest number of orders recorded for any single truck:
# count orders per TRUCK_ID, then take the minimum of those counts.
(
    order_header_df
    .group_by("TRUCK_ID")
    .agg(F.count("ORDER_ID").alias('count'))
    .agg(F.min('count'))
    .show()
)

----------------
|"MIN(COUNT)"  |
----------------
|8850          |
----------------



In [9]:
# Sanity check on ORDER_ID in the order header table. Rows duplicated on
# ORDER_ID are dropped first, so each ORDER_ID group should count a single row
# and the final sum is effectively the number of distinct order ids.
(
    order_header_df
    .drop_duplicates('ORDER_ID')
    .group_by('ORDER_ID')
    .agg(F.count('ORDER_ID').alias('nnn'))
    .agg(F.sum('NNN'))
    .show()
)

--------------
|"SUM(NNN)"  |
--------------
|248201269   |
--------------



In [10]:
# Same ORDER_ID check on the order detail table: after de-duplicating on
# ORDER_ID the sum is effectively the number of distinct order ids that have
# detail lines. Compare the result with the order header figure.
(
    order_detail_df
    .drop_duplicates('ORDER_ID')
    .group_by('ORDER_ID')
    .agg(F.count('ORDER_ID').alias('nnn'))
    .agg(F.sum('NNN'))
    .show()
)

--------------
|"SUM(NNN)"  |
--------------
|248201269   |
--------------



In [11]:
# Attach each order's TRUCK_ID to its detail lines.
# Only the join key and TRUCK_ID are taken from the header table.
order_truck = order_header_df.select('ORDER_ID', 'TRUCK_ID')

# Left join keeps every detail line; the header's ORDER_ID column comes through
# with the "_01" suffix because both sides carry a column of that name.
df = order_detail_df.join(
    order_truck,
    order_detail_df.ORDER_ID == order_truck.ORDER_ID,
    how = "left",
    lsuffix = "",
    rsuffix = "_01",
)

In [12]:
# Print a sample of the joined detail/header frame (detail columns plus ORDER_ID_01 and TRUCK_ID)
df.show()

-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|"ORDER_DETAIL_ID"  |"ORDER_ID"  |"MENU_ITEM_ID"  |"DISCOUNT_ID"  |"LINE_NUMBER"  |"QUANTITY"  |"UNIT_PRICE"  |"PRICE"  |"ORDER_ITEM_DISCOUNT_AMOUNT"  |"ORDER_ID_01"  |"TRUCK_ID"  |
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|798202497          |420427397   |133             |NULL           |0              |1           |6.0000        |6.0000   |NULL                          |420427397      |433         |
|798202503          |420427400   |131             |NULL           |0              |4           |13.0000       |52.0000  |NULL                          |420427400      |433         |
|798202504          |420427400   |132             |NULL           |1              |4      

In [13]:
# Print summary statistics for the columns of the joined frame
df.describe().show()

-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|"SUMMARY"  |"ORDER_DETAIL_ID"  |"ORDER_ID"          |"MENU_ITEM_ID"      |"DISCOUNT_ID"  |"LINE_NUMBER"       |"QUANTITY"          |"UNIT_PRICE"       |"PRICE"             |"ORDER_ITEM_DISCOUNT_AMOUNT"  |"ORDER_ID_01"       |"TRUCK_ID"          |
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|max        |904745310.0        |459520440.0         |156.0               |NULL           |9.0                 |22.0                |21.0               |396.0               |NULL                          |459520440.0         |450.0               |
|mean   

In [14]:
# Demand = total quantity sold of each menu item on each truck
demand_df = (
    df
    .group_by("TRUCK_ID", 'MENU_ITEM_ID')
    .agg(F.sum("QUANTITY").alias('DEMAND'))
)

# Preview the aggregate, ordered by truck
demand_df.sort('TRUCK_ID').show()

------------------------------------------
|"TRUCK_ID"  |"MENU_ITEM_ID"  |"DEMAND"  |
------------------------------------------
|1           |15              |15651     |
|1           |18              |46580     |
|1           |19              |46755     |
|1           |11              |46912     |
|1           |13              |46438     |
|1           |16              |15926     |
|1           |10              |15686     |
|1           |17              |46561     |
|1           |14              |15749     |
|1           |12              |46767     |
------------------------------------------



In [15]:
# Average unit price of each menu item on each truck
unit_price = (
    df
    .group_by("TRUCK_ID", 'MENU_ITEM_ID')
    .agg(F.mean("unit_price").alias('unit_price'))
)

# Preview the aggregate, ordered by truck
unit_price.sort('TRUCK_ID').show()

----------------------------------------------
|"TRUCK_ID"  |"MENU_ITEM_ID"  |"UNIT_PRICE"  |
----------------------------------------------
|1           |16              |3.0000000000  |
|1           |12              |6.0000000000  |
|1           |17              |4.0000000000  |
|1           |10              |3.5000000000  |
|1           |18              |5.0000000000  |
|1           |11              |6.0000000000  |
|1           |14              |2.0000000000  |
|1           |13              |7.0000000000  |
|1           |19              |3.0000000000  |
|1           |15              |3.0000000000  |
----------------------------------------------



In [16]:
# Combine demand and average unit price into one row per (truck, menu item)
final_df = demand_df.join(
    unit_price,
    on= ['TRUCK_ID', 'MENU_ITEM_ID'],
    lsuffix = "",
    rsuffix = "_01",
)

In [17]:
# Print a sample of the combined demand / unit-price frame, ordered by TRUCK_ID
final_df.sort('TRUCK_ID').show()

---------------------------------------------------------
|"TRUCK_ID"  |"MENU_ITEM_ID"  |"DEMAND"  |"UNIT_PRICE"  |
---------------------------------------------------------
|1           |15              |15651     |3.0000000000  |
|1           |13              |46438     |7.0000000000  |
|1           |14              |15749     |2.0000000000  |
|1           |19              |46755     |3.0000000000  |
|1           |18              |46580     |5.0000000000  |
|1           |16              |15926     |3.0000000000  |
|1           |17              |46561     |4.0000000000  |
|1           |10              |15686     |3.5000000000  |
|1           |12              |46767     |6.0000000000  |
|1           |11              |46912     |6.0000000000  |
---------------------------------------------------------



In [18]:
# Scaling
#
# Standardise the feature columns with StandardScaler and re-attach the
# unscaled DEMAND target, producing `final_scaled` for the modelling cells.
#
# NOTE(review): TRUCK_ID and MENU_ITEM_ID are identifiers but are scaled and
# used as numeric features here - confirm this is intended.

# Pull the Snowpark result into pandas under a new name. `final_df` itself is
# left untouched so this cell can be re-run without restarting the kernel.
final_pdf = final_df.to_pandas()

# Separate the target from the features without mutating `final_pdf`
target = final_pdf['DEMAND']
features = final_pdf.drop(columns='DEMAND')

scaler = StandardScaler()
final_scaled = pd.DataFrame(
    scaler.fit_transform(features),
    columns=features.columns,
    # Carry the original row labels: `assign` below aligns the target by index.
    index=features.index,
)

#assign target variable back to scaled data frame
final_scaled = final_scaled.assign(DEMAND = target)
final_scaled.head()


Unnamed: 0,TRUCK_ID,MENU_ITEM_ID,UNIT_PRICE,DEMAND
0,-1.030641,-1.452996,-0.034357,119234
1,-1.007548,-0.712701,-0.80861,89453
2,-0.922874,1.687647,0.739895,272896
3,-1.022943,-1.116498,2.675527,152353
4,-0.953664,0.745454,0.933458,395262


In [19]:
# Train / test split
# Model output (y) is DEMAND; model inputs (X) are all remaining columns
y = final_scaled["DEMAND"]
X = final_scaled.drop(columns='DEMAND')

# Hold out 30% of the rows for testing and train on the other 70%.
# random_state is fixed so the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1,
)

In [22]:
# Linear regression baseline: fit on the training split, then report RMSE and
# R-squared on both splits.
# LinearRegression, mean_squared_error, r2_score and sqrt all come from the
# notebook's import cell, so nothing is imported here.

# Create the model "lr" and fit it to the training set
lr = LinearRegression()
lr.fit(X_train, y_train)

# Make predictions on the training and testing sets
train_predictions = lr.predict(X_train)
test_predictions = lr.predict(X_test)

# Calculate RMSE (square root of the mean squared error) for both sets
train_rmse = sqrt(mean_squared_error(y_train, train_predictions))
test_rmse = sqrt(mean_squared_error(y_test, test_predictions))

# Calculate R-squared values for both sets
train_r2 = r2_score(y_train, train_predictions)
test_r2 = r2_score(y_test, test_predictions)

# Print the results
print('Train RMSE:', train_rmse)
print('Test RMSE:', test_rmse)
print('Train R-squared:', train_r2)
print('Test R-squared:', test_r2)


Train RMSE: 338037.1777949491
Test RMSE: 317406.9998558533
Train R-squared: 0.24858227101314023
Test R-squared: 0.26189515395976315
