Connect to Snowflake

In [1]:
from snowflake.snowpark import Session
from snowflake.ml.utils.connection_params import SnowflakeLoginOptions
# Resolve the login options stored under the name "test_conn", then open
# the Snowpark session that every later cell reuses.
connection_params = SnowflakeLoginOptions("test_conn")
session = Session.builder.configs(connection_params).create()

SnowflakeLoginOptions() is in private preview since 0.2.0. Do not use it in production. 


Create a test DataFrame with 10M rows of random integer values in the range 1..7

In [2]:
import pandas as pd
import numpy as np
from snowflake.snowpark.functions import call_udf

# Seeded generator so that every run of the notebook benchmarks the same data.
rng = np.random.default_rng(42)

# 10M rows x 3 columns of integers drawn uniformly from 1..7.
# endpoint=True makes the upper bound inclusive, so the value 7 can occur
# (an exclusive upper bound of 7 would only ever produce 1..6).
random_pd = pd.DataFrame(
    rng.integers(low=1, high=7, size=(10_000_000, 3), endpoint=True),
    columns=['invoice_num', 'trx_num', 'weekday'])

# Upload the pandas frame to Snowflake; `df` is the Snowpark DataFrame that
# the later cells operate on.
df = session.createDataFrame(random_pd,
    schema=['invoice_num', 'trx_num', 'weekday'])
df.show()

-----------------------------------------
|"invoice_num"  |"trx_num"  |"weekday"  |
-----------------------------------------
|3              |2          |4          |
|1              |5          |3          |
|1              |4          |5          |
|6              |1          |1          |
|3              |3          |6          |
|6              |5          |5          |
|4              |2          |6          |
|1              |4          |3          |
|3              |4          |6          |
|5              |2          |2          |
-----------------------------------------



Create UDF (lookup table cached with cachetools), to get the name of a weekday by number 1..7

In [5]:
import cachetools
import pickle, sys
from snowflake.snowpark.types import StringType, IntegerType

# Attach the staged pickle file as a UDF import; extract_name() below opens
# it from the UDF's import directory under the file name alldays.pkl.
session.add_import("@int_stage/alldays.pkl")

@cachetools.cached(cache={})
def extract_name() -> dict:
    """Load the pickled lookup table shipped with the UDF as alldays.pkl.

    The function takes no arguments and is wrapped in ``cachetools.cached``,
    so the file is opened and unpickled on the first call only; subsequent
    calls return the object held in the cache.

    Returns:
        The object unpickled from alldays.pkl (annotated as a dict; the
        caller indexes it by weekday number).

    Raises:
        KeyError: if ``sys._xoptions`` has no "snowflake_import_directory"
            entry.
        OSError: if alldays.pkl cannot be opened in the import directory.
    """
    import os  # local import: the function carries its own dependency

    import_dir = sys._xoptions["snowflake_import_directory"]
    # os.path.join yields the same path when import_dir already ends with a
    # separator and still works when it does not.
    file_path = os.path.join(import_dir, "alldays.pkl")

    # NOTE(security): pickle.load can execute arbitrary code while
    # unpickling; only stage files that come from a trusted source.
    with open(file_path, 'rb') as file:
        day_names = pickle.load(file)
    return day_names

def cachetools_test(key: int) -> str:
    """Return the entry stored under ``key`` in the table from extract_name().

    Args:
        key: Lookup key (the weekday number passed in by the UDF call).

    Returns:
        The value the lookup table holds for ``key``.

    Raises:
        KeyError: if the table is a dict and ``key`` is not present.
    """
    day_names = extract_name()
    return day_names[key]

# cachetools is imported by the UDF code, so it is added to the session's
# package list before registration.
session.add_packages("cachetools")

# Register cachetools_test as the permanent UDF "cachetools_test", replacing
# any existing definition; the UDF files are uploaded to the int_stage stage.
# NOTE(review): the handle is called `udf_nocache` although the function it
# wraps relies on a cachetools-cached loader - consider renaming once all
# usages of the variable are known.
udf_nocache = session.udf.register(
    func=cachetools_test,
    name="cachetools_test",
    input_types=[IntegerType()],
    return_type=StringType(),
    is_permanent=True,
    replace=True,
    stage_location='int_stage',
)



Call the UDF 10M times and save the result as a table, with WEEKDAYNAME as the last (newly added) column

In [6]:
from datetime import datetime

# Time the full round trip: apply the registered UDF to every row and
# persist the result as the table "cachetools_test".
started_at = datetime.now()

# Add a 'weekdayname' column computed by the UDF from the "weekday" column
# (cast to int to match the UDF's declared input type).
with_names = df.withColumn(
    'weekdayname',
    call_udf('cachetools_test', df['"weekday"'].astype('int')))
with_names.write.mode('overwrite').save_as_table("cachetools_test")

finished_at = datetime.now()
elapsed = finished_at - started_at
print(f"{elapsed.total_seconds()} seconds")

7.711658 seconds
