Connect to Snowflake

In [1]:
# see https://github.com/Snowflake-Labs/sfguide-snowpark-python-top-three-tips-for-optimal-performance/blob/main/lab2_vectorized_udfs.ipynb

from snowflake.snowpark import Session
from snowflake.ml.utils.connection_params import SnowflakeLoginOptions
# Open the Snowpark session used by every cell below, with login options
# looked up under the "test_conn" connection name.
session = (
    Session.builder
    .configs(SnowflakeLoginOptions("test_conn"))
    .create()
)

SnowflakeLoginOptions() is in private preview since 0.2.0. Do not use it in production. 


Calculate the mean and standard deviation of the account balance (C_ACCTBAL) across all customers

In [2]:
from snowflake.snowpark.functions import avg, stddev, udf
from snowflake.snowpark.types import PandasSeries

# Customer table from the shared TPC-H sample database.
df = session.table("SNOWFLAKE_SAMPLE_DATA.TPCH_SF100.CUSTOMER")
print(f"count: {df.count()}")

# Fetch both statistics with one aggregation query instead of issuing a
# separate query (and round trip) per statistic.
stats_row = df.agg(avg("C_ACCTBAL"), stddev("C_ACCTBAL")).collect()[0]

# Cast to plain Python floats: the UDFs defined later combine these values
# with float inputs.
df_mean = float(stats_row[0])
print(f"mean: {df_mean}")
df_stddev = float(stats_row[1])
print(f"stddev: {df_stddev}")

count: 15000000
mean: 4498.55193071
stddev: 3175.397870825572


Create similar regular and vectorized UDFs

In [3]:
@udf()
def udf_mean_stddev(inp: float) -> float:
    """Regular (one value per call) UDF used as the benchmark baseline.

    Shifts ``inp`` by the notebook-level ``df_mean`` and ``df_stddev`` and
    scales the result by 10000.

    Args:
        inp: A single account balance value.

    Returns:
        ``(inp - df_mean + df_stddev) * 10000.0``.
    """
    shifted = inp - df_mean + df_stddev
    return shifted * 10000.0

@udf()
def udf_mean_stddev_vect(inp: PandasSeries[float]) -> PandasSeries[float]:
    """Vectorized counterpart of ``udf_mean_stddev``.

    Applies the same arithmetic to a whole pandas Series at once, using the
    notebook-level ``df_mean`` and ``df_stddev``.

    Args:
        inp: A Series of account balance values.

    Returns:
        A Series holding ``(inp - df_mean + df_stddev) * 10000.0`` element-wise.
    """
    shifted = inp - df_mean + df_stddev
    return shifted * 10000.0

Benchmark the regular UDF against the vectorized UDF

In [4]:
from datetime import datetime
from time import perf_counter

def _time_udf_query(label, udf_fn):
    """Time one full query that applies ``udf_fn`` to C_ACCTBAL and averages it.

    Args:
        label: Name printed in front of the measured duration.
        udf_fn: Registered UDF to apply to the ``C_ACCTBAL`` column of ``df``.

    Prints:
        ``"<label>: <elapsed> seconds"`` with microsecond precision.
    """
    # perf_counter is a monotonic, high-resolution clock, so the measurement
    # is not distorted by wall-clock adjustments during the query.
    started = perf_counter()
    # The trailing aggregation forces the UDF to run over every row while
    # keeping the collected result down to a single value.
    df.select(udf_fn("C_ACCTBAL").alias("bal_from_mean")).agg(avg("bal_from_mean")).collect()
    elapsed = perf_counter() - started
    print(f"{label}: {elapsed:.6f} seconds")


_time_udf_query("udf_mean_stddev", udf_mean_stddev)

_time_udf_query("udf_mean_stddev_vect", udf_mean_stddev_vect)

udf_mean_stddev: 7.045351 seconds
udf_mean_stddev_vect: 3.415198 seconds
