Connect to Snowflake and get the IMDB test dataset

In [1]:
# see https://github.com/Snowflake-Labs/snowpark-python-demos/blob/main/snowpark_nlp_ml_demo/notebook/Sentiment_Analysis_NLP_with_Snowpark_ML.ipynb

import cachetools
from snowflake.snowpark import functions as fn
from snowflake.snowpark.functions import col, udf
from snowflake.snowpark.types import PandasDataFrame, PandasSeries
from snowflake.snowpark.session import Session
from snowflake.ml.utils.connection_params import SnowflakeLoginOptions

# Build the login parameters for the "test_conn" connection and target the IMDB database.
pars = SnowflakeLoginOptions("test_conn")
pars["database"] = "IMDB"
session = Session.builder.configs(pars).create()
session.query_tag = "sentiment-serving"

# Add a numeric label next to the textual one: 1 for "positive", 2 for anything else.
test_dataset = session.table("TEST_DATASET")
is_positive = test_dataset["SENTIMENT"] == "positive"
df = test_dataset.with_column(
    "SENTIMENT_FLAG", fn.when(is_positive, 1).otherwise(2))
df.show()

SnowflakeLoginOptions() is in private preview since 0.2.0. Do not use it in production. 


---------------------------------------------------------------------------------------
|"REVIEW"                                            |"SENTIMENT"  |"SENTIMENT_FLAG"  |
---------------------------------------------------------------------------------------
|aking this film into a monumental success simpl...  |negative     |2                 |
|"I remember seeing this film years ago on, I th...  |positive     |1                 |
|"A truly masterful piece of filmmaking. It mana...  |negative     |2                 |
|"Terrible. There's no way to get around it. A s...  |negative     |2                 |
|"Take a few dark and stormy nights, fog coming ...  |negative     |2                 |
|"I enjoyed the beautiful scenery in this movie ...  |negative     |2                 |
|"*********Ten out of Ten Stars********* <br /><...  |positive     |1                 |
|"This film was total rubbish. I was sitting wat...  |negative     |2                 |
|"Lady and the Tramp II: Scamp's

Prepare the imports and packages required by the UDFs below

In [2]:
# Start from a clean slate, then attach the staged model and vectorizer files
# that the UDFs read at run time.
session.clear_imports()
for staged_file in ("@models/model_review1.joblib", "@models/vect_review1.joblib"):
    session.add_import(staged_file)

# Same for the packages made available to the UDFs.
session.clear_packages()
session.add_packages(
    "snowflake-snowpark-python",
    "scikit-learn",
    "pandas",
    "numpy",
    "nltk",
    "joblib",
    "cachetools",
)



Create a cached utility function (used by the UDFs below)

In [3]:
@cachetools.cached(cache={})
def load_file(filename):
    """Load a joblib-serialized object from the Snowflake import directory.

    Results are memoized per ``filename`` by ``cachetools.cached``, so each
    file is deserialized at most once per process.

    Args:
        filename: Name of the file inside the import directory.

    Returns:
        The object produced by ``joblib.load``, or ``None`` when no
        ``snowflake_import_directory`` option is set.
    """
    # Imports are kept local to the function body, as in the original cell.
    import joblib
    import os
    import sys

    import_dir = sys._xoptions.get("snowflake_import_directory")
    if not import_dir:
        # No import directory configured: nothing to load.
        return None

    staged_path = os.path.join(import_dir, filename)
    with open(staged_path, 'rb') as handle:
        return joblib.load(handle)

Create and test UDF for model inference

In [4]:
@udf(name='predict_review', is_permanent=True, stage_location='@files', replace=True)
def predict_review(args: list) -> float:
    """Predict the sentiment label for a single review.

    Args:
        args: Two-element list ``[REVIEW, SENTIMENT_FLAG]``. Only the
            ``REVIEW`` text is used for inference.

    Returns:
        The model's prediction for the review as a plain Python ``float``.
    """
    import pandas as pd

    # One-row frame so the review text can be addressed by column name.
    row = pd.DataFrame([args], columns=["REVIEW", "SENTIMENT_FLAG"])
    bow = load_file("vect_review1.joblib").transform(row.REVIEW.values)

    # predict() is given exactly one row, so its result holds one prediction.
    # Unwrap it explicitly: the UDF is declared to return a scalar float, and
    # relying on implicit one-element-array-to-scalar coercion is deprecated
    # in recent NumPy releases.
    prediction = load_file("model_review1.joblib").predict(bow)
    return float(prediction[0])


# alternative to register UDF
# session.udf.register(func=predict_review, name="predict_review",
#    is_permanent=True, stage_location='@files', replace=True)

# Call the single-row UDF with [REVIEW, SENTIMENT_FLAG] packed into an array,
# and show the prediction next to the known label.
udf_input = fn.array_construct(col("REVIEW"), col("SENTIMENT_FLAG"))
predicted = fn.call_udf("predict_review", udf_input).alias('PREDICTED_REVIEW')
query = df.select(df["REVIEW"], df["SENTIMENT"], df["SENTIMENT_FLAG"], predicted)
print(query.queries)
query.show()

{'queries': ['SELECT "REVIEW", "SENTIMENT", "SENTIMENT_FLAG", predict_review(array_construct("REVIEW", "SENTIMENT_FLAG")) AS "PREDICTED_REVIEW" FROM ( SELECT "REVIEW", "SENTIMENT",  CASE  WHEN ("SENTIMENT" = \'positive\') THEN 1 :: INT ELSE 2 :: INT END  AS "SENTIMENT_FLAG" FROM TEST_DATASET)'], 'post_actions': []}
------------------------------------------------------------------------------------------------------------
|"REVIEW"                                            |"SENTIMENT"  |"SENTIMENT_FLAG"  |"PREDICTED_REVIEW"  |
------------------------------------------------------------------------------------------------------------
|aking this film into a monumental success simpl...  |negative     |2                 |2.0                 |
|"I remember seeing this film years ago on, I th...  |positive     |1                 |1.0                 |
|"A truly masterful piece of filmmaking. It mana...  |negative     |2                 |2.0                 |
|"Terrible. There's no way to

Create and test an alternative vectorized UDF for batch inference

In [5]:
@udf(name="predict_review_batch", is_permanent=True, stage_location='@files', replace=True)
def predict_review_batch(df: PandasDataFrame[str]) -> PandasSeries[float]:
    """Predict sentiment labels for a whole batch of reviews at once.

    Args:
        df: Batch of input rows; the review text is read from column ``0``.

    Returns:
        The result of the model's ``predict`` call on the vectorized reviews.
    """
    # Vectorize first, then load the model: same order as the original cell.
    features = load_file("vect_review1.joblib").transform(df[0].values)
    classifier = load_file("model_review1.joblib")
    return classifier.predict(features)


# Score the reviews with the vectorized UDF and show the prediction next to
# the known label.
batch_predicted = fn.call_udf("predict_review_batch", col("REVIEW")).alias('PREDICTED_REVIEW')
query = df.select(df["REVIEW"], df["SENTIMENT"], df["SENTIMENT_FLAG"], batch_predicted)
print(query.queries)
query.show()

{'queries': ['SELECT "REVIEW", "SENTIMENT",  CASE  WHEN ("SENTIMENT" = \'positive\') THEN 1 :: INT ELSE 2 :: INT END  AS "SENTIMENT_FLAG", predict_review_batch("REVIEW") AS "PREDICTED_REVIEW" FROM TEST_DATASET'], 'post_actions': []}
------------------------------------------------------------------------------------------------------------
|"REVIEW"                                            |"SENTIMENT"  |"SENTIMENT_FLAG"  |"PREDICTED_REVIEW"  |
------------------------------------------------------------------------------------------------------------
|aking this film into a monumental success simpl...  |negative     |2                 |2.0                 |
|"I remember seeing this film years ago on, I th...  |positive     |1                 |1.0                 |
|"A truly masterful piece of filmmaking. It mana...  |negative     |2                 |2.0                 |
|"Terrible. There's no way to get around it. A s...  |negative     |2                 |2.0                 |
|"Ta