Connect to Snowflake and get the IMDB test dataset

In [1]:
import cachetools
from snowflake.snowpark import functions as fn
from snowflake.snowpark.functions import col, udf
from snowflake.snowpark.types import PandasDataFrame, PandasSeries
from snowflake.snowpark.session import Session
from snowflake.ml.utils.connection_params import SnowflakeLoginOptions

# Build the session from the named connection profile, pinned to the IMDB database.
conn = SnowflakeLoginOptions("sanju")
conn["database"] = "IMDB"
session = Session.builder.configs(conn).create()

# Select warehouse / database / schema explicitly, then tag every query
# issued by this notebook so it can be identified in query history.
session.use_warehouse("ADHOC_WH")
session.use_database("IMDB")
session.use_schema("PUBLIC")
session.query_tag = "imdb-sentiment-serving"

# Add a numeric label column: 1 when SENTIMENT is "positive", 2 otherwise.
test_dataset = session.table("TEST_DATASET")
df = test_dataset.with_column(
    "SENTIMENT_FLAG",
    fn.when(test_dataset["SENTIMENT"] == "positive", 1).otherwise(2),
)
df.show()

SnowflakeLoginOptions() is in private preview since 0.2.0. Do not use it in production. 


---------------------------------------------------------------------------------------
|"REVIEW"                                            |"SENTIMENT"  |"SENTIMENT_FLAG"  |
---------------------------------------------------------------------------------------
|aking this film into a monumental success simpl...  |negative     |2                 |
|"I remember seeing this film years ago on, I th...  |positive     |1                 |
|"A truly masterful piece of filmmaking. It mana...  |negative     |2                 |
|"Terrible. There's no way to get around it. A s...  |negative     |2                 |
|"Take a few dark and stormy nights, fog coming ...  |negative     |2                 |
|"I enjoyed the beautiful scenery in this movie ...  |negative     |2                 |
|"*********Ten out of Ten Stars********* <br /><...  |positive     |1                 |
|"This film was total rubbish. I was sitting wat...  |negative     |2                 |
|"Lady and the Tramp II: Scamp's

Prepare imports and packages for the UDFs

In [2]:
# Start from a clean slate, then attach the previously saved vectorizer and
# model files from the @models stage so the UDFs can read them at run time.
session.clear_imports()
for staged_artifact in (
    "@models/imdb_review_vector.joblib",
    "@models/imdb_review_svm_model.joblib",
):
    session.add_import(staged_artifact)

# Reset and declare the packages the UDFs are registered with.
session.clear_packages()
session.add_packages(
    "snowflake-snowpark-python",
    "scikit-learn",
    "pandas",
    "numpy",
    "nltk",
    "joblib",
    "cachetools",
)



Caching utility function for optimizing our UDFs

In [3]:
@cachetools.cached(cache={})
def load_file(filename):
    """Load a joblib-serialized object from the Snowflake import directory.

    Results are memoized by ``cachetools.cached`` keyed on ``filename``, so
    repeated calls with the same name reuse the already deserialized object
    instead of reading the file again.

    Args:
        filename: Base name of a file inside the import directory, e.g.
            ``"imdb_review_svm_model.joblib"``.

    Returns:
        The object produced by ``joblib.load`` for that file.

    Raises:
        RuntimeError: If ``snowflake_import_directory`` is not present in
            ``sys._xoptions``. The previous behavior was to return (and
            cache) ``None``, which made callers fail later with an opaque
            ``AttributeError``. Exceptions are not cached, so a later call
            is evaluated again.
    """
    # Imports stay function-local (presumably so they resolve inside the UDF
    # runtime rather than in the notebook kernel).
    import os
    import sys

    import joblib

    # snowflake_import_directory is the internal Snowflake location where the
    # previously imported files (model & vectorizer) are stored.
    import_dir = sys._xoptions.get("snowflake_import_directory")
    if not import_dir:
        raise RuntimeError(
            f"Cannot load {filename!r}: 'snowflake_import_directory' is not set; "
            "load_file() must run inside a Snowflake Python UDF."
        )

    # NOTE: joblib.load unpickles the file, which can execute arbitrary code;
    # only stage files from a trusted source.
    with open(os.path.join(import_dir, filename), 'rb') as file:
        return joblib.load(file)

Non-vectorized UDF for model inference

In [None]:
@udf(name='imdb_predict_review', is_permanent=True, stage_location='@files', replace=True)
def imdb_predict_review(review: str) -> float:
    """Predict the sentiment label for a single review (scalar UDF).

    The review is vectorized with the object stored in
    ``imdb_review_vector.joblib`` and scored with the model stored in
    ``imdb_review_svm_model.joblib``; both are obtained through ``load_file``.

    Args:
        review: Text of one review.

    Returns:
        The predicted label as a Python ``float``. The sample output in this
        notebook shows 1.0 / 2.0, matching the SENTIMENT_FLAG encoding.
    """
    import pandas as pd

    row = pd.DataFrame([review], columns=["REVIEW"])
    bow_test = load_file("imdb_review_vector.joblib").transform(row.REVIEW.values)
    prediction = load_file("imdb_review_svm_model.joblib").predict(bow_test)
    # predict() returns one label per input row. Unwrap the single element so
    # the function really returns the scalar its signature declares instead
    # of a 1-element array (implicit array-to-scalar conversion is deprecated
    # in recent NumPy releases).
    return float(prediction[0])


# Alternative to the @udf decorator: register the function explicitly, e.g.
# session.udf.register(func=imdb_predict_review, name="imdb_predict_review",
#    is_permanent=True, stage_location='@files', replace=True)

# Score every row with the scalar UDF and show it next to the true labels.
query = df.select(
    df["REVIEW"],
    df["SENTIMENT"],
    df["SENTIMENT_FLAG"],
    fn.call_udf("imdb_predict_review", col("REVIEW")).alias('PREDICTED_REVIEW'),
)
print(query.queries)
query.show()

{'queries': ['SELECT "REVIEW", "SENTIMENT",  CASE  WHEN ("SENTIMENT" = \'positive\') THEN 1 :: INT ELSE 2 :: INT END  AS "SENTIMENT_FLAG", imdb_predict_review("REVIEW") AS "PREDICTED_REVIEW" FROM TEST_DATASET'], 'post_actions': []}
------------------------------------------------------------------------------------------------------------
|"REVIEW"                                            |"SENTIMENT"  |"SENTIMENT_FLAG"  |"PREDICTED_REVIEW"  |
------------------------------------------------------------------------------------------------------------
|aking this film into a monumental success simpl...  |negative     |2                 |2.0                 |
|"I remember seeing this film years ago on, I th...  |positive     |1                 |1.0                 |
|"A truly masterful piece of filmmaking. It mana...  |negative     |2                 |2.0                 |
|"Terrible. There's no way to get around it. A s...  |negative     |2                 |2.0                 |
|"Tak

Vectorized UDF for model inference

In [6]:
@udf(name="imdb_predict_review_v", is_permanent=True, stage_location='@files', replace=True)
def imdb_predict_review_v(df: PandasDataFrame[str]) -> PandasSeries[float]:
    """Predict sentiment labels for a batch of reviews (vectorized UDF).

    Args:
        df: Pandas DataFrame whose column ``0`` holds the review strings.

    Returns:
        Whatever the loaded model's ``predict`` returns for the vectorized
        reviews (one prediction per input row is expected).
    """
    reviews = df[0].values

    # Vectorize first, then score; both objects come from the cached loader.
    features = load_file("imdb_review_vector.joblib").transform(reviews)
    return load_file("imdb_review_svm_model.joblib").predict(features)


# Score every row with the vectorized UDF and show it next to the true labels.
query = df.select(
    df["REVIEW"],
    df["SENTIMENT"],
    df["SENTIMENT_FLAG"],
    fn.call_udf("imdb_predict_review_v", col("REVIEW")).alias('PREDICTED_REVIEW'),
)
print(query.queries)
query.show()

{'queries': ['SELECT "REVIEW", "SENTIMENT",  CASE  WHEN ("SENTIMENT" = \'positive\') THEN 1 :: INT ELSE 2 :: INT END  AS "SENTIMENT_FLAG", imdb_predict_review_v("REVIEW") AS "PREDICTED_REVIEW" FROM TEST_DATASET'], 'post_actions': []}
------------------------------------------------------------------------------------------------------------
|"REVIEW"                                            |"SENTIMENT"  |"SENTIMENT_FLAG"  |"PREDICTED_REVIEW"  |
------------------------------------------------------------------------------------------------------------
|aking this film into a monumental success simpl...  |negative     |2                 |2.0                 |
|"I remember seeing this film years ago on, I th...  |positive     |1                 |1.0                 |
|"A truly masterful piece of filmmaking. It mana...  |negative     |2                 |2.0                 |
|"Terrible. There's no way to get around it. A s...  |negative     |2                 |2.0                 |
|"T