Connect to Snowflake for the IMDB datasets

In [1]:
from snowflake.snowpark.session import Session
from snowflake.ml.utils.connection_params import SnowflakeLoginOptions

# Start from the login options returned for "test_conn" and point them at the IMDB database.
pars = {**SnowflakeLoginOptions("test_conn"), "database": "IMDB"}

# Open the Snowpark session from those options.
session = (
    Session.builder
    .configs(pars)
    .create()
)

# Tag every query issued through this session.
session.query_tag = "sentiment-1"

SnowflakeLoginOptions() is in private preview since 0.2.0. Do not use it in production. 


Isolate all code into a Python function

In [2]:
def train_imdb(session: Session, train_dataset_name: str, tmp_folder: str):
    """Train a bag-of-words sentiment classifier from a Snowflake table and stage it.

    Reads ``train_dataset_name`` through ``session``, adds a numeric
    ``SENTIMENT_FLAG`` column (1 when ``SENTIMENT == "positive"``, otherwise 2),
    fits a binary ``CountVectorizer`` on the ``REVIEW`` column and a
    ``LinearSVC`` on the resulting matrix. Both fitted objects are dumped with
    joblib under ``tmp_folder`` and uploaded to the ``@models`` stage.

    Args:
        session: Snowpark session used to read the table and upload the files.
        train_dataset_name: Name of the table providing the ``REVIEW`` and
            ``SENTIMENT`` columns.
        tmp_folder: Local directory for the intermediate ``.joblib`` files.
            A trailing path separator is optional.

    Returns:
        A dict with ``"STATUS"`` and ``"R2 Score Train"``. The second key is
        kept for backward compatibility; its value is
        ``str(model.score(bow, train_y))`` on the training data, which for a
        classifier is mean accuracy rather than an R² value.

    Raises:
        ValueError: If the table contains no rows.
    """
    # Imports are kept inside the function, in line with the notebook's
    # "isolate all code into a Python function" step.
    import os

    from snowflake.snowpark import functions as fn
    import sklearn.feature_extraction.text as txt
    from sklearn import svm
    from joblib import dump

    def _dump_and_stage(obj, path):
        # Serialise a fitted object locally, then upload it to the @models stage.
        dump(obj, path, compress=True)
        session.file.put(path, "@models", auto_compress=True, overwrite=True)

    df = session.table(train_dataset_name)
    df_flag = df.with_column("SENTIMENT_FLAG",
        fn.when(df.SENTIMENT == "positive", 1).otherwise(2))

    # Materialise the table exactly once: two separate to_pandas() calls run
    # two queries, and nothing guarantees both return rows in the same order,
    # which could pair reviews with the wrong labels.
    train_pdf = df_flag.to_pandas()
    train_x = train_pdf.REVIEW.values
    train_y = train_pdf.SENTIMENT_FLAG.values
    df_flag.show()

    if len(train_x) == 0:
        # Without this guard the min_df expression below fails with an
        # unhelpful ZeroDivisionError.
        raise ValueError(
            f'Training table {train_dataset_name!r} contains no rows')

    # os.path.join works whether or not tmp_folder ends with a separator.
    filename = os.path.join(tmp_folder, 'vect_review3.joblib')
    print(f'Building Sparse Matrix into {filename}...')
    vector = txt.CountVectorizer(
        token_pattern="[\\w']+\\w\\b", ngram_range=(1, 2), analyzer='word',
        max_df=0.02, min_df=1.0 / len(train_x), vocabulary=None, binary=True)
    bow = vector.fit_transform(train_x)
    _dump_and_stage(vector, filename)

    filename = os.path.join(tmp_folder, 'model_review3.joblib')
    print(f'Fitting model into {filename}...')
    model = svm.LinearSVC(C=1.8, max_iter=100)
    model.fit(bow, train_y)
    _dump_and_stage(model, filename)

    # Scored on the same data the model was fitted on (training score only).
    train_score = model.score(bow, train_y)
    return { "STATUS": "SUCCESS", "R2 Score Train": str(train_score) }

Deploy all the code as a Snowflake stored procedure

In [3]:
from snowflake.snowpark.functions import sproc
from snowflake.snowpark.types import Variant

@sproc(
    name='train_imdb_sp',
    replace=True,
    is_permanent=True,
    stage_location='@files',
    imports=[],
    packages=[
        'snowflake-snowpark-python',
        'scikit-learn',
        'pandas',
        'numpy',
        'nltk',
        'joblib',
        'cachetools',
    ],
)
def train_imdb_sp(session: Session, train_dataset_name: str, tmp_folder: str) -> Variant:
    """Stored-procedure entry point; forwards its arguments unchanged to ``train_imdb``."""
    return train_imdb(
        session=session,
        train_dataset_name=train_dataset_name,
        tmp_folder=tmp_folder,
    )



Call the Snowflake stored procedure and list the files on the @MODELS stage

In [4]:
# Invoke the stored procedure by name; the two string arguments after the
# procedure name are forwarded to it in order.
ret = session.call(
    "train_imdb_sp",
    "TRAIN_DATASET",
    "/tmp/",
)
print(ret)

# Show what is on the @MODELS stage after the call.
session.sql(
    "LS @MODELS"
).show()

----------------------------------------------------------------------------------------------------------------
|"name"                          |"size"    |"md5"                             |"last_modified"                |
----------------------------------------------------------------------------------------------------------------
|models/model_review.joblib.gz   |10820048  |1c63425ac807b5048c1e1f7ddc72da23  |Wed, 24 Apr 2024 15:52:14 GMT  |
|models/model_review1.joblib.gz  |10820800  |38506eb60f34f5d16a4c9d68384ef3ee  |Wed, 24 Apr 2024 18:08:37 GMT  |
|models/model_review2.joblib.gz  |10831088  |f4f7de7178e43463789bf213a44d96f5  |Wed, 24 Apr 2024 19:28:46 GMT  |
|models/model_review3.joblib.gz  |10822480  |43d3eba9ea59d5c444348793f3d9c4c0  |Wed, 24 Apr 2024 19:38:02 GMT  |
|models/vect_review.joblib.gz    |27852416  |92d019222e89c2db309379b0584dc958  |Wed, 24 Apr 2024 15:51:49 GMT  |
|models/vect_review1.joblib.gz   |27852432  |72b6c79a63956493b3b75f1d90c9de0a  |Wed, 24 Apr 2024