In [1]:
from snowflake.snowpark import functions as fn
from snowflake.snowpark.functions import sproc
from snowflake.snowpark.session import Session
from snowflake.ml.utils.connection_params import SnowflakeLoginOptions
from snowflake.snowpark.types import Variant

# Load the login options for the "test_conn" connection and override the database.
pars = SnowflakeLoginOptions("test_conn")
pars["database"] = "IMDB"
session = Session.builder.configs(pars).create()

# Preview the numeric label encoding: 1 for "positive" reviews, 2 for everything else.
train_dataset = session.table("TRAIN_DATASET")
sentiment_flag = fn.when(train_dataset["SENTIMENT"] == "positive", 1).otherwise(2)
train_dataset_flag = train_dataset.with_column("SENTIMENT_FLAG", sentiment_flag)
train_dataset_flag.show()

SnowflakeLoginOptions() is in private preview since 0.2.0. Do not use it in production. 


---------------------------------------------------------------------------------------
|"REVIEW"                                            |"SENTIMENT"  |"SENTIMENT_FLAG"  |
---------------------------------------------------------------------------------------
|"In Victorian times a father is separated from ...  |positive     |1                 |
|"When this film gets it right it really gets it...  |positive     |1                 |
|"What this film has is its realism , you really...  |negative     |2                 |
|"""Like the first touch of pleasure and guilt, ...  |positive     |1                 |
|"This is a weird and compelling film. The topic...  |positive     |1                 |
|"Although I am not a Michael Jackson fan, I lik...  |positive     |1                 |
|"Tigerland follows the lives of a group of rece...  |positive     |1                 |
|"To all the reviewers on this page, I would hav...  |positive     |1                 |
|"I really should give this stin

In [2]:
# Reset the session's imports and packages so re-running this cell does not
# accumulate entries from earlier runs.
session.clear_imports()
session.clear_packages()

# Packages registered on the session for stored procedures created afterwards.
sproc_packages = (
    "snowflake-snowpark-python",
    "scikit-learn",
    "pandas",
    "numpy",
    "nltk",
    "joblib",
    "cachetools",
)
session.add_packages(*sproc_packages)



In [4]:
# Alternative: register the stored procedure declaratively by uncommenting the @sproc decorator below.
# @sproc(name='train_model_review_pipline', is_permanent=True, stage_location='@files', replace=True)
def train_model_review_pipline(session: Session, train_dataset_name: str) -> Variant:
    """Train a bag-of-words sentiment classifier and upload its artifacts to ``@models``.

    Reads the table ``train_dataset_name``, derives a numeric ``SENTIMENT_FLAG``
    column (1 when SENTIMENT is "positive", 2 otherwise), fits a binary
    ``CountVectorizer`` on the REVIEW text and a ``LinearSVC`` on the resulting
    matrix, and uploads both joblib dumps to the ``@models`` stage.

    Args:
        session: Snowpark session used to read the table and upload the files.
        train_dataset_name: Name of a table exposing REVIEW and SENTIMENT columns.

    Returns:
        A dict with ``"STATUS"`` and ``"R2 Score Train"``. The latter is the
        string form of ``model.score`` on the training data, which for a
        ``LinearSVC`` is mean accuracy rather than R^2; the key name is kept
        unchanged so existing consumers keep working.

    Raises:
        ValueError: If the table contains no rows.
    """
    # Imported inside the function so the names resolve when the body runs as a
    # stored procedure (presumably server-side, using the session's packages).
    import sklearn.feature_extraction.text as txt
    from sklearn import svm
    from joblib import dump

    data = session.table(train_dataset_name)
    flag = data.withColumn("SENTIMENT_FLAG",
        fn.when(data.SENTIMENT == "positive", 1).otherwise(2))

    # Materialize the table exactly once. Calling toPandas() separately for the
    # features and the labels runs the query twice, and without an ORDER BY the
    # two results are not guaranteed to share a row order, which would misalign
    # reviews and labels.
    train_df = flag.toPandas()
    train_x = train_df.REVIEW.values
    train_y = train_df.SENTIMENT_FLAG.values

    if len(train_x) == 0:
        # Fail with a clear message instead of a ZeroDivisionError in min_df below.
        raise ValueError(f"Table {train_dataset_name!r} has no rows to train on")

    filename = '/tmp/vect_review.joblib'
    print(f'Building Sparse Matrix into {filename}...')
    vec = txt.CountVectorizer(
        token_pattern="[\\w']+\\w\\b", ngram_range=(1, 2), analyzer='word',
        # min_df as a proportion of 1/n_documents, i.e. one document.
        max_df=0.02, min_df=1.0 / len(train_x), vocabulary=None, binary=True)
    bow = vec.fit_transform(train_x)
    dump(vec, filename, compress=True)
    session.file.put(filename, "@models", auto_compress=True, overwrite=True)

    filename = '/tmp/model_review.joblib'
    print(f'Fitting model into {filename}...')
    model = svm.LinearSVC(C=1.8, max_iter=100)
    model.fit(bow, train_y)
    dump(model, filename, compress=True)
    session.file.put(filename, "@models", auto_compress=True, overwrite=True)

    # Scored on the same data the model was fitted on, so this is a training
    # metric, not an estimate of generalization.
    train_score = model.score(bow, train_y)
    return { "STATUS": "SUCCESS", "R2 Score Train": str(train_score) }

Register the training function as a stored procedure, then test and call it.

In [5]:
# Optional local smoke test: uncomment to run the training function directly
# before registering it.
# print("Testing train function...")
# train_model_review_pipline(session, train_dataset_name="TRAIN_DATASET")

sproc_name = "train_model_review_pipline"

print("Registering stored proc...")
session.sproc.register(
    func=train_model_review_pipline,
    name=sproc_name,
    is_permanent=True,
    stage_location='@files',
    replace=True,
)

print("Executing stored proc...")
# Kept as the last expression so the procedure's result is shown as the cell output.
session.call(sproc_name, "TRAIN_DATASET")

Registering stored proc...
Executing stored proc...


'{\n  "R2 Score Train": "1.0",\n  "STATUS": "SUCCESS"\n}'