In [1]:
# see https://github.com/Snowflake-Labs/snowpark-python-demos/blob/main/snowpark_nlp_ml_demo/notebook/Sentiment_Analysis_NLP_with_Snowpark_ML.ipynb

import sklearn.feature_extraction.text as txt
from sklearn import svm
from joblib import dump
from snowflake.snowpark import functions as fn
from snowflake.snowpark.session import Session
from snowflake.ml.utils.connection_params import SnowflakeLoginOptions

Connect to Snowflake for the IMDB datasets

In [2]:
# Read the "test_conn" login options (this helper emits a private-preview
# warning) and override the target database before opening the session.
pars = SnowflakeLoginOptions("test_conn")
pars["database"] = "IMDB"

# Build the Snowpark session from the adjusted parameters, then tag every
# query it issues so this notebook's activity can be identified.
session_builder = Session.builder.configs(pars)
session = session_builder.create()
session.query_tag = "sentiment-1"

SnowflakeLoginOptions() is in private preview since 0.2.0. Do not use it in production. 


Load the training dataset

In [3]:
# Load the labelled reviews and derive a numeric target column:
# SENTIMENT_FLAG is 1 for "positive" reviews and 2 for everything else.
df = session.table("TRAIN_DATASET")
df_flag = df.withColumn("SENTIMENT_FLAG",
    fn.when(df.SENTIMENT == "positive", 1).otherwise(2))

# Materialise the table exactly once. Calling toPandas() separately for each
# column runs the query twice, and without an ORDER BY the two result sets are
# not guaranteed to come back in the same row order -- which would silently
# misalign reviews and labels (and doubles the data transfer).
train_pd = df_flag.toPandas()
train_x = train_pd.REVIEW.values
train_y = train_pd.SENTIMENT_FLAG.values

# Preview the first rows server-side.
df_flag.show()

---------------------------------------------------------------------------------------
|"REVIEW"                                            |"SENTIMENT"  |"SENTIMENT_FLAG"  |
---------------------------------------------------------------------------------------
|"In Victorian times a father is separated from ...  |positive     |1                 |
|"When this film gets it right it really gets it...  |positive     |1                 |
|"What this film has is its realism , you really...  |negative     |2                 |
|"""Like the first touch of pleasure and guilt, ...  |positive     |1                 |
|"This is a weird and compelling film. The topic...  |positive     |1                 |
|"Although I am not a Michael Jackson fan, I lik...  |positive     |1                 |
|"Tigerland follows the lives of a group of rece...  |positive     |1                 |
|"To all the reviewers on this page, I would hav...  |positive     |1                 |
|"I really should give this stin

Create a sparse binary bag-of-words matrix (unigram and bigram presence flags)

In [4]:
# Vectorizer settings, one per line:
#   token_pattern -- tokens are runs of word characters/apostrophes that end in
#                    a word character (so at least two characters long)
#   ngram_range   -- unigrams and bigrams
#   min_df        -- given as a fraction equal to one document out of the corpus
#   max_df        -- fraction of documents above which a term is discarded
#   binary        -- record presence (0/1) rather than occurrence counts
vector = txt.CountVectorizer(
    analyzer='word',
    token_pattern="[\\w']+\\w\\b",
    ngram_range=(1, 2),
    min_df=1.0 / len(train_x),
    max_df=0.02,
    binary=True,
    vocabulary=None,
)

# Learn the vocabulary and encode the training reviews in one pass.
bow = vector.fit_transform(train_x)

# Persist the fitted vectorizer; as the last expression, the list of written
# file names returned by dump() is displayed as the cell output.
dump(vector, '../../../.spool/vect_review1.joblib', compress=True)

['../../../.spool/vect_review1.joblib']

Train a linear SVM (Support Vector Machine) classifier

In [5]:
# Fit a linear SVM on the bag-of-words features.
# random_state is pinned so that a fresh-kernel rerun reproduces the same model
# (LinearSVC shuffles the data when solving the dual problem).
model = svm.LinearSVC(C=1.8, max_iter=100, random_state=0)
model.fit(bow, train_y)

# Persist the trained classifier.
dump(model, '../../../.spool/model_review1.joblib', compress=True)

# LinearSVC.score() returns mean classification accuracy, not an R^2 value, so
# the metric is labelled accordingly. It is computed on the training data
# itself and therefore says nothing about generalisation.
print({ "STATUS": "SUCCESS", "Accuracy Train": str(model.score(bow, train_y)) })



{'STATUS': 'SUCCESS', 'R2 Score Train': '1.0'}
