# IMDB Reviews Sentiment Analysis

Connect to the Snowflake IMDB database

In [1]:
import os
import gzip
import shutil
import sklearn.feature_extraction.text as txt
from sklearn import svm
from sklearn.metrics import accuracy_score
from sklearn.metrics import r2_score
from joblib import dump, load
from snowflake.snowpark import functions as fn
from snowflake.snowpark.functions import sproc
from snowflake.snowpark.types import Variant

from snowflake.snowpark.session import Session
from snowflake.ml.utils.connection_params import SnowflakeLoginOptions

# Build the login options for the "sanju" connection.
# NOTE(review): presumably "sanju" names a connection profile in the local
# Snowflake client configuration — confirm.
conn = SnowflakeLoginOptions("sanju")
# Override the database so the session targets IMDB.
conn["database"] = "IMDB"
# Create the Snowpark session that the later cells use (deployment and calls).
session = Session.builder.configs(conn).create()
# Set the session's query tag to identify this notebook's queries.
session.query_tag = "imdb_sentiment_1"

SnowflakeLoginOptions() is in private preview since 0.2.0. Do not use it in production. 


Model training:

In [2]:
# Function to train the IMDB sentiment analysis model
def train_imdb_model(session: Session, train_dataset: str, tmp_dir: str):
    """Train a bag-of-words LinearSVC sentiment classifier and stage its artifacts.

    The fitted ``CountVectorizer`` and ``LinearSVC`` are dumped with joblib into
    ``tmp_dir`` and uploaded to the ``@models`` stage (the upload gzips them).

    Args:
        session: Active Snowpark session.
        train_dataset: Name of the table holding ``REVIEW`` and ``SENTIMENT``
            columns. Falls back to ``"TRAIN_DATASET"`` when empty or ``None``.
        tmp_dir: Local scratch directory for the joblib files (with or without
            a trailing separator).

    Returns:
        dict with ``"STATUS"`` and ``"R2_Score_Train"``. NOTE: despite its name,
        ``R2_Score_Train`` holds the value of ``LinearSVC.score`` on the training
        data (mean accuracy for classifiers); the key is kept so existing
        consumers of the result keep working.
    """
    # Use the table the caller asked for instead of a hard-coded name.
    df = session.table(train_dataset or "TRAIN_DATASET")
    # Encode the label: "positive" -> 1, anything else -> 2.
    df_flag = df.withColumn("SENTIMENT_FLAG",
        fn.when(df.SENTIMENT == "positive", 1).otherwise(2))

    # Materialise the data ONCE. Two separate toPandas() calls run two queries
    # with no guaranteed row order, so reviews and labels could be misaligned.
    train_pdf = df_flag.toPandas()
    train_x = train_pdf.REVIEW.values
    train_y = train_pdf.SENTIMENT_FLAG.values
    df_flag.show()

    # CountVectorizer settings:
    #   token_pattern - words, including contractions such as "don't", "it's".
    #   ngram_range   - (1, 2): unigrams and bigrams ("good", "good movie").
    #   analyzer      - 'word': tokenize into words rather than characters.
    #   max_df        - 0.02: drop terms present in more than 2% of documents.
    #   min_df        - 1 / n_documents: keep terms seen in at least one
    #                   document, i.e. rare terms are not filtered out.
    #   vocabulary    - None: learn the vocabulary from the training data.
    #   binary        - True: record presence/absence instead of counts.
    vector = txt.CountVectorizer(
        token_pattern="[\\w']+\\w\\b", ngram_range=(1, 2), analyzer='word',
        max_df=0.02, min_df=1.0 / len(train_x), vocabulary=None, binary=True)
    bow = vector.fit_transform(train_x)

    # os.path.join tolerates tmp_dir with or without a trailing separator.
    filename = os.path.join(tmp_dir, 'imdb_review_vector.joblib')
    dump(vector, filename, compress=True)
    session.file.put(filename, "@models", auto_compress=True, overwrite=True)

    svm_model = svm.LinearSVC(C=1.8, max_iter=100)
    svm_model.fit(bow, train_y)
    filename = os.path.join(tmp_dir, 'imdb_review_svm_model.joblib')
    dump(svm_model, filename, compress=True)
    session.file.put(filename, "@models", auto_compress=True, overwrite=True)

    return {
        "STATUS": "SUCCESS",
        # Mean training accuracy (see docstring); key name kept for compatibility.
        "R2_Score_Train": str(svm_model.score(bow, train_y)),
    }


Evaluate the Model:

In [3]:
# Function to test the IMDB sentiment analysis model
def test_imdb_model(session: Session, test_dataset: str, tmp_dir: str):
    """Evaluate the staged vectorizer and SVM model on a labelled table.

    Downloads both artifacts from the ``@models`` stage into ``tmp_dir``,
    gunzips them, loads them with joblib and scores predictions against the
    labels derived from the ``SENTIMENT`` column.

    Args:
        session: Active Snowpark session.
        test_dataset: Name of the table holding ``REVIEW`` and ``SENTIMENT``
            columns. Falls back to ``"TEST_DATASET"`` when empty or ``None``.
        tmp_dir: Local scratch directory for the downloaded artifacts.

    Returns:
        dict with ``"accuracy"`` and ``"r2_score"`` as strings. ``r2_score`` is
        computed on the integer class ids (1/2); it is a regression metric and
        is only kept so existing consumers of the result keep working.
    """
    vector_name = "imdb_review_vector.joblib"
    model_name = "imdb_review_svm_model.joblib"

    # Download each artifact, strip the gzip layer added by the stage upload,
    # then load it. NOTE: joblib.load unpickles — only use with a trusted stage.
    loaded = {}
    for artifact in (vector_name, model_name):
        session.file.get(f"@models/{artifact}", tmp_dir)
        gz_path = os.path.join(tmp_dir, f"{artifact}.gz")
        plain_path = os.path.join(tmp_dir, artifact)
        with gzip.open(gz_path, 'rb') as f_in, open(plain_path, 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)
        loaded[artifact] = load(plain_path)

    vector = loaded[vector_name]
    model = loaded[model_name]

    # Use the table the caller asked for instead of a hard-coded name.
    df_test = session.table(test_dataset or "TEST_DATASET")

    # Same label encoding as training: "positive" -> 1, anything else -> 2.
    df_test_flag = df_test.withColumn("SENTIMENT_FLAG",
        fn.when(df_test.SENTIMENT == "positive", 1).otherwise(2))

    # Materialise the data ONCE. Two separate toPandas() calls run two queries
    # with no guaranteed row order, so reviews and labels could be misaligned.
    test_pdf = df_test_flag.toPandas()
    test_x = test_pdf.REVIEW.values
    test_y = test_pdf.SENTIMENT_FLAG.values

    # Vectorize with the vocabulary learned at training time, then predict.
    test_bow = vector.transform(test_x)
    predictions = model.predict(test_bow)

    accuracy = accuracy_score(test_y, predictions)
    r2 = r2_score(test_y, predictions)

    return { "accuracy": str(accuracy), "r2_score": str(r2) }


Deploy the functions as Snowflake stored procedures

In [None]:
# Select the warehouse, database and schema for the session before the
# stored procedures below are registered.
session.use_warehouse("ADHOC_WH")
session.use_database("IMDB")
session.use_schema("PUBLIC")
@sproc(
    name='train_imdb_sp',
    is_permanent=True,
    stage_location='@files',
    replace=True,
    imports=[],
    packages=[
        'snowflake-snowpark-python',
        'scikit-learn',
        'pandas',
        'numpy',
        'nltk',
        'joblib',
        'cachetools',
    ],
)
def train_imdb_sp(session: Session, train_dataset_name: str, tmp_dir: str) -> Variant:
    """Stored-procedure entry point that delegates to ``train_imdb_model``.

    Registered as the permanent procedure ``train_imdb_sp`` (replacing any
    existing one) with its code staged at ``@files``.
    """
    training_summary = train_imdb_model(session, train_dataset_name, tmp_dir)
    return training_summary

@sproc(
    name='test_imdb_sp',
    is_permanent=True,
    stage_location='@files',
    replace=True,
    imports=[],
    packages=[
        'snowflake-snowpark-python',
        'scikit-learn',
        'pandas',
        'numpy',
        'nltk',
        'joblib',
        'cachetools',
    ],
)
def test_imdb_sp(session: Session, test_dataset_name: str, tmp_dir: str) -> Variant:
    """Stored-procedure entry point that delegates to ``test_imdb_model``.

    Registered as the permanent procedure ``test_imdb_sp`` (replacing any
    existing one) with its code staged at ``@files``.
    """
    evaluation_summary = test_imdb_model(session, test_dataset_name, tmp_dir)
    return evaluation_summary



Model Training / Test results:

In [6]:

# Run the training procedure and show what it returned.
print("Model Training Results:")
training_result = session.call("train_imdb_sp", "TRAIN_DATASET", "/tmp/")
print(training_result)

# List the contents of the models stage after training.
staged_files = session.sql("LS @MODELS")
staged_files.show()

# Run the evaluation procedure and show what it returned.
print("Model Test Results:")
test_result = session.call("test_imdb_sp", "TEST_DATASET", "/tmp/")
print(test_result)



Model Training Results:
{
  "R2_Score_Train": "1.0",
  "STATUS": "SUCCESS"
}
------------------------------------------------------------------------------------------------------------------------
|"name"                                  |"size"    |"md5"                             |"last_modified"                |
------------------------------------------------------------------------------------------------------------------------
|models/imdb_review_svm_model.joblib.gz  |10823392  |3c282adf2033c7db15f79e511b37f158  |Mon, 28 Apr 2025 23:48:59 GMT  |
|models/imdb_review_vector.joblib.gz     |27844896  |6e2f6c26089457c3539342cc1bf63430  |Mon, 28 Apr 2025 23:48:52 GMT  |
------------------------------------------------------------------------------------------------------------------------

Model Test Results:
{
  "accuracy": "1.0",
  "r2_score": "1.0"
}
