# IMDB Reviews Sentiment Analysis

In [None]:
import sklearn.feature_extraction.text as txt
from sklearn import svm
from sklearn.metrics import accuracy_score
from sklearn.metrics import r2_score
from joblib import dump, load
from snowflake.snowpark import functions as fn
from snowflake.snowpark.session import Session
from snowflake.ml.utils.connection_params import SnowflakeLoginOptions

Connect to the Snowflake IMDB database

In [3]:
# Resolve the login parameters for the connection named "sanju", then point
# them at the IMDB database before the session is opened.
conn = SnowflakeLoginOptions("sanju")
conn.update(database="IMDB")

# Open the Snowpark session and label the queries it issues with a tag.
session_builder = Session.builder.configs(conn)
session = session_builder.create()
session.query_tag = "imdb_sentiment_1"

SnowflakeLoginOptions() is in private preview since 0.2.0. Do not use it in production. 


Get training dataset

In [4]:
# Run this session's queries on the ad-hoc warehouse.
session.use_warehouse("ADHOC_WH")

# Encode the label as an integer flag: 1 when SENTIMENT is "positive",
# 2 for every other value.
df = session.table("TRAIN_DATASET")
df_flag = df.withColumn("SENTIMENT_FLAG",
    fn.when(df.SENTIMENT == "positive", 1).otherwise(2))

# Materialise the table exactly once. Every toPandas() call executes its own
# query, and without an ORDER BY two separate result sets are not guaranteed
# to come back in the same row order -- pulling REVIEW and SENTIMENT_FLAG in
# two calls could pair a review with another row's label (and it downloads
# the whole table twice).
train_pdf = df_flag.toPandas()
train_x = train_pdf.REVIEW.values
train_y = train_pdf.SENTIMENT_FLAG.values

# Preview the first rows of the labelled training data.
df_flag.show()


---------------------------------------------------------------------------------------
|"REVIEW"                                            |"SENTIMENT"  |"SENTIMENT_FLAG"  |
---------------------------------------------------------------------------------------
|"In Victorian times a father is separated from ...  |positive     |1                 |
|"When this film gets it right it really gets it...  |positive     |1                 |
|"What this film has is its realism , you really...  |negative     |2                 |
|"""Like the first touch of pleasure and guilt, ...  |positive     |1                 |
|"This is a weird and compelling film. The topic...  |positive     |1                 |
|"Although I am not a Michael Jackson fan, I lik...  |positive     |1                 |
|"Tigerland follows the lives of a group of rece...  |positive     |1                 |
|"To all the reviewers on this page, I would hav...  |positive     |1                 |
|"I really should give this stin

Create the bag-of-words matrix (binary presence/absence of unigrams and bigrams)

In [5]:
# CountVectorizer settings used below
# -----------------------------------
# token_pattern : a run of word characters/apostrophes that ends in a word
#                 character at a word boundary. Tokens are therefore at least
#                 two characters long, and contractions such as "don't" or
#                 "it's" stay in one piece.
# ngram_range   : (1, 2) -> unigrams and bigrams, e.g. "good" and "good movie".
# analyzer      : 'word' -> features are built from word n-grams, not characters.
# max_df        : 0.02 -> drop terms found in more than 2% of the documents
#                 (very common terms such as "the" or "and").
# min_df        : 1 / number of documents, expressed as a proportion -> a term
#                 only has to occur in a single document to be kept, so rare
#                 terms are not filtered out.
# vocabulary    : None -> learn the vocabulary from the training reviews.
# binary        : True -> record presence (1) / absence (0) of a term instead
#                 of how many times it occurs.
vectorizer_options = {
    "token_pattern": "[\\w']+\\w\\b",
    "ngram_range": (1, 2),
    "analyzer": 'word',
    "max_df": 0.02,
    "min_df": 1.0 / len(train_x),
    "vocabulary": None,
    "binary": True,
}
vector = txt.CountVectorizer(**vectorizer_options)

# Learn the vocabulary from the training reviews and encode them in one pass.
bow = vector.fit_transform(train_x)

# Persist the fitted vectorizer so the evaluation cell can reuse the exact
# same vocabulary.
dump(vector, '../../scratchpad/imdb_review_vector.joblib', compress=True)

['../../scratchpad/imdb_review_vector.joblib']

Train a linear SVM (Support Vector Machine) classifier

In [6]:
# Fit a linear SVM on the bag-of-words features and persist the fitted model
# so the evaluation cell can load it back.
svm_model = svm.LinearSVC(C=1.8, max_iter=100)
svm_model.fit(bow, train_y)
dump(svm_model, '../../scratchpad/imdb_review_svm_model.joblib', compress=True)

# LinearSVC.score() returns the mean accuracy on the given data, not an R2
# score, so the reported key is labelled as accuracy. This is the score on the
# data the model was trained on, so it says nothing about generalisation.
print({ "STATUS": "SUCCESS", "Accuracy Train": str(svm_model.score(bow, train_y)) })



{'STATUS': 'SUCCESS', 'R2 Score Train': '1.0'}


Evaluate the model on the test dataset

In [None]:
# Load the previously saved vectorizer and model
vector = load('../../scratchpad/imdb_review_vector.joblib')
model = load('../../scratchpad/imdb_review_svm_model.joblib')

# Load the test data from Snowflake (TEST_DATASET table)
df_test = session.table("TEST_DATASET")

# Add the sentiment flag for test data, using the same encoding as training:
# 1 when SENTIMENT is "positive", 2 otherwise
df_test_flag = df_test.withColumn("SENTIMENT_FLAG", 
    fn.when(df_test.SENTIMENT == "positive", 1).otherwise(2))

# Materialise the test table exactly once. Every toPandas() call executes its
# own query, and without an ORDER BY two separate result sets are not
# guaranteed to share a row order -- extracting the reviews and the labels in
# two calls could score predictions against the wrong labels.
test_pdf = df_test_flag.toPandas()
test_x = test_pdf.REVIEW.values
test_y = test_pdf.SENTIMENT_FLAG.values

# Transform the test data using the same vectorizer (transform only, so the
# vocabulary learned from the training data is reused unchanged)
test_bow = vector.transform(test_x)

# Make predictions on the test data
predictions = model.predict(test_bow)

# Calculate accuracy on the test data
# NOTE(review): a perfect score on held-out data is unusual -- confirm that
# TEST_DATASET does not overlap with TRAIN_DATASET.
accuracy = accuracy_score(test_y, predictions)
print(f"Accuracy on test data: {accuracy}")

# Calculate R2 score
# NOTE: r2_score is a regression metric; on class labels it is not a standard
# classification measure. It is kept so the cell's output stays unchanged --
# treat accuracy as the headline figure.
r2 = r2_score(test_y, predictions)
print(f"R2 Score on test data: {r2}")


Accuracy on test data: 1.0
R2 Score on test data: 1.0
