# IMDB Sentiment Analysis

## Authors
1. Jakub Swistak
2. Nikita Kozlov
3. Jacek Zalewski
4. Zosia Lagiewka

## Dataset
We are using the IMDB dataset with a defined split into train/test, which can be found [here](https://huggingface.co/datasets/stanfordnlp/imdb).

## Methods
We will try different methods with embedding-based models.
## Outcome
The outcome will be a metrics for all tested models and data-processing pipelines.


## Introduction
In this notebook, we will perform sentiment analysis on the IMDB dataset using various embedding-based models. The goal is to compare the performance of different models and data-processing pipelines.


In [None]:
!pip install llmware numpy pandas seaborn gensim

In [3]:
# Load iMDB dataset 
#!%pip install transformers datasets torch

import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from xgboost import XGBClassifier
from nltk.tokenize import word_tokenize
import nltk
from textblob import TextBlob
import torch
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from llmware.models import ModelCatalog
from gensim.models import FastText, Word2Vec
nltk.download('punkt')
nltk.download('punkt_tab')



splits = {'train': 'plain_text/train-00000-of-00001.parquet', 'test': 'plain_text/test-00000-of-00001.parquet', 'unsupervised': 'plain_text/unsupervised-00000-of-00001.parquet'}
imdb_dataset = pd.read_parquet("hf://datasets/stanfordnlp/imdb/" + splits["train"])

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/jakubswistak/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/jakubswistak/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [2]:
imdb_dataset.head()

Unnamed: 0,text,label
0,I rented I AM CURIOUS-YELLOW from my video sto...,0
1,"""I Am Curious: Yellow"" is a risible and preten...",0
2,If only to avoid making this type of film in t...,0
3,This film was probably inspired by Godard's Ma...,0
4,"Oh, brother...after hearing about this ridicul...",0


In [4]:
model_scores = pd.DataFrame(columns=["model", "f1", "accuracy", "precision", "recall"])

### TextBlob

In [7]:
def get_sentiment(text):
    blob = TextBlob(text)
    sentiment = blob.sentiment.polarity
    return sentiment

# Convert list to pandas Series to use apply method
imdb_dataset['sentiment_blob'] = imdb_dataset['text'].apply(get_sentiment)
f1_textblob = f1_score(imdb_dataset['label'], imdb_dataset['sentiment_blob'].apply(lambda x: 1 if x > 0 else 0))
accuracy_textblob = accuracy_score(imdb_dataset['label'], imdb_dataset['sentiment_blob'].apply(lambda x: 1 if x > 0 else 0))
precision_textblob = precision_score(imdb_dataset['label'], imdb_dataset['sentiment_blob'].apply(lambda x: 1 if x > 0 else 0))
recall_textblob = recall_score(imdb_dataset['label'], imdb_dataset['sentiment_blob'].apply(lambda x: 1 if x > 0 else 0))

model_scores = pd.concat([model_scores, pd.DataFrame([["TextBlob", f1_textblob, accuracy_textblob, precision_textblob, recall_textblob]], columns=["model", "f1", "accuracy", "precision", "recall"])])

model_scores

  model_scores = pd.concat([model_scores, pd.DataFrame([["TextBlob", f1_textblob, accuracy_textblob, precision_textblob, recall_textblob]], columns=["model", "f1", "accuracy", "precision", "recall"])])


Unnamed: 0,model,f1,accuracy,precision,recall
0,TextBlob,0.750198,0.68516,0.621758,0.94552


### distilbert-base-uncased-finetuned-sst-2-english

In [None]:
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english")
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english")

total = len(imdb_dataset)

current = 0

def get_bert_sentiment(text):
    global current
    inputs = tokenizer(text, return_tensors="pt", max_length=512, truncation=True)
    device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
    model.to(device)
    
    with torch.no_grad():
        inputs = {name: tensor.to(device) for name, tensor in inputs.items()}
        logits = model(**inputs).logits
    predicted_class_id = logits.argmax().item()
    
    print(f"[{current}/{total}] {text[:10]} -> {model.config.id2label[predicted_class_id]}")
    return model.config.id2label[predicted_class_id]

imdb_dataset['sentiment_bert'] = imdb_dataset['text'].apply(get_bert_sentiment).map({'NEGATIVE': 0, 'POSITIVE': 1})

In [16]:
f1_bert = f1_score(imdb_dataset['label'], imdb_dataset['sentiment_bert'])
accuracy_bert = accuracy_score(imdb_dataset['label'], imdb_dataset['sentiment_bert'])
precision_bert = precision_score(imdb_dataset['label'], imdb_dataset['sentiment_bert'])
recall_bert = recall_score(imdb_dataset['label'], imdb_dataset['sentiment_bert'])

model_scores = pd.concat([model_scores, pd.DataFrame([["distilbert-base-uncased-finetuned-sst-2-english", f1_bert, accuracy_bert, precision_bert, recall_bert]], columns=["model", "f1", "accuracy", "precision", "recall"])])

model_scores

Unnamed: 0,model,f1,accuracy,precision,recall
0,TextBlob,0.750198,0.68516,0.621758,0.94552
0,distilbert-base-uncased-finetuned-sst-2-english,0.884697,0.88852,0.916117,0.85536


## Divide the dataset since all of the samles are quite long to run

In [5]:
# split the data into train and test
train, test = train_test_split(imdb_dataset, test_size=0.2, random_state=42)


### Slim sentiment analysis

In [61]:
slim_model = ModelCatalog().load_model("llmware/slim-sentiment")

def get_sentiment_llm(text):
    response = slim_model.function_call(text, params=["sentiment"], function="classify")
    return response

test['sentiment_slim_unprocessed'] = test['text'].apply(get_sentiment_llm)


[37mINFO: update: function call output could not be automatically converted, but remediation was successful to type - dict [39m
[37mINFO: update: function call output could not be automatically converted, but remediation was successful to type - dict [39m


In [None]:
test["sentiment_slim_processed"] = test["sentiment_slim_unprocessed"].apply(lambda x: x['llm_response'])

In [None]:
test["sentiment_slim"] = test["sentiment_slim_processed"].apply(lambda x: 1 if x.get('sentiment', ['negative'])[0] == "positive" else 0)

In [24]:
f1_slim = f1_score(test['label'], test['sentiment_slim'])
accuracy_slim = accuracy_score(test['label'], test['sentiment_slim'])
precision_slim = precision_score(test['label'], test['sentiment_slim'])
recall_slim = recall_score(test['label'], test['sentiment_slim'])

model_scores = pd.concat([model_scores, pd.DataFrame([["slim-sentiment", f1_slim, accuracy_slim, precision_slim, recall_slim]], columns=["model", "f1", "accuracy", "precision", "recall"])])

model_scores

Unnamed: 0,model,f1,accuracy,precision,recall
0,TextBlob,0.750198,0.68516,0.621758,0.94552
0,distilbert-base-uncased-finetuned-sst-2-english,0.884697,0.88852,0.916117,0.85536
0,slim-sentiment,0.901526,0.9006,0.887978,0.915493


### Vader

In [45]:
nltk.download('vader_lexicon')

def get_sentiment_vader(text):
    from nltk.sentiment.vader import SentimentIntensityAnalyzer
    sid = SentimentIntensityAnalyzer()
    return sid.polarity_scores(text)['compound']

test['sentiment_vader'] = test['text'].apply(get_sentiment_vader)

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/nk2/nltk_data...


In [46]:
f1_vader = f1_score(test['label'], test['sentiment_vader'].apply(lambda x: 1 if x > 0 else 0))
accuracy_vader = accuracy_score(test['label'], test['sentiment_vader'].apply(lambda x: 1 if x > 0 else 0))
precision_vader = precision_score(test['label'], test['sentiment_vader'].apply(lambda x: 1 if x > 0 else 0))
recall_vader = recall_score(test['label'], test['sentiment_vader'].apply(lambda x: 1 if x > 0 else 0))

model_scores = pd.concat([model_scores, pd.DataFrame([["Vader", f1_vader, accuracy_vader, precision_vader, recall_vader]], columns=["model", "f1", "accuracy", "precision", "recall"])])
model_scores

Unnamed: 0,model,f1,accuracy,precision,recall
0,TextBlob,0.750198,0.68516,0.621758,0.94552
0,distilbert-base-uncased-finetuned-sst-2-english,0.884697,0.88852,0.916117,0.85536
0,slim-sentiment,0.901526,0.9006,0.887978,0.915493
0,Vader,0.733529,0.6918,0.643117,0.853521


In [7]:
def tokenize(text):
    return word_tokenize(text.lower())

imdb_dataset['nltk_tokens'] = imdb_dataset['text'].apply(tokenize)

X_train, X_test, y_train, y_test = train_test_split(imdb_dataset['nltk_tokens'], imdb_dataset['label'], test_size=0.2, random_state=42)

word2vec_model = Word2Vec(sentences=X_train, vector_size=100, window=5, min_count=1, workers=4)

def vectorize_sentence(sentence, model):
    valid_words = [word for word in sentence if word in model.wv.key_to_index]
    if len(valid_words) == 0:
        return np.zeros(model.vector_size)
    return np.mean([model.wv[word] for word in valid_words], axis=0)

X_train_vectors = np.array([vectorize_sentence(sentence, word2vec_model) for sentence in X_train])
X_test_vectors = np.array([vectorize_sentence(sentence, word2vec_model) for sentence in X_test])

classifiers = {
    "Logistic Regression": LogisticRegression(),
    "AdaBoost": AdaBoostClassifier(),
    "SVM": SVC(probability=True),
    "Random Forest": RandomForestClassifier(),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss')
}


for model_name, classifier in classifiers.items():
    classifier.fit(X_train_vectors, y_train)
    y_pred = classifier.predict(X_test_vectors)
    
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    
    model_scores = pd.concat([model_scores, pd.DataFrame([[f"Word2Vec + {model_name}", f1, accuracy, precision, recall]], columns=["model", "f1", "accuracy", "precision", "recall"])])
model_scores



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
Parameters: { "use_label_encoder" } are not used.



Unnamed: 0,model,f1,accuracy,precision,recall
0,TextBlob,0.750198,0.68516,0.621758,0.94552
0,distilbert-base-uncased-finetuned-sst-2-english,0.884697,0.88852,0.916117,0.85536
0,slim-sentiment,0.901526,0.9006,0.887978,0.915493
0,Vader,0.733529,0.6918,0.643117,0.853521
0,Word2Vec + Logistic Regression,0.815889,0.8146,0.80549,0.826559
0,Word2Vec + AdaBoost,0.754498,0.7544,0.749702,0.759356
0,Word2Vec + SVM,0.809609,0.8082,0.798981,0.820523
0,Word2Vec + Random Forest,0.763702,0.7646,0.762019,0.765392
0,Word2Vec + XGBoost,0.797777,0.7962,0.787001,0.808853


In [8]:
fasttext_model = FastText(sentences=X_train, vector_size=100, window=5, min_count=1, workers=4)

def vectorize_sentence(sentence, model):
    valid_words = [word for word in sentence if word in model.wv.key_to_index]
    if len(valid_words) == 0:
        return np.zeros(model.vector_size)
    return np.mean([model.wv[word] for word in valid_words], axis=0)

X_train_vectors = np.array([vectorize_sentence(sentence, fasttext_model) for sentence in X_train])
X_test_vectors = np.array([vectorize_sentence(sentence, fasttext_model) for sentence in X_test])

classifiers = {
    "Logistic Regression": LogisticRegression(),
    "AdaBoost": AdaBoostClassifier(),
    "SVM": SVC(probability=True),
    "Random Forest": RandomForestClassifier(),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss')
}


for model_name, classifier in classifiers.items():
    classifier.fit(X_train_vectors, y_train)
    
    y_pred = classifier.predict(X_test_vectors)
    
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    
    model_scores = pd.concat([model_scores, pd.DataFrame([[f"FastText + {model_name}", f1, accuracy, precision, recall]], 
                                                         columns=["model", "f1", "accuracy", "precision", "recall"])], ignore_index=True)
    

model_scores

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
Parameters: { "use_label_encoder" } are not used.



Unnamed: 0,model,f1,accuracy,precision,recall
0,TextBlob,0.750198,0.68516,0.621758,0.94552
1,distilbert-base-uncased-finetuned-sst-2-english,0.884697,0.88852,0.916117,0.85536
2,slim-sentiment,0.901526,0.9006,0.887978,0.915493
3,Vader,0.733529,0.6918,0.643117,0.853521
4,Word2Vec + Logistic Regression,0.815889,0.8146,0.80549,0.826559
5,Word2Vec + AdaBoost,0.754498,0.7544,0.749702,0.759356
6,Word2Vec + SVM,0.809609,0.8082,0.798981,0.820523
7,Word2Vec + Random Forest,0.763702,0.7646,0.762019,0.765392
8,Word2Vec + XGBoost,0.797777,0.7962,0.787001,0.808853
9,FastText + Logistic Regression,0.799682,0.7984,0.789949,0.809658


In [9]:
def get_dimension_size(line):
    size = 0
    l_split = line.strip().split()
    for i in l_split:
        try:
            _ = float(i)
            size = size + 1
        except:
            pass
    return size

def get_embeddings(file):
    embs = dict()
    firstLine = open(file, 'r').readline()
    dimension = get_dimension_size(firstLine)
    for l in open(file, 'r').readlines():
        l_split = l.strip().split()
        if len(l_split) == 2:
            continue
        emb = l_split[-1 * dimension:]
        word = l_split[:-1 * dimension]
        word = ''.join(word)
        embs[word] = [float(em) for em in emb]
    print("Got {} embeddings from {}".format(len(embs), file))
    return embs

glove_file = 'glove840B300d.txt' 
glove_embeddings = get_embeddings(glove_file)

def vectorize_sentence(sentence, embeddings, vector_size=300):
    valid_words = [word for word in sentence if word in embeddings]
    if len(valid_words) == 0:
        return np.zeros(vector_size)
    return np.mean([embeddings[word] for word in valid_words], axis=0)

X_train_vectors = np.array([vectorize_sentence(sentence, glove_embeddings) for sentence in X_train])
X_test_vectors = np.array([vectorize_sentence(sentence, glove_embeddings) for sentence in X_test])

classifiers = {
    "Logistic Regression": LogisticRegression(),
    "AdaBoost": AdaBoostClassifier(),
    "SVM": SVC(probability=True),
    "Random Forest": RandomForestClassifier(),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss')
}


for model_name, classifier in classifiers.items():
    classifier.fit(X_train_vectors, y_train)
    y_pred = classifier.predict(X_test_vectors)
    
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    
    model_scores = pd.concat([model_scores, pd.DataFrame([[f"GloVe + {model_name}", f1, accuracy, precision, recall]], 
                                                         columns=["model", "f1", "accuracy", "precision", "recall"])], ignore_index=True)

model_scores

Got 2195892 embeddings from glove840B300d.txt


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
Parameters: { "use_label_encoder" } are not used.



Unnamed: 0,model,f1,accuracy,precision,recall
0,TextBlob,0.750198,0.68516,0.621758,0.94552
1,distilbert-base-uncased-finetuned-sst-2-english,0.884697,0.88852,0.916117,0.85536
2,slim-sentiment,0.901526,0.9006,0.887978,0.915493
3,Vader,0.733529,0.6918,0.643117,0.853521
4,Word2Vec + Logistic Regression,0.815889,0.8146,0.80549,0.826559
5,Word2Vec + AdaBoost,0.754498,0.7544,0.749702,0.759356
6,Word2Vec + SVM,0.809609,0.8082,0.798981,0.820523
7,Word2Vec + Random Forest,0.763702,0.7646,0.762019,0.765392
8,Word2Vec + XGBoost,0.797777,0.7962,0.787001,0.808853
9,FastText + Logistic Regression,0.799682,0.7984,0.789949,0.809658
