## **Comparing Word Embeddings for IMDB Sentiment Analysis**
---

### **Submitted By:**
- **Gizachew Bayness Kassa**

---

### ** Task Explanation:**

- This notebook compares the performance of various word embeddings (Word2Vec, FastText, GloVe, and BERT) on the IMDB sentiment analysis dataset.
- Due to computational constraints on a CPU-only system, the BERT model is evaluated on a stratified subset of 5000 samples from the training and testing datasets.
- Processing the full IMDB dataset with BERT embeddings on a CPU previously caused system freezes.
- Word2Vec, FastText, and GloVe embeddings use the full dataset as they are less resource-intensive compared to BERT.

---


In [14]:
# Import necessary libraries
import numpy as np
import pandas as pd
from datasets import load_dataset
import gensim.downloader as api
import re
from transformers import AutoTokenizer, AutoModel
import torch
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk


In [15]:
# Download NLTK data
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /home/gizachew/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/gizachew/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [16]:
# Load IMDB dataset
dataset = load_dataset("stanfordnlp/imdb")
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})

In [17]:
# Split the dataset into training and testing sets
train_texts = dataset['train']['text']
train_labels = dataset['train']['label']
test_texts = dataset['test']['text']
test_labels = dataset['test']['label']

In [18]:
# Preprocessing using NLTK
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """
    Preprocesses text by removing stopwords, lowercasing, and tokenizing.

    Args:
        text (str): The text to preprocess.

    Returns:
        str: Preprocessed text.
    """
    text = text.lower()
    text = re.sub(r"[^a-zA-Z0-9\s]", "", text)  # Remove special characters
    tokens = word_tokenize(text)  # Tokenize
    tokens = [word for word in tokens if word not in stop_words]  # Remove stopwords
    return " ".join(tokens)

# Apply preprocessing to training and testing texts
train_texts = [preprocess_text(text) for text in train_texts]
test_texts = [preprocess_text(text) for text in test_texts]

In [30]:
# Stratified sampling to ensure balanced classes
def create_stratified_subset(texts, labels, size=1000):
    """
    Creates a stratified subset of the data.

    Args:
        texts (list): The list of texts.
        labels (list): The corresponding labels.
        size (int): Size of the subset.

    Returns:
        tuple: Stratified texts and labels.
    """
    stratified_texts, _, stratified_labels, _ = train_test_split(
        texts,
        labels,
        train_size=size,
        stratify=labels,
        random_state=42
    )
    return stratified_texts, stratified_labels

# Create stratified subsets for BERT
train_texts_bert, train_labels_bert = create_stratified_subset(train_texts, train_labels, size=5000)
test_texts_bert, test_labels_bert = create_stratified_subset(test_texts, test_labels, size=5000)

In [21]:
# Embedding functions
def embed_word2vec(texts, model):
    """
    Embeds text using Word2Vec embeddings.

    Args:
        texts (list of str): The list of preprocessed texts.
        model: The pre-trained Word2Vec model.

    Returns:
        np.ndarray: The array of text embeddings.
    """
    embeddings = []
    for text in texts:
        words = text.split()
        vectors = [model[word] for word in words if word in model]
        if vectors:
            embeddings.append(np.mean(vectors, axis=0))
        else:
            embeddings.append(np.zeros(model.vector_size))
    return np.array(embeddings)

In [22]:
def embed_fasttext(texts, model):
    """
    Embeds text using FastText embeddings.

    Args:
        texts (list of str): The list of preprocessed texts.
        model: The pre-trained FastText model.

    Returns:
        np.ndarray: The array of text embeddings.
    """
    embeddings = []
    for text in texts:
        words = text.split()
        vectors = [model[word] for word in words if word in model]
        if vectors:
            embeddings.append(np.mean(vectors, axis=0))
        else:
            embeddings.append(np.zeros(model.vector_size))
    return np.array(embeddings)

In [24]:
def embed_glove(texts, model):
    """
    Embeds text using GloVe embeddings.

    Args:
        texts (list of str): The list of preprocessed texts.
        model: The pre-trained GloVe model.

    Returns:
        np.ndarray: The array of text embeddings.
    """
    embeddings = []
    for text in texts:
        words = text.split()
        vectors = [model[word] for word in words if word in model]
        if vectors:
            embeddings.append(np.mean(vectors, axis=0))
        else:
            embeddings.append(np.zeros(model.vector_size))
    return np.array(embeddings)

In [25]:
# Load pre-trained Word2Vec, FastText, and GloVe
word2vec = api.load("word2vec-google-news-300")
fasttext = api.load("fasttext-wiki-news-subwords-300")
glove = api.load("glove-wiki-gigaword-300")

# Generate embeddings
X_train_w2v = embed_word2vec(train_texts, word2vec)
X_test_w2v = embed_word2vec(test_texts, word2vec)
X_train_ft = embed_fasttext(train_texts, fasttext)
X_test_ft = embed_fasttext(test_texts, fasttext)
X_train_glove = embed_glove(train_texts, glove)
X_test_glove = embed_glove(test_texts, glove)

In [26]:
# Configure BERT for CPU
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModel.from_pretrained("bert-base-uncased").to('cpu')  # Force BERT to use CPU

def embed_bert(texts, tokenizer, model):
    """
    Embeds text using BERT embeddings.

    Args:
        texts (list of str): The list of texts to embed.
        tokenizer: The BERT tokenizer.
        model: The BERT model.

    Returns:
        np.ndarray: The array of BERT embeddings.
    """
    embeddings = []
    for text in texts:
        inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
        inputs = {key: value.to('cpu') for key, value in inputs.items()}  # Ensure inputs are on CPU
        with torch.no_grad():
            outputs = model(**inputs)
        embeddings.append(outputs.last_hidden_state[:, 0, :].cpu().numpy())
    return np.concatenate(embeddings, axis=0)

In [31]:
# Generate BERT embeddings on balanced subset
X_train_bert = embed_bert(train_texts_bert, tokenizer, model)
X_test_bert = embed_bert(test_texts_bert, tokenizer, model)

In [32]:
# Train and evaluate models
def evaluate(y_true, y_pred):
    """
    Evaluates predictions with accuracy, precision, recall, and F1 score.

    Args:
        y_true (list): True labels.
        y_pred (list): Predicted labels.

    Returns:
        tuple: Accuracy, precision, recall, and F1 score.
    """
    precision, recall, f1, _ = precision_recall_fscore_support(y_true, y_pred, average="binary")
    accuracy = accuracy_score(y_true, y_pred)
    return accuracy, precision, recall, f1

results = []

# Train and evaluate logistic regression models
for name, X_train, X_test, y_train, y_test in [
    ("Word2Vec", X_train_w2v, X_test_w2v, train_labels, test_labels),
    ("FastText", X_train_ft, X_test_ft, train_labels, test_labels),
    ("GloVe", X_train_glove, X_test_glove, train_labels, test_labels),
    ("BERT", X_train_bert, X_test_bert, train_labels_bert, test_labels_bert),
]:
    clf = LogisticRegression(max_iter=1000)
    clf.fit(X_train, y_train[:len(X_train)])  # Align labels with data
    y_pred = clf.predict(X_test)
    acc, prec, rec, f1 = evaluate(y_test[:len(y_pred)], y_pred)
    results.append({"Embedding": name, "Accuracy": acc, "Precision": prec, "Recall": rec, "F1-Score": f1})

In [None]:
# print the used model
print('The used model is:Logistic Regression')
# Generate a comparison table
df_results = pd.DataFrame(results)
print(df_results)

  Embedding  Accuracy  Precision   Recall  F1-Score
0  Word2Vec   0.85260   0.855071  0.84912  0.852085
1  FastText   0.81284   0.818211  0.80440  0.811247
2     GloVe   0.83104   0.830828  0.83136  0.831094
3      BERT   0.80860   0.814513  0.79920  0.806784
