## IMDB Dataset

Izabela Telejko, Tomasz Siudalski, Grzegorz Zbrzeżny 

In [15]:
import pandas as pd
import numpy as np
import torch
import fasttext
from tqdm import tqdm

import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords

nltk.download('stopwords')
nltk.download('punkt_tab')

from datasets import load_dataset
from transformers import BertTokenizerFast
from transformers import BertModel
import gensim.downloader as api

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/telejkoi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/telejkoi/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [4]:
# Load data

# Load the "stanfordnlp/imdb" dataset with `datasets.load_dataset`.
ds = load_dataset("stanfordnlp/imdb")
# Remove the 'unsupervised' split so that only the remaining splits are processed below.
del ds['unsupervised']
# Drop the first 10% of the test split (rows 0-2499) because it contains some
# corrupted examples; only rows 2500-24999 are kept.
# NOTE(review): if the test split is stored sorted by label, removing a
# contiguous prefix changes the class balance of the evaluation set - confirm.
new_test_dataset = ds['test'].select(range(2500, 25000))
ds['test'] = new_test_dataset
# Last expression of the cell: display the resulting DatasetDict.
ds

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 22500
    })
})

### Preprocessing - Tokenize, remove stopwords, apply stemming

In [6]:
# Module-level Porter stemmer instance, read by the tokenization function defined below.
stemmer = PorterStemmer()
# NLTK's English stopword list, converted to a set for membership tests.
stop_words = set(stopwords.words('english'))

def nltk_tokenize_stem_remove_stopwords_batch(batch):
    """Add a 'tokens' entry to ``batch`` with stemmed, stopword-free tokens.

    For every item of ``batch['text']`` the text is tokenized with
    ``word_tokenize``, tokens whose lowercase form is in ``stop_words`` are
    discarded, and the remaining tokens are passed through ``stemmer.stem``.
    Falsy texts (e.g. ``None`` or an empty string) yield an empty token list.

    Args:
        batch: Mapping with a 'text' entry holding one text per example.

    Returns:
        The same ``batch`` object, with the new 'tokens' entry attached.
    """
    token_lists = []
    for text in batch['text']:
        # Missing or empty text: keep the row, but with no tokens.
        if not text:
            token_lists.append([])
            continue

        kept = []
        for word in word_tokenize(str(text)):
            # The stopword check uses the lowercase form of the raw token.
            if word.lower() in stop_words:
                continue
            kept.append(stemmer.stem(word))
        token_lists.append(kept)

    batch['tokens'] = token_lists
    return batch

# Run the tokenization function over every split, in batches of 64 examples,
# and store the mapped dataset back under the same split name.
for split in ds:
    ds[split] = ds[split].map(nltk_tokenize_stem_remove_stopwords_batch, batched=True, batch_size=64)

# Example of a tokenized and stemmed sentence
print(ds['train'][0])

Map: 100%|██████████| 25000/25000 [00:55<00:00, 452.45 examples/s]
Map: 100%|██████████| 22500/22500 [00:49<00:00, 453.97 examples/s]

{'text': 'I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" I really had to see this for myself.<br /><br />The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States. In between asking politicians and ordinary denizens of Stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men.<br /><br />What kills me about I AM CURIOUS-YELLOW is that 40 years ago, this was considered pornographic. Really, the sex and nudity scenes are few and far be




### Embedding with Word2Vec

In [7]:
# Load the 'word2vec-google-news-300' model through gensim's downloader API
# (imported above as `api`).
word2vec_model = api.load('word2vec-google-news-300')

In [8]:
def get_word2vec_embeddings(words):
    """Get Word2Vec embeddings for a list of words.

    Args:
        words: Iterable of tokens to look up in the global ``word2vec_model``.

    Returns:
        A 2-D ``np.ndarray`` with one row per word, in input order. Words that
        are not in the model get an all-zero row. For an empty ``words`` the
        result has shape ``(0, dim)`` (instead of a 1-D empty array), so that
        callers reducing over ``axis=0`` always get a ``dim``-sized result.
    """
    # Prefer the dimensionality reported by the model; fall back to 300, the
    # value this function previously hard-coded.
    dim = getattr(word2vec_model, 'vector_size', 300)

    # One shared zero vector for all out-of-vocabulary words; np.array() below
    # copies it into the result, so sharing it is safe.
    oov_vector = np.zeros(dim)

    embeddings = [
        word2vec_model[word] if word in word2vec_model else oov_vector
        for word in words
    ]

    if not embeddings:
        # np.array([]) would be 1-D with shape (0,); keep the result 2-D.
        return np.zeros((0, dim))
    return np.array(embeddings)

def get_word2vec_embeddings_batch(batch):
    """Get Word2Vec embeddings for a batch of sentences.

    Each sentence is represented by the mean of its word vectors, as returned
    by ``get_word2vec_embeddings``.

    Args:
        batch: Mapping with a 'tokens' entry holding one token list per
            sentence.

    Returns:
        A dict with the single key 'word2vec_embeddings'. Its value is a list
        with exactly one vector per entry of ``batch['tokens']``, in the same
        order. A sentence with no tokens, or one whose embedding fails, is
        represented by an all-zero vector so the output stays aligned with
        the input rows.
    """
    # Prefer the dimensionality reported by the model; fall back to 300.
    dim = getattr(word2vec_model, 'vector_size', 300)
    all_word_embeddings = []

    for tokens in batch['tokens']:
        # Start from the zero vector: it is the result for empty sentences
        # and the fallback when the lookup below fails.
        sentence_embedding = np.zeros(dim)
        try:
            if len(tokens) > 0:
                # Get Word2Vec embeddings for each word
                word_embeddings = get_word2vec_embeddings(tokens)

                # Pool word embeddings by averaging them to get a single sentence vector
                sentence_embedding = np.mean(word_embeddings, axis=0)
        except Exception as e:
            # Best effort: report the problem but still emit a row below, so
            # the returned column never becomes shorter than the batch.
            print(f"Error with tokens: {tokens} - {e}")
        all_word_embeddings.append(sentence_embedding)

    return {'word2vec_embeddings': all_word_embeddings}

# Run the Word2Vec batch function over every split, in batches of 64 examples,
# and store the mapped dataset back under the same split name.
for split in ds:
    ds[split] = ds[split].map(get_word2vec_embeddings_batch, batched=True, batch_size=64)

Map: 100%|██████████| 25000/25000 [00:19<00:00, 1312.39 examples/s]
Map: 100%|██████████| 22500/22500 [00:20<00:00, 1119.06 examples/s]


In [25]:
# Materialize the Word2Vec features and the labels of each split as NumPy arrays.
X_train_w2v, y_train_w2v = (
    np.array(ds['train']['word2vec_embeddings']),
    np.array(ds['train']['label']),
)

X_test_w2v, y_test_w2v = (
    np.array(ds['test']['word2vec_embeddings']),
    np.array(ds['test']['label']),
)

#### Random Forest Classifier

In [26]:
# Train a 100-tree random forest on the Word2Vec features; fit() returns the
# fitted estimator, so construction and training are chained.
rf_classifier_w2v = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train_w2v, y_train_w2v)
y_pred_rf_w2v = rf_classifier_w2v.predict(X_test_w2v)

# Accuracy, then support-weighted precision / recall / F1 on the test split.
accuracy_rf_w2v = accuracy_score(y_test_w2v, y_pred_rf_w2v)
precision_rf_w2v, recall_rf_w2v, f1_rf_w2v = (
    scorer(y_test_w2v, y_pred_rf_w2v, average='weighted')
    for scorer in (precision_score, recall_score, f1_score)
)

print("Random Forest Classifier with Word2Vec:")
print(f"Accuracy: {accuracy_rf_w2v:.4f}")
print(f"Precision: {precision_rf_w2v:.4f}")
print(f"Recall: {recall_rf_w2v:.4f}")
print(f"F1 Score: {f1_rf_w2v:.4f}")

Random Forest Classifier with Word2Vec:
Accuracy: 0.7557
Precision: 0.7593
Recall: 0.7557
F1 Score: 0.7564


#### Gradient Boosting Classifier

In [29]:
# Train gradient boosting (100 stages, learning rate 0.1, depth-3 trees) on the
# Word2Vec features; fit() returns the fitted estimator.
gb_classifier_w2v = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=3,
    random_state=42,
).fit(X_train_w2v, y_train_w2v)
y_pred_gb_w2v = gb_classifier_w2v.predict(X_test_w2v)

# Accuracy, then support-weighted precision / recall / F1 on the test split.
accuracy_gb_w2v = accuracy_score(y_test_w2v, y_pred_gb_w2v)
precision_gb_w2v, recall_gb_w2v, f1_gb_w2v = (
    scorer(y_test_w2v, y_pred_gb_w2v, average='weighted')
    for scorer in (precision_score, recall_score, f1_score)
)

print("Gradient Boosting with Word2Vec:")
print(f"Accuracy: {accuracy_gb_w2v:.4f}")
print(f"Precision: {precision_gb_w2v:.4f}")
print(f"Recall: {recall_gb_w2v:.4f}")
print(f"F1 Score: {f1_gb_w2v:.4f}")

Gradient Boosting with Word2Vec:
Accuracy: 0.7692
Precision: 0.7720
Recall: 0.7692
F1 Score: 0.7698


### Embedding with FastText

In [18]:
# The model file has to be downloaded and unpacked beforehand (shell commands):
# wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.bin.gz
# gunzip cc.en.300.bin.gz

# Load the unpacked model; the relative path means the file is expected in the
# current working directory.
fasttext_model = fasttext.load_model('cc.en.300.bin')

In [19]:
def get_fasttext_embeddings(tokens, fasttext_model):
    """Average the FastText vectors of ``tokens`` into one vector.

    Args:
        tokens: Iterable of tokens to embed.
        fasttext_model: Object providing ``get_word_vector(token)`` and
            ``get_dimension()``.

    Returns:
        The element-wise mean of the token vectors, or a zero vector of
        length ``fasttext_model.get_dimension()`` when there are no tokens.
    """
    vectors = [fasttext_model.get_word_vector(tok) for tok in tokens]

    # Nothing to average: fall back to a zero vector of the model's dimension.
    if not vectors:
        return np.zeros(fasttext_model.get_dimension())

    return np.mean(vectors, axis=0)

# Add a 'fasttext_embeddings' column (one averaged vector per token list) to
# every split. No batch_size is passed here, so the library default is used.
for split in ds:
    ds[split] = ds[split].map(lambda batch: {
        'fasttext_embeddings': [get_fasttext_embeddings(tokens, fasttext_model) for tokens in batch['tokens']]
    }, batched=True)

# Show the first training example with the columns added so far.
print(ds['train'][0])

Map: 100%|██████████| 25000/25000 [00:22<00:00, 1120.24 examples/s]
Map: 100%|██████████| 22500/22500 [00:19<00:00, 1164.27 examples/s]

{'text': 'I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" I really had to see this for myself.<br /><br />The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States. In between asking politicians and ordinary denizens of Stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men.<br /><br />What kills me about I AM CURIOUS-YELLOW is that 40 years ago, this was considered pornographic. Really, the sex and nudity scenes are few and far be




In [20]:
# Materialize the FastText features and the labels of each split as NumPy arrays.
X_train_ft, y_train_ft = (
    np.array(ds['train']['fasttext_embeddings']),
    np.array(ds['train']['label']),
)

X_test_ft, y_test_ft = (
    np.array(ds['test']['fasttext_embeddings']),
    np.array(ds['test']['label']),
)

#### Random Forest Classifier

In [21]:
# Train a 100-tree random forest on the FastText features; fit() returns the
# fitted estimator, so construction and training are chained.
rf_classifier_ft = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train_ft, y_train_ft)
y_pred_rf_ft = rf_classifier_ft.predict(X_test_ft)

# Accuracy, then support-weighted precision / recall / F1 on the test split.
accuracy_rf_ft = accuracy_score(y_test_ft, y_pred_rf_ft)
precision_rf_ft, recall_rf_ft, f1_rf_ft = (
    scorer(y_test_ft, y_pred_rf_ft, average='weighted')
    for scorer in (precision_score, recall_score, f1_score)
)

print("Random Forest Classifier with FastText:")
print(f"Accuracy: {accuracy_rf_ft:.4f}")
print(f"Precision: {precision_rf_ft:.4f}")
print(f"Recall: {recall_rf_ft:.4f}")
print(f"F1 Score: {f1_rf_ft:.4f}")

Random Forest Classifier with FastText:
Accuracy: 0.7202
Precision: 0.7231
Recall: 0.7202
F1 Score: 0.7209


#### Gradient Boosting Classifier

In [22]:
# Train gradient boosting (100 stages, learning rate 0.1, depth-3 trees) on the
# FastText features; fit() returns the fitted estimator.
gb_classifier_ft = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=3,
    random_state=42,
).fit(X_train_ft, y_train_ft)
y_pred_gb_ft = gb_classifier_ft.predict(X_test_ft)

# Accuracy, then support-weighted precision / recall / F1 on the test split.
accuracy_gb_ft = accuracy_score(y_test_ft, y_pred_gb_ft)
precision_gb_ft, recall_gb_ft, f1_gb_ft = (
    scorer(y_test_ft, y_pred_gb_ft, average='weighted')
    for scorer in (precision_score, recall_score, f1_score)
)

print("Gradient Boosting Classifier with FastText:")
print(f"Accuracy: {accuracy_gb_ft:.4f}")
print(f"Precision: {precision_gb_ft:.4f}")
print(f"Recall: {recall_gb_ft:.4f}")
print(f"F1 Score: {f1_gb_ft:.4f}")

Gradient Boosting Classifier with FastText:
Accuracy: 0.7281
Precision: 0.7310
Recall: 0.7281
F1 Score: 0.7288


In [34]:
# Collect the metrics of the four classifier/embedding combinations.
# Row order: RF + Word2Vec, GB + Word2Vec, RF + FastText, GB + FastText.
results = {
    "Classifier": ["Random Forest Classifier", "Gradient Boosting Classifier"] * 2,
    "Embedding Type": ["Word2Vec"] * 2 + ["FastText"] * 2,
    "Accuracy": [accuracy_rf_w2v, accuracy_gb_w2v, accuracy_rf_ft, accuracy_gb_ft],
    "Precision": [precision_rf_w2v, precision_gb_w2v, precision_rf_ft, precision_gb_ft],
    "Recall": [recall_rf_w2v, recall_gb_w2v, recall_rf_ft, recall_gb_ft],
    "F1 Score": [f1_rf_w2v, f1_gb_w2v, f1_rf_ft, f1_gb_ft],
}

# Build the comparison table and display it with the best F1 score first.
results_df = pd.DataFrame(results)
results_df.sort_values(by='F1 Score', ascending=False)

Unnamed: 0,Classifier,Embedding Type,Accuracy,Precision,Recall,F1 Score
1,Gradient Boosting Classifier,Word2Vec,0.7692,0.77203,0.7692,0.769795
0,Random Forest Classifier,Word2Vec,0.755733,0.759272,0.755733,0.756415
3,Gradient Boosting Classifier,FastText,0.728089,0.730995,0.728089,0.728785
2,Random Forest Classifier,FastText,0.720222,0.723094,0.720222,0.720932
