In [1]:
import pandas as pd
import numpy as np
import re
import string
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from sklearn.linear_model import LogisticRegression
from gensim.models import Word2Vec, FastText
import gensim.downloader as api
import gc

In [2]:
import pandas as pd

splits = {'train': 'plain_text/train-00000-of-00001.parquet', 'test': 'plain_text/test-00000-of-00001.parquet', 'unsupervised': 'plain_text/unsupervised-00000-of-00001.parquet'}
df = pd.read_parquet("hf://datasets/stanfordnlp/imdb/" + splits["train"])
df_test = pd.read_parquet("hf://datasets/stanfordnlp/imdb/" + splits["test"])

  from .autonotebook import tqdm as notebook_tqdm


In [3]:

def preprocess(text):
    text = text.lower()
    text = re.sub(f"[{string.punctuation}]", "", text)
    return text

def tokenize(text):
    return text.split()

def load_glove_vectors(vocab, glove_model):
    return {word: glove_model[word] for word in vocab if word in glove_model}

def vectorize(text, model):
    vec = np.mean([model[word] for word in text if word in model], axis=0)
    return vec if vec is not np.nan else np.zeros(100)

In [4]:

# Apply preprocessing
df['text'] = df['text'].apply(preprocess)
df_test['text'] = df_test['text'].apply(preprocess)

X_train, X_val, y_train, y_val = train_test_split(df['text'], df['label'], test_size=0.2, random_state=42)


X_train = X_train.apply(tokenize)
X_val = X_val.apply(tokenize)
X_test = df_test['text'].apply(tokenize)



In [5]:

w2v_model = Word2Vec(sentences=X_train, vector_size=100, window=5, min_count=5, workers=4)

X_train_w2v = np.array([vectorize(sentence, w2v_model.wv) for sentence in X_train])
X_val_w2v = np.array([vectorize(sentence, w2v_model.wv) for sentence in X_val])
X_test_w2v = np.array([vectorize(sentence, w2v_model.wv) for sentence in X_test])

clf_w2v = LogisticRegression(solver='saga')
clf_w2v.fit(X_train_w2v, y_train)
y_pred_w2v = clf_w2v.predict(X_val_w2v)

precision_w2v = precision_score(y_val, y_pred_w2v)
recall_w2v = recall_score(y_val, y_pred_w2v)
f1_w2v = f1_score(y_val, y_pred_w2v)
accuracy_w2v = accuracy_score(y_val, y_pred_w2v)



In [6]:


ft_model = FastText(sentences=X_train, vector_size=100, window=5, min_count=5, workers=4)

X_train_ft = np.array([vectorize(sentence, ft_model.wv) for sentence in X_train])
X_val_ft = np.array([vectorize(sentence, ft_model.wv) for sentence in X_val])
X_test_ft = np.array([vectorize(sentence, ft_model.wv) for sentence in X_test])

clf_ft = LogisticRegression(solver='saga')
clf_ft.fit(X_train_ft, y_train)
y_pred_ft = clf_ft.predict(X_val_ft)

precision_ft = precision_score(y_val, y_pred_ft)
recall_ft = recall_score(y_val, y_pred_ft)
f1_ft = f1_score(y_val, y_pred_ft)
accuracy_ft = accuracy_score(y_val, y_pred_ft)



In [7]:
vocab = set(word for sentence in X_train for word in sentence)
glove_vectors = load_glove_vectors(vocab, api.load("glove-wiki-gigaword-100"))


X_train_glove = np.array([vectorize(sentence, glove_vectors) for sentence in X_train])
X_val_glove = np.array([vectorize(sentence, glove_vectors) for sentence in X_val])
X_test_glove = np.array([vectorize(sentence, glove_vectors) for sentence in X_test])

clf_glove = LogisticRegression(solver='saga')
clf_glove.fit(X_train_glove, y_train)
y_pred_glove = clf_glove.predict(X_val_glove)

precision_glove = precision_score(y_val, y_pred_glove)
recall_glove = recall_score(y_val, y_pred_glove)
f1_glove = f1_score(y_val, y_pred_glove)
accuracy_glove = accuracy_score(y_val, y_pred_glove)

In [8]:


results = pd.DataFrame({
    'Model': ['Word2Vec', 'FastText', 'GloVe'],
    'Precision': [precision_w2v, precision_ft, precision_glove],
    'Recall': [recall_w2v, recall_ft, recall_glove],
    'F1-Score': [f1_w2v, f1_ft, f1_glove],
    'Accuracy': [accuracy_w2v, accuracy_ft, accuracy_glove]
})

print(results)

del w2v_model, ft_model, clf_w2v, clf_ft, clf_glove, X_train_w2v, X_val_w2v, X_test_w2v
gc.collect()


      Model  Precision    Recall  F1-Score  Accuracy
0  Word2Vec   0.810758  0.830986  0.820747    0.8196
1  FastText   0.808864  0.822535  0.815642    0.8152
2     GloVe   0.790985  0.797988  0.794471    0.7948


86

# same but with parameter tuning:

In [9]:
from sklearn.model_selection import GridSearchCV

w2v_params = {'vector_size': [100, 200], 'window': [5, 10]}
ft_params = {'vector_size': [100, 200], 'window': [5, 10]}

def train_evaluate_w2v(params):
    model = Word2Vec(sentences=X_train, vector_size=params['vector_size'], window=params['window'], min_count=5, workers=4)
    X_train_vec = np.array([vectorize(sentence, model.wv) for sentence in X_train])
    X_val_vec = np.array([vectorize(sentence, model.wv) for sentence in X_val])
    clf = LogisticRegression(solver='saga')
    clf.fit(X_train_vec, y_train)
    y_pred = clf.predict(X_val_vec)
    return accuracy_score(y_val, y_pred)

def train_evaluate_ft(params):
    model = FastText(sentences=X_train, vector_size=params['vector_size'], window=params['window'], min_count=5, workers=4)
    X_train_vec = np.array([vectorize(sentence, model.wv) for sentence in X_train])
    X_val_vec = np.array([vectorize(sentence, model.wv) for sentence in X_val])
    clf = LogisticRegression(solver='saga')
    clf.fit(X_train_vec, y_train)
    y_pred = clf.predict(X_val_vec)
    return accuracy_score(y_val, y_pred)

best_w2v_params = None
best_w2v_score = 0
for vector_size in w2v_params['vector_size']:
    for window in w2v_params['window']:
        params = {'vector_size': vector_size, 'window': window}
        score = train_evaluate_w2v(params)
        if score > best_w2v_score:
            best_w2v_score = score
            best_w2v_params = params

best_ft_params = None
best_ft_score = 0
for vector_size in ft_params['vector_size']:
    for window in ft_params['window']:
        params = {'vector_size': vector_size, 'window': window}
        score = train_evaluate_ft(params)
        if score > best_ft_score:
            best_ft_score = score
            best_ft_params = params

print(f"Best Word2Vec params: {best_w2v_params}, Score: {best_w2v_score}")
print(f"Best FastText params: {best_ft_params}, Score: {best_ft_score}")

w2v_model = Word2Vec(sentences=X_train, vector_size=best_w2v_params['vector_size'], window=best_w2v_params['window'], min_count=5, workers=4)
ft_model = FastText(sentences=X_train, vector_size=best_ft_params['vector_size'], window=best_ft_params['window'], min_count=5, workers=4)

X_train_w2v = np.array([vectorize(sentence, w2v_model.wv) for sentence in X_train])
X_val_w2v = np.array([vectorize(sentence, w2v_model.wv) for sentence in X_val])
X_test_w2v = np.array([vectorize(sentence, w2v_model.wv) for sentence in X_test])

X_train_ft = np.array([vectorize(sentence, ft_model.wv) for sentence in X_train])
X_val_ft = np.array([vectorize(sentence, ft_model.wv) for sentence in X_val])
X_test_ft = np.array([vectorize(sentence, ft_model.wv) for sentence in X_test])

clf_w2v = LogisticRegression(solver='saga')
clf_w2v.fit(X_train_w2v, y_train)
y_pred_w2v = clf_w2v.predict(X_val_w2v)

clf_ft = LogisticRegression(solver='saga')
clf_ft.fit(X_train_ft, y_train)
y_pred_ft = clf_ft.predict(X_val_ft)

precision_w2v = precision_score(y_val, y_pred_w2v)
recall_w2v = recall_score(y_val, y_pred_w2v)
f1_w2v = f1_score(y_val, y_pred_w2v)
accuracy_w2v = accuracy_score(y_val, y_pred_w2v)

precision_ft = precision_score(y_val, y_pred_ft)
recall_ft = recall_score(y_val, y_pred_ft)
f1_ft = f1_score(y_val, y_pred_ft)
accuracy_ft = accuracy_score(y_val, y_pred_ft)

results = pd.DataFrame({
    'Model': ['Word2Vec', 'FastText'],
    'Precision': [precision_w2v, precision_ft],
    'Recall': [recall_w2v, recall_ft],
    'F1-Score': [f1_w2v, f1_ft],
    'Accuracy': [accuracy_w2v, accuracy_ft]
})

print(results)



Best Word2Vec params: {'vector_size': 200, 'window': 10}, Score: 0.8436
Best FastText params: {'vector_size': 200, 'window': 10}, Score: 0.8262




      Model  Precision    Recall  F1-Score  Accuracy
0  Word2Vec   0.833929  0.844668  0.839264    0.8392
1  FastText   0.820492  0.831388  0.825904    0.8258




In [10]:
# Display the results DataFrame
print(results)

      Model  Precision    Recall  F1-Score  Accuracy
0  Word2Vec   0.833929  0.844668  0.839264    0.8392
1  FastText   0.820492  0.831388  0.825904    0.8258
