In [47]:
# Wasielewska Maja
# Jakub Kubacki

import numpy as np
import pandas as pd
import tensorflow as tf
import re
import nltk
from datasets import load_dataset
from sklearn.metrics import precision_recall_fscore_support
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from gensim.models import Word2Vec, FastText, KeyedVectors
from transformers import BertTokenizer, TFBertModel
from gensim.models import KeyedVectors
from google.colab import files
from tqdm import tqdm
from sklearn.svm import SVC
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [55]:
# Preprocessing

nltk.download('stopwords')
nltk.download('wordnet')

lemmatizer = WordNetLemmatizer()

stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    tokens = text.lower().split()

    tokens = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]

    return ' '.join(tokens)

X_train_cleaned = [preprocess_text(text) for text in X_train]
X_test_cleaned = [preprocess_text(text) for text in X_test]

def evaluate_model(model, X_test, y_test):
    y_pred = model.predict(X_test)
    y_pred = np.round(y_pred).astype(int)
    precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred, average='binary')
    return precision, recall, f1

dataset = load_dataset("imdb")

split_dataset = dataset['train'].train_test_split(test_size=0.2)
train_data = split_dataset['train']
test_data = split_dataset['test']

X_train = [item['text'] for item in train_data]
y_train = [item['label'] for item in train_data]
X_test = [item['text'] for item in test_data]
y_test = [item['label'] for item in test_data]

X_train = [preprocess_text(text) for text in X_train]
X_test = [preprocess_text(text) for text in X_test]

le = LabelEncoder()
y_train = le.fit_transform(y_train)
y_test = le.transform(y_test)

results_table = pd.DataFrame(columns=["Model", "Precision", "Recall", "F1 Score"])

print(X_train_cleaned[:5])

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


['take say lightly ive seen many many film reviewed great deal print tell film single funniest scene ever seen movie might want listen lot diversity opinion make incredibly stupid movie funny didnt get well cant blame much scene speak come minute mark involves dead convict shackled john candy point found film dumb confusing beginning lose scene came laughed hard peed pant movie ever done project began going berserk supposed sctv movie remember announced time went cast whittled john candy joe flaherty eugene levy also must regime change universal shot upon released shown nearly zero theater watching second time listened theme song actually flaunt incomprehensible plot lyric relaxed logic nerve figured going aside aforementioned routine going berserk many hilarious scene recommend almost stooge flick except much funnier director david steinberg razor sharp timing must laughing candy basically charge never funnier plot device explanatory scene thrown window absolutely run wild flaherty le

In [54]:
## 1. Word2Vec Embeddings + Logistic Regression
print("Training Word2Vec model...")
word2vec_model = Word2Vec([x.split() for x in X_train], vector_size=100, window=5, min_count=2, workers=4)
word2vec_vectors = word2vec_model.wv

def get_word2vec_embedding(text):
    tokens = text.split()
    return np.mean([word2vec_vectors[word] for word in tokens if word in word2vec_vectors], axis=0)

X_train_word2vec = np.array([get_word2vec_embedding(text) for text in X_train])
X_test_word2vec = np.array([get_word2vec_embedding(text) for text in X_test])

# Logistic Regression on Word2Vec embeddings
clf_word2vec = LogisticRegression(max_iter=1000)
clf_word2vec.fit(X_train_word2vec, y_train)

precision, recall, f1 = evaluate_model(clf_word2vec, X_test_word2vec, y_test)
new_row = pd.DataFrame({"Model": ["Word2Vec + Logistic Regression"], "Precision": [precision], "Recall": [recall], "F1 Score": [f1]})
results_table = pd.concat([results_table, new_row], ignore_index=True)

## 2. FastText Embeddings + Logistic Regression
print("Training FastText model...")
fasttext_model = FastText([x.split() for x in X_train], vector_size=100, window=5, min_count=2, workers=4)

def get_fasttext_embedding(text):
    tokens = text.split()
    return np.mean([fasttext_model.wv[word] for word in tokens if word in fasttext_model.wv], axis=0)

X_train_fasttext = np.array([get_fasttext_embedding(text) for text in X_train])
X_test_fasttext = np.array([get_fasttext_embedding(text) for text in X_test])

# Logistic Regression on FastText embeddings
clf_fasttext = LogisticRegression(max_iter=1000)
clf_fasttext.fit(X_train_fasttext, y_train)

precision, recall, f1 = evaluate_model(clf_fasttext, X_test_fasttext, y_test)
new_row = pd.DataFrame({"Model": ["FastText + Logistic Regression"], "Precision": [precision], "Recall": [recall], "F1 Score": [f1]})
results_table = pd.concat([results_table, new_row], ignore_index=True)

## 3. GloVe Embeddings + Logistic Regression

# Path to GloVe file on Google Drive
glove_path = '/content/drive/My Drive/glove.6B.100d.txt'

# Step 1: Count the number of lines (words) in the GloVe file
def count_lines(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        return sum(1 for _ in f)

# Step 2: Preallocate KeyedVectors
num_words = count_lines(glove_path)
vector_size = 100  # GloVe 100d has 100-dimensional vectors

# Preallocate KeyedVectors with the number of words and vector size
glove_vectors = KeyedVectors(vector_size=vector_size, count=num_words)

# Step 3: Load GloVe vectors with progress bar
print("Loading GloVe embeddings...")

# Using tqdm to show the progress bar while loading the GloVe file
with open(glove_path, 'r', encoding='utf-8') as f:
    batch = []
    for line in tqdm(f, total=num_words, desc="Processing GloVe Vectors"):
        values = line.split()
        word = values[0]
        vector = np.asarray(values[1:], dtype='float32')
        batch.append((word, vector))

    glove_vectors.add_vectors([word for word, vector in batch], np.array([vector for word, vector in batch]))

# Normalize vectors
glove_vectors.fill_norms()

# Function to get GloVe embeddings for a given text
def get_glove_embedding(text):
    tokens = text.split()
    vectors = [glove_vectors[word] for word in tokens if word in glove_vectors]

    if len(vectors) > 0:
        return np.mean(vectors, axis=0)
    else:
        return np.zeros(vector_size)  # Return a zero vector if no valid words are found

X_train_glove = np.array([get_glove_embedding(text) for text in tqdm(X_train, desc="Processing Training Set")])
X_test_glove = np.array([get_glove_embedding(text) for text in tqdm(X_test, desc="Processing Test Set")])

# Logistic Regression on GloVe embeddings
clf_glove = LogisticRegression(max_iter=1000)
clf_glove.fit(X_train_glove, y_train)

precision, recall, f1 = evaluate_model(clf_glove, X_test_glove, y_test)
new_row = pd.DataFrame({"Model": ["GloVe + Logistic Regression"], "Precision": [precision], "Recall": [recall], "F1 Score": [f1]})
results_table = pd.concat([results_table, new_row], ignore_index=True)

# Show final results table
print(results_table)

Training Word2Vec model...


  results_table = pd.concat([results_table, new_row], ignore_index=True)


Training FastText model...
Loading GloVe embeddings...


Processing GloVe Vectors: 100%|██████████| 400001/400001 [00:37<00:00, 10548.13it/s]
Processing Training Set: 100%|██████████| 20000/20000 [00:28<00:00, 709.31it/s] 
Processing Test Set: 100%|██████████| 5000/5000 [00:08<00:00, 565.38it/s]


                            Model  Precision    Recall  F1 Score
0  Word2Vec + Logistic Regression   0.824602  0.854443  0.839258
1  FastText + Logistic Regression   0.811866  0.836349  0.823926
2     GloVe + Logistic Regression   0.778750  0.786892  0.782800


In [56]:
## 1. Word2Vec Embeddings + SVM
print("Training Word2Vec model...")
word2vec_model = Word2Vec([x.split() for x in X_train], vector_size=100, window=5, min_count=2, workers=4)
word2vec_vectors = word2vec_model.wv

def get_word2vec_embedding(text):
    tokens = text.split()
    return np.mean([word2vec_vectors[word] for word in tokens if word in word2vec_vectors], axis=0)

X_train_word2vec = np.array([get_word2vec_embedding(text) for text in X_train])
X_test_word2vec = np.array([get_word2vec_embedding(text) for text in X_test])

# SVM on Word2Vec embeddings
clf_word2vec = SVC(kernel='linear', max_iter=1000)  # Używamy SVM z liniowym jądrem
clf_word2vec.fit(X_train_word2vec, y_train)

precision, recall, f1 = evaluate_model(clf_word2vec, X_test_word2vec, y_test)
new_row = pd.DataFrame({"Model": ["Word2Vec + SVM"], "Precision": [precision], "Recall": [recall], "F1 Score": [f1]})
results_table = pd.concat([results_table, new_row], ignore_index=True)

## 2. FastText Embeddings + SVM
print("Training FastText model...")
fasttext_model = FastText([x.split() for x in X_train], vector_size=100, window=5, min_count=2, workers=4)

def get_fasttext_embedding(text):
    tokens = text.split()
    return np.mean([fasttext_model.wv[word] for word in tokens if word in fasttext_model.wv], axis=0)

X_train_fasttext = np.array([get_fasttext_embedding(text) for text in X_train])
X_test_fasttext = np.array([get_fasttext_embedding(text) for text in X_test])

# SVM on FastText embeddings
clf_fasttext = SVC(kernel='linear', max_iter=1000)  # Używamy SVM z liniowym jądrem
clf_fasttext.fit(X_train_fasttext, y_train)

precision, recall, f1 = evaluate_model(clf_fasttext, X_test_fasttext, y_test)
new_row = pd.DataFrame({"Model": ["FastText + SVM"], "Precision": [precision], "Recall": [recall], "F1 Score": [f1]})
results_table = pd.concat([results_table, new_row], ignore_index=True)

## 3. GloVe Embeddings + SVM

# Path to GloVe file on Google Drive
glove_path = '/content/drive/My Drive/glove.6B.100d.txt'

# Step 1: Count the number of lines (words) in the GloVe file
def count_lines(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        return sum(1 for _ in f)

# Step 2: Preallocate KeyedVectors
num_words = count_lines(glove_path)
vector_size = 100  # GloVe 100d has 100-dimensional vectors

# Preallocate KeyedVectors with the number of words and vector size
glove_vectors = KeyedVectors(vector_size=vector_size, count=num_words)

# Step 3: Load GloVe vectors with progress bar
print("Loading GloVe embeddings...")

# Using tqdm to show the progress bar while loading the GloVe file
with open(glove_path, 'r', encoding='utf-8') as f:
    batch = []
    for line in tqdm(f, total=num_words, desc="Processing GloVe Vectors"):
        values = line.split()
        word = values[0]
        vector = np.asarray(values[1:], dtype='float32')
        batch.append((word, vector))

    glove_vectors.add_vectors([word for word, vector in batch], np.array([vector for word, vector in batch]))

# Normalize vectors
glove_vectors.fill_norms()

# Function to get GloVe embeddings for a given text
def get_glove_embedding(text):
    tokens = text.split()
    vectors = [glove_vectors[word] for word in tokens if word in glove_vectors]

    if len(vectors) > 0:
        return np.mean(vectors, axis=0)
    else:
        return np.zeros(vector_size)  # Return a zero vector if no valid words are found

X_train_glove = np.array([get_glove_embedding(text) for text in tqdm(X_train, desc="Processing Training Set")])
X_test_glove = np.array([get_glove_embedding(text) for text in tqdm(X_test, desc="Processing Test Set")])

# SVM on GloVe embeddings
clf_glove = SVC(kernel='linear', max_iter=1000)  # Używamy SVM z liniowym jądrem
clf_glove.fit(X_train_glove, y_train)

precision, recall, f1 = evaluate_model(clf_glove, X_test_glove, y_test)
new_row = pd.DataFrame({"Model": ["GloVe + SVM"], "Precision": [precision], "Recall": [recall], "F1 Score": [f1]})
results_table = pd.concat([results_table, new_row], ignore_index=True)

# Show final results table
print(results_table)

Training Word2Vec model...


  results_table = pd.concat([results_table, new_row], ignore_index=True)


Training FastText model...




Loading GloVe embeddings...


Processing GloVe Vectors: 100%|██████████| 400001/400001 [00:39<00:00, 10169.42it/s]
Processing Training Set: 100%|██████████| 20000/20000 [00:33<00:00, 590.63it/s]
Processing Test Set: 100%|██████████| 5000/5000 [00:07<00:00, 703.82it/s]


            Model  Precision    Recall  F1 Score
0  Word2Vec + SVM   0.523464  0.988719  0.684519
1  FastText + SVM   0.514673  0.932716  0.663324
2     GloVe + SVM   0.496698  1.000000  0.663725


In [None]:
# The best F1 score -> Word2Vec + Logistic Regression -> 0.839258