## Tutorial task - embeddings

Authors:
- Hubert Bujakowski
- Jan Kruszewski

Let's load imdb dataset with custom train and test data splits.

In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
import pandas as pd

splits = {'train': 'plain_text/train-00000-of-00001.parquet', 'test': 'plain_text/test-00000-of-00001.parquet'}
train = pd.read_parquet("hf://datasets/stanfordnlp/imdb/" + splits["train"])
test = pd.read_parquet("hf://datasets/stanfordnlp/imdb/" + splits["test"])

# Preprocessing

The next step is **simple text preprocessing**. It includes deleting *HTML* tags, removing punctuation, as well as converting uppercase letters to lowercase.

In [3]:
import re

def preprocess_text(text):
    text = re.sub(r'<.*?>', '', text)
    text = text.lower()
    text = re.sub(r'[^a-z0-9\s]', '', text)
    return text

In [4]:
train['processed'] = train['text'].apply(preprocess_text)
test['processed'] = test['text'].apply(preprocess_text)

In [5]:
import pandas as pd
import numpy as np
from gensim.models import Word2Vec, FastText
import gensim.downloader as api

glove_wiki = api.load("glove-wiki-gigaword-50")

X_train = train['processed']
X_test = test['processed']
y_train = train['label']
y_test = test['label']

### Embeddings and classifier models comparison

To compare the methods, we chose 3 metrics, including f1_score, precision and recall.

In [6]:
from sklearn.metrics import f1_score, precision_score, recall_score

def metrics(y_test, y_pred):
    return {'f1': f1_score(y_test, y_pred), 'precision': precision_score(y_test, y_pred), 'recall': recall_score(y_test, y_pred)}

To extract features from text we used 3 embedding models:
- Word2Vec
- FastText
- GloVe

In [16]:
def get_word2vec_embeddings(sentences):
    tokenized_sentences = [sentence.split() for sentence in sentences]
    model = Word2Vec(tokenized_sentences, vector_size=100, window=5, min_count=1, workers=4)
    embeddings = [np.mean([model.wv[word] for word in sentence.split() if word in model.wv], axis=0) for sentence in sentences]
    return np.array(embeddings)

def get_fasttext_embeddings(sentences):
    model = FastText(sentences, vector_size=100, window=5, min_count=1, workers=4)
    embeddings = [np.mean([model.wv[word] for word in sentence.split() if word in model.wv], axis=0) for sentence in sentences]
    return np.array(embeddings)

def get_glove_embeddings(series):
    def get_glove_embedding(text, vectors=glove_wiki):
        embeddings = []
        for word in text.split():
            try:
                embeddings.append(vectors[word])
            except KeyError:
                continue
        if len(embeddings) == 0:
            return np.zeros(vectors.vector_size)
        return np.mean(embeddings, axis=0)

    return np.vstack(series.apply(lambda x: get_glove_embedding(x)))

For classification we tested 3 models:
- Logistic Regression
- Random Forest
- SVM classifier

In [17]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

lr_classifier = LogisticRegression(max_iter=1000)
rf_classifier = RandomForestClassifier()
svm_classifier = SVC()

X_train_w2v = get_word2vec_embeddings(X_train)
X_test_w2v = get_word2vec_embeddings(X_test)

X_train_ft = get_fasttext_embeddings(X_train)
X_test_ft = get_fasttext_embeddings(X_test)

X_train_glove = get_glove_embeddings(X_train)
X_test_glove = get_glove_embeddings(X_test)

classifiers = [lr_classifier, rf_classifier, svm_classifier]

In [18]:
results = []

for classifier in classifiers:
    for X_train, X_test, name in zip([X_train_w2v, X_train_ft, X_train_glove], [X_test_w2v, X_test_ft, X_test_glove], ['w2v', 'ft', 'glove']):
        print(f"Training {classifier.__class__.__name__} with {name} embeddings")
        classifier.fit(X_train, y_train)
        y_pred = classifier.predict(X_test)
        results.append({'classifier': classifier.__class__.__name__, 'embedding': name, 'f1': metrics(y_test, y_pred)['f1'], 'precision': metrics(y_test, y_pred)['precision'], 'recall': metrics(y_test, y_pred)['recall']})

Training LogisticRegression with w2v embeddings


Training LogisticRegression with ft embeddings
Training LogisticRegression with glove embeddings
Training RandomForestClassifier with w2v embeddings
Training RandomForestClassifier with ft embeddings
Training RandomForestClassifier with glove embeddings
Training SVC with w2v embeddings
Training SVC with ft embeddings
Training SVC with glove embeddings


In [19]:
results_df = pd.DataFrame(results)
results_df.to_csv('results.csv', index=False)

Additionally, we used the `transformers` library to potentially achieve better results.

In [8]:
import torch
device = torch.device("cuda:7")

In [14]:
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import Dataset, DataLoader

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
class SentimentDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=512):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length
    
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = self.labels[idx]
        
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
        
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }

In [11]:
train_dataset = SentimentDataset(X_train, y_train, tokenizer)
test_dataset = SentimentDataset(X_test, y_test, tokenizer)

train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=16)

Let's see how the model performs on the test data without finetuning.

In [15]:
from sklearn.metrics import precision_score, recall_score, f1_score

model.to(device)
model.eval()
all_predictions = []
all_labels = []

with torch.no_grad():
    for batch in test_dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask)
        predictions = torch.argmax(outputs.logits, dim=-1)

        all_predictions.extend(predictions.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

precision = precision_score(all_labels, all_predictions)
recall = recall_score(all_labels, all_predictions)
f1 = f1_score(all_labels, all_predictions)

print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1}')

Precision: 0.5001801513271148
Recall: 0.99952
F1 Score: 0.6667200298834014


The results are not promising, so we decided to fine-tune the model for 1 epoch.

In [12]:
from transformers import AdamW
from tqdm import tqdm

model.to(device)

optimizer = AdamW(model.parameters(), lr=1e-5)

model.train()
for epoch in range(1):
    loop = tqdm(train_dataloader, leave=True)
    for batch in loop:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        loss.backward()

        optimizer.step()
        optimizer.zero_grad()

        loop.set_description(f'Epoch {epoch}')
        loop.set_postfix(loss=loss.item())

Epoch 0: 100%|██████████| 1563/1563 [13:01<00:00,  2.00it/s, loss=0.328] 


In [13]:
from sklearn.metrics import precision_score, recall_score, f1_score

model.eval()
all_predictions = []
all_labels = []

with torch.no_grad():
    for batch in test_dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask)
        predictions = torch.argmax(outputs.logits, dim=-1)

        all_predictions.extend(predictions.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

precision = precision_score(all_labels, all_predictions)
recall = recall_score(all_labels, all_predictions)
f1 = f1_score(all_labels, all_predictions)

print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1}')

Precision: 0.9224198730507014
Recall: 0.94168
F1 Score: 0.9319504374331974


Even though we tested several models, the `BERT` model achieved the best results, especially when fine-tuned.

In [2]:
pd.read_csv('results.csv').sort_values('f1', ascending=False)

Unnamed: 0,classifier,embedding,f1,precision,recall
9,BERT,BERT,0.93195,0.92242,0.94168
2,LogisticRegression,glove,0.752823,0.761858,0.744
8,SVC,glove,0.751232,0.765033,0.73792
5,RandomForestClassifier,glove,0.744247,0.744575,0.74392
1,LogisticRegression,ft,0.659037,0.500978,0.9628
7,SVC,ft,0.439388,0.52975,0.37536
3,RandomForestClassifier,w2v,0.369796,0.523043,0.286
4,RandomForestClassifier,ft,0.224998,0.522601,0.14336
6,SVC,w2v,0.03654,0.75974,0.01872
0,LogisticRegression,w2v,0.009844,0.639175,0.00496
