In [2]:
%%capture
!pip install datasets transformers

In [None]:
from transformers import BertTokenizer
from datasets import load_dataset
import torch
from torch.utils.data import DataLoader
from torch import nn
import matplotlib.pyplot as plt
from datasets import load_dataset
import unicodedata
import re

## Tokenizer


In [2]:
# Load the tokenizer published with the "bert-base-uncased" checkpoint.
# do_lower_case=True asks it to lower-case text before splitting it into tokens.
# Only the tokenizer is loaded here; no BERT model weights are used in the cells below.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", do_lower_case=True)

In [3]:
# Device used for the model and for every batch: the GPU when CUDA is available,
# otherwise the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [4]:
# Small demo: show how a sentence is split into sub-word tokens and which id each
# token gets (special tokens such as [CLS] / [SEP] are added by `encode`).
sent = "I love footballing"
ids = tokenizer.encode(sent)

# Use the public `convert_ids_to_tokens` API rather than indexing the tokenizer's
# `ids_to_tokens` dict, which only exists on the pure-Python tokenizer class.
for token_id, token in zip(ids, tokenizer.convert_ids_to_tokens(ids)):
    print(token_id, token)

101 [CLS]
1045 i
2293 love
2374 football
2075 ##ing
102 [SEP]


**Dataset**

We will use the IMDB dataset. It is a sentiment-classification dataset with 2 classes: Positive and Negative. We download it from the HuggingFace Hub [here](https://huggingface.co/datasets/stanfordnlp/imdb).

We will use only a very small subset of **64** annotated texts. This paradigm is called "few-shot learning".


In [5]:
# Start from the IMDB "train" split and shuffle it with a fixed seed.
imdb_shuffled = load_dataset("stanfordnlp/imdb")["train"].shuffle(seed=42)

# Hold out 30% of it as a test split (fixed seed), then keep only the first
# 64 training examples and the first 1000 test examples.
dataset = imdb_shuffled.train_test_split(test_size=0.3, seed=1234)
for split_name, n_examples in (("train", 64), ("test", 1000)):
    dataset[split_name] = dataset[split_name].select(range(n_examples))

print(dataset)


def preprocess_text(x):
    """Clean the raw text of one example and turn it into token ids.

    Cleaning steps, in order: lower-casing, Unicode NFD decomposition with every
    non-ASCII character dropped, replacement of every character outside
    ``a-z0-9`` by a space, replacement of each run of digits by ``<NUM>``, and
    collapsing of repeated spaces (leading/trailing whitespace is stripped).

    Args:
        x: mapping with a "text" entry holding the raw string.

    Returns:
        dict with a single "input_ids" entry: the ids produced by the
        module-level ``tokenizer`` (truncated to at most 256 tokens, no padding).
    """
    # Lower-case first so the character filter below only has to allow a-z.
    cleaned = x["text"].lower()

    # Decompose accented characters, then discard whatever is not plain ASCII.
    ascii_bytes = unicodedata.normalize("NFD", cleaned).encode("ascii", "ignore")
    cleaned = ascii_bytes.decode()

    # Order matters: the punctuation filter must run before "<NUM>" is inserted,
    # otherwise the angle brackets of the placeholder would be wiped out.
    for pattern, replacement in (
        (r"[^a-z0-9]", " "),
        (r"\d+", "<NUM>"),
    ):
        cleaned = re.sub(pattern, replacement, cleaned)

    # Collapse the runs of spaces left behind by the substitutions above.
    cleaned = re.sub(" +", " ", cleaned.strip())

    encoding = tokenizer(cleaned, truncation=True, max_length=256, padding=False)
    return {"input_ids": encoding["input_ids"]}


# Clean and tokenize every example of both splits in one pass.
# `preprocess_text` returns an "input_ids" entry for each example; the collator
# defined below reads that entry together with the example's "label".
dataset = dataset.map(preprocess_text)

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 64
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 1000
    })
})


Map:   0%|          | 0/64 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [None]:
class DataCollator:
    """Turn a list of tokenized examples into padded batch tensors.

    Args:
        tokenizer: object exposing ``pad_token_id``, the id used for padding.
        max_len: stored on the instance; note that ``__call__`` does not read it,
            so batches are padded to their own longest sequence, not to ``max_len``.
    """

    def __init__(self, tokenizer, max_len=128):
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __call__(self, batch):
        """Pad every example to the length of the longest one in ``batch``.

        Args:
            batch: list of examples, each with a "label" and a list "input_ids".

        Returns:
            dict of tensors: "input_ids" (padded ids), "pad_mask" (1 for real
            tokens, 0 for padding) and "labels".
        """
        pad_id = self.tokenizer.pad_token_id
        labels = [example["label"] for example in batch]
        longest = max(len(example["input_ids"]) for example in batch)

        padded_ids = []
        masks = []
        for example in batch:
            token_ids = example["input_ids"]
            n_padding = longest - len(token_ids)
            padded_ids.append(token_ids + [pad_id] * n_padding)
            masks.append([1] * len(token_ids) + [0] * n_padding)

        return {
            "input_ids": torch.tensor(padded_ids),
            "pad_mask": torch.tensor(masks),
            "labels": torch.tensor(labels),
        }

In [None]:
# TODO
def validation_step(valid_dataloader, model, criterion):
    """Evaluate ``model`` on every batch of ``valid_dataloader`` without gradients.

    Args:
        valid_dataloader: iterable of batches; each batch is a dict holding the
            tensors "input_ids", "pad_mask" and "labels".
        model: module called as ``model(input_ids, pad_mask)`` and returning one
            row of class scores per example. It is switched to eval mode.
        criterion: callable ``criterion(scores, labels)`` returning a scalar
            loss tensor.

    Returns:
        tuple ``(loss, accuracy)``: the mean of the per-batch losses and the
        fraction of correctly classified examples among those actually seen.

    Raises:
        ValueError: if the dataloader yields no batch.
    """
    model.eval()
    total_loss = 0.0
    n_correct = 0
    n_samples = 0
    n_batches = 0
    with torch.no_grad():
        for batch in valid_dataloader:
            input_ids = batch["input_ids"].to(DEVICE)
            labels = batch["labels"].to(DEVICE)
            pad_mask = batch["pad_mask"].to(DEVICE)

            class_scores = model(input_ids, pad_mask)
            total_loss += criterion(class_scores, labels).item()
            n_correct += (class_scores.argmax(dim=-1) == labels).sum().item()

            # Count the examples really processed instead of trusting
            # len(dataloader.dataset): the two differ with drop_last / samplers,
            # and some loaders have no sized dataset at all.
            n_samples += labels.size(0)
            n_batches += 1

    if n_batches == 0:
        raise ValueError("valid_dataloader yielded no batches")
    return total_loss / n_batches, n_correct / n_samples


def train_one_epoch(train_dataloader, model, optimizer, criterion):
    """Run one optimisation pass over ``train_dataloader``.

    Args:
        train_dataloader: iterable of batches; each batch is a dict holding the
            tensors "input_ids", "pad_mask" and "labels".
        model: module called as ``model(input_ids, pad_mask)`` and returning one
            row of class scores per example. It is switched to train mode.
        optimizer: optimizer over the model parameters; stepped once per batch.
        criterion: callable ``criterion(scores, labels)`` returning a scalar
            loss tensor.

    Returns:
        tuple ``(loss, accuracy)``: the mean of the per-batch losses and the
        fraction of correctly classified examples among those actually seen
        (measured with the weights in use at the time of each batch).

    Raises:
        ValueError: if the dataloader yields no batch.
    """
    model.train()
    total_loss = 0.0
    n_correct = 0
    n_samples = 0
    n_batches = 0
    for batch in train_dataloader:
        optimizer.zero_grad()
        input_ids = batch["input_ids"].to(DEVICE)
        labels = batch["labels"].to(DEVICE)
        pad_mask = batch["pad_mask"].to(DEVICE)
        class_scores = model(input_ids, pad_mask)  # (B, n_classes)

        loss = criterion(class_scores, labels)  # scalar
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        n_correct += (class_scores.argmax(dim=-1) == labels).sum().item()

        # Count the examples really processed instead of trusting
        # len(dataloader.dataset): the two differ with drop_last / samplers.
        n_samples += labels.size(0)
        n_batches += 1

    if n_batches == 0:
        raise ValueError("train_dataloader yielded no batches")
    return total_loss / n_batches, n_correct / n_samples


def train(model, train_dataloader, valid_dataloader, lr=0.01, n_epochs=5):
    """Train ``model`` with Adam and cross-entropy, then plot the learning curves.

    After every epoch the model is evaluated on ``valid_dataloader`` and a
    one-line summary is printed. Once training is over, two panels are drawn:
    train/validation loss and train/validation accuracy per epoch.

    Args:
        model: module to train; it is moved to ``DEVICE`` in place.
        train_dataloader: batches used for optimisation.
        valid_dataloader: batches used for evaluation after each epoch.
        lr: Adam learning rate.
        n_epochs: number of passes over ``train_dataloader``.

    Returns:
        None. Results are reported through the printed log and the figure.
    """
    # Move the model first: the torch.optim documentation recommends building
    # the optimizer only after the parameters live on their final device.
    model.to(DEVICE)
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    # Per-epoch metrics, plotted once training is finished.
    train_losses = []
    train_accuracies = []
    valid_losses = []
    valid_accuracies = []
    for epoch in range(n_epochs):
        train_loss, train_accuracy = train_one_epoch(
            train_dataloader, model, optimizer, criterion
        )
        valid_loss, valid_accuracy = validation_step(valid_dataloader, model, criterion)
        print(
            f"Epoch {epoch + 1}: train_loss: {train_loss:.4f}, train_accuracy: {train_accuracy:.4f}, valid_loss: {valid_loss:.4f}, valid_accuracy: {valid_accuracy:.4f}"
        )
        train_losses.append(train_loss)
        train_accuracies.append(train_accuracy)
        valid_losses.append(valid_loss)
        valid_accuracies.append(valid_accuracy)

    # Use 1-based epoch numbers on the x-axis so the curves line up with the log.
    epochs = range(1, len(train_losses) + 1)
    fig, (loss_ax, acc_ax) = plt.subplots(1, 2, figsize=(12, 6))

    loss_ax.plot(epochs, train_losses, label="train loss")
    loss_ax.plot(epochs, valid_losses, label="valid loss")
    loss_ax.set(title="Loss per epoch", xlabel="epoch", ylabel="cross-entropy loss")
    loss_ax.legend()

    acc_ax.plot(epochs, train_accuracies, label="train accuracy")
    acc_ax.plot(epochs, valid_accuracies, label="valid accuracy")
    acc_ax.set(title="Accuracy per epoch", xlabel="epoch", ylabel="accuracy")
    acc_ax.legend()

    fig.tight_layout()
    plt.show()

In [None]:
batch_size = 4
n_train = len(dataset["train"])
n_valid = len(dataset["test"])
data_collator = DataCollator(tokenizer)

# Training batches are reshuffled at every epoch.
train_dataloader = DataLoader(
    dataset["train"], batch_size=batch_size, collate_fn=data_collator, shuffle=True
)
# Evaluation does not benefit from shuffling: keep a fixed order so that
# validation batches are identical from one epoch (and one run) to the next.
valid_dataloader = DataLoader(
    dataset["test"], batch_size=batch_size, collate_fn=data_collator, shuffle=False
)

In [None]:
class WordEmbedClassifier(nn.Module):
    """Bag-of-embeddings classifier.

    Each token id is embedded, the embeddings of the non-padding tokens are
    averaged, and a linear layer maps that average to one score per class.

    Args:
        vocab_size: number of rows of the embedding table.
        d: embedding dimension.
        n_classes: number of scores produced per example (defaults to 4).
    """

    def __init__(self, vocab_size, d, n_classes=4):
        super().__init__()
        self.embedding_layer = nn.Embedding(vocab_size, d)
        self.class_projection = nn.Linear(d, n_classes)

    def forward(self, input_ids, pad_mask):
        """Compute class scores from padded token ids.

        Args:
            input_ids: (B, L) integer tensor of token ids.
            pad_mask: (B, L) tensor, non-zero for real tokens and 0 for padding.

        Returns:
            (B, n_classes) tensor of unnormalised class scores.
        """
        x_embed = self.embedding_layer(input_ids)  # (B, L, d)

        # A plain mean over L would also average the padding vectors, so padded
        # positions are zeroed out and the sum is divided by the real length.
        mask = pad_mask.unsqueeze(-1).to(x_embed.dtype)  # (B, L, 1)

        # Clamp to 1 so that a row made only of padding yields a zero vector
        # instead of 0 / 0 = NaN (which would poison the loss and gradients).
        n_non_pad = pad_mask.sum(dim=1, keepdim=True).clamp(min=1)  # (B, 1)

        x_vector = (x_embed * mask).sum(dim=1) / n_non_pad  # (B, d)

        class_scores = self.class_projection(x_vector)  # (B, n_classes)
        return class_scores

In [None]:
# Fix the torch RNG so that weight initialisation and the shuffling of the
# training batches are the same every time this cell is run.
torch.manual_seed(0)

# IMDB is a binary sentiment task (negative / positive), so the classifier
# needs 2 output scores, not 4.
model = WordEmbedClassifier(
    d=50,
    n_classes=2,
    vocab_size=len(tokenizer),
)
train(
    model,
    train_dataloader=train_dataloader,
    valid_dataloader=valid_dataloader,
    lr=0.01,
    n_epochs=10,
)