In [10]:
import os
import nltk
import yaml
import torch
import pandas as pd

# Fetch the NLTK data packages for this notebook (presumably consumed by
# tokenisation / stop-word steps outside this view — TODO confirm).
# nltk.download() reports packages that are already up-to-date instead of
# failing, so re-running this cell is harmless.
nltk.download('punkt')
nltk.download('stopwords')
# Last expression of the cell: its return value is what the cell displays.
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /home/user/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/user/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/user/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [12]:
# Load the project configuration from YAML into the `config` dict.
# The encoding is pinned to UTF-8 so the file is decoded the same way on every
# platform; without it open() falls back to the locale's default encoding,
# which can garble non-ASCII values (e.g. on Windows).
with open("../configs/main.yml", 'r', encoding="utf-8") as file:
    config = yaml.safe_load(file)

In [13]:
# Both splits live in <BASE_DIR>/<TRAIN_TEST_PATH>/; only the file name differs.
path_to_train, path_to_test = (
    os.path.join(config['BASE_DIR'], config['TRAIN_TEST_PATH'], split_file)
    for split_file in ('train.csv', 'test.csv')
)

In [16]:
# Read the two splits into DataFrames (train first, then test).
# Note: the frame named `val` is loaded from test.csv.
train, val = map(pd.read_csv, (path_to_train, path_to_test))

In [4]:
from torchtext.vocab import build_vocab_from_iterator

def yield_tokens(df):
    """Lazily yield the first-column value of every row of ``df``.

    Rows are visited in ``DataFrame.iterrows`` order and each row contributes
    exactly one item: the entry stored in its first column.
    """
    rows = (row for _, row in df.iterrows())
    yield from (row.to_list()[0] for row in rows)


# Reserved tokens. They are handed to the vocabulary builder as `specials`;
# the *_IDX constants below are simply their positions in this list
# (this matches the vocabulary only if specials are inserted first, which is
# torchtext's default behaviour).
special_symbols = ['<unk>', '<pad>', '<bos>', '<eos>']
UNK_IDX, PAD_IDX, BOS_IDX, EOS_IDX = range(len(special_symbols))

vocab = build_vocab_from_iterator(
    yield_tokens(train),
    specials=special_symbols,
)
# Tokens that were not seen while building the vocabulary map to <unk>.
vocab.set_default_index(UNK_IDX)

In [5]:
from torch.utils.data import DataLoader

# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

def collate_batch(batch):
    """Pack a batch of ``(text, label)`` samples into flat tensors.

    Every text is mapped to ids with the module-level ``vocab`` and all id
    sequences are concatenated into a single 1-D tensor; ``offsets`` records
    the position at which each sample starts inside that tensor (the layout
    the model's ``EmbeddingBag`` is called with).

    Args:
        batch: iterable of ``(text, label)`` pairs.

    Returns:
        ``(labels, flat_ids, offsets)`` tensors, each moved to the
        module-level ``device``.
    """
    raw_labels, id_tensors, lengths = [], [], []
    for _text, _label in batch:
        raw_labels.append(_label)
        ids = torch.tensor(vocab(_text), dtype=torch.int64)
        id_tensors.append(ids)
        lengths.append(ids.size(0))

    # Labels are coerced with int() before building the tensor.
    labels = torch.tensor([int(value) for value in raw_labels], dtype=torch.int64)
    # Sample k starts at the summed length of samples 0..k-1; the first starts at 0.
    offsets = torch.tensor([0] + lengths[:-1]).cumsum(dim=0)
    flat_ids = torch.cat(id_tensors)
    return labels.to(device), flat_ids.to(device), offsets.to(device)

# Wrap both splits in DataLoaders over their raw row arrays. The two loaders
# share every setting except shuffling: only the training split is shuffled.
train_dataloader, val_dataloader = (
    DataLoader(
        frame.to_numpy(),
        batch_size=128,
        shuffle=reshuffle,
        collate_fn=collate_batch,
    )
    for frame, reshuffle in ((train, True), (val, False))
)


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.1.1 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "/usr/lib/python3.10/runpy.py", line 196, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/usr/lib/python3.10/runpy.py", line 86, in _run_code
    exec(code, run_globals)
  File "/home/user/pmldl/assignment1/.venv/lib/python3.10/site-packages/ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "/home/user/pmldl/assignment1/.venv/lib/python3.10/site-packages/traitlets/config/application.py", line 1075, in launch_instance
    app.start()
  File "/home/user/pm

In [6]:
import torch.nn as nn

class TextClassificationModel(nn.Module):
    """Bag-of-embeddings text classifier.

    An ``nn.EmbeddingBag`` reduces each sample's token ids to one vector,
    which a two-layer MLP (Linear -> ReLU -> Dropout -> Linear) turns into
    class logits.

    Args:
        num_classes: size of the output (logit) dimension.
        vocab_len: number of rows in the embedding table.
        embed_dim: width of the embedding vectors. Defaults to 256, the value
            that used to be hard-coded.
        hidden_dim: width of the hidden layer. Defaults to 512, the value
            that used to be hard-coded.
        dropout: dropout probability applied after the hidden activation.
            Defaults to 0.5, the value that used to be hard-coded.
    """

    def __init__(self, num_classes, vocab_len, embed_dim=256, hidden_dim=512, dropout=0.5):
        super().__init__()
        self.embedding = nn.EmbeddingBag(vocab_len, embed_dim)
        # Layer order is kept as-is so state_dict keys (classifier.0 / classifier.3)
        # stay compatible with previously saved checkpoints.
        self.classifier = nn.Sequential(
            nn.Linear(embed_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, num_classes),
        )

    def forward(self, text, offsets):
        """Return class logits, one row per sample.

        Args:
            text: 1-D tensor with the token ids of all samples concatenated.
            offsets: 1-D tensor with the start index of each sample in ``text``.

        Returns:
            Tensor of shape ``(len(offsets), num_classes)`` with unnormalised logits.
        """
        embedded = self.embedding(input=text, offsets=offsets)
        logits = self.classifier(embedded)
        return logits

In [7]:
from tqdm.autonotebook import tqdm

def train_one_epoch(
    model,
    loader,
    optimizer,
    scheduler,
    loss_fn,
    epoch_num=-1
):
    """Run one training pass over ``loader`` and step the LR scheduler once.

    Args:
        model: module called as ``model(texts, offsets)``.
        loader: iterable of ``(labels, texts, offsets)`` batches; must support ``len()``.
        optimizer: optimizer stepped once per batch.
        scheduler: LR scheduler stepped once, after the last batch.
        loss_fn: callable ``loss_fn(outputs, labels)`` returning a scalar loss.
            The progress-bar value assumes it returns the batch *mean*
            (the default reduction of ``CrossEntropyLoss``).
        epoch_num: epoch index, used only in the progress-bar label.
    """
    loop = tqdm(
        loader,
        total=len(loader),
        desc=f"Epoch {epoch_num}: train",
        leave=True,
    )
    model.train()
    train_loss = 0.0  # running sum of per-sample losses
    seen = 0          # number of samples processed so far
    for labels, texts, offsets in loop:
        # zero the parameter gradients
        model.zero_grad()

        # forward pass
        outputs = model(texts, offsets)
        # loss calculation
        loss = loss_fn(outputs, labels)

        # backward pass
        loss.backward()

        # optimizer run
        optimizer.step()
        optimizer.zero_grad()

        # Weight each batch mean by its size so the displayed value is the true
        # per-sample average, even when the last batch is smaller. Previously
        # the sum of batch means was divided by a sample count, under-reporting
        # the loss by roughly a factor of the batch size.
        batch_size = len(labels)
        train_loss += loss.item() * batch_size
        seen += batch_size
        loop.set_postfix({"loss": train_loss / max(seen, 1)})
    scheduler.step()

def val_one_epoch(
    model,
    loader,
    loss_fn,
    epoch_num=-1,
    best_so_far=0.0,
    ckpt_path='best.pt'
):
    """Evaluate ``model`` on ``loader`` and checkpoint it when accuracy improves.

    Args:
        model: module called as ``model(texts, offsets)``.
        loader: iterable of ``(labels, texts, offsets)`` batches; must support ``len()``.
        loss_fn: callable ``loss_fn(outputs, labels)`` returning a scalar loss.
            The progress-bar value assumes it returns the batch *mean*
            (the default reduction of ``CrossEntropyLoss``).
        epoch_num: epoch index, used only in the progress-bar label.
        best_so_far: best accuracy seen in earlier epochs.
        ckpt_path: where ``model.state_dict()`` is saved on improvement.

    Returns:
        The larger of ``best_so_far`` and this epoch's accuracy. When the
        loader yields no samples, ``best_so_far`` is returned unchanged and
        nothing is saved.
    """

    loop = tqdm(
        loader,
        total=len(loader),
        desc=f"Epoch {epoch_num}: val",
        leave=True,
    )
    val_loss = 0.0  # running sum of per-sample losses
    correct = 0
    total = 0
    with torch.no_grad():
        model.eval()  # evaluation mode
        for labels, texts, offsets in loop:
            # forward pass
            outputs = model(texts, offsets)
            # loss calculation
            loss = loss_fn(outputs, labels)

            _, predicted = torch.max(outputs, dim=1)
            batch_size = len(labels)
            total += batch_size
            correct += (predicted == labels).sum().item()

            # Weight the batch mean by its size so that dividing by `total`
            # yields a real per-sample average (it used to be a sum of batch
            # means divided by a sample count).
            val_loss += loss.item() * batch_size
            denominator = max(total, 1)
            loop.set_postfix({"loss": val_loss / denominator, "acc": correct / denominator})

        # An empty loader has no accuracy to compare; avoid dividing by zero.
        if total == 0:
            return best_so_far

        accuracy = correct / total
        if accuracy > best_so_far:
            best_so_far = accuracy
            torch.save(model.state_dict(), ckpt_path)

    return best_so_far

In [8]:
# Training setup: 10 epochs, a 3-class model sized to the vocabulary, RMSprop
# with its default hyper-parameters, a learning rate multiplied by 0.95 on
# every scheduler step, and cross-entropy as the objective.
epochs = 10
model = TextClassificationModel(num_classes=3, vocab_len=len(vocab)).to(device)
optimizer = torch.optim.RMSprop(params=model.parameters())
scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer=optimizer, gamma=0.95)
loss_fn = torch.nn.CrossEntropyLoss()

In [9]:
# Alternate a training pass and a validation pass per epoch. `best` carries the
# best validation accuracy forward; val_one_epoch returns the updated value and
# saves a checkpoint whenever it improves. Starting from -inf means the first
# epoch always produces a checkpoint.
best = float('-inf')
for epoch in range(epochs):
    train_one_epoch(
        model=model,
        loader=train_dataloader,
        optimizer=optimizer,
        scheduler=scheduler,
        loss_fn=loss_fn,
        epoch_num=epoch,
    )
    best = val_one_epoch(
        model=model,
        loader=val_dataloader,
        loss_fn=loss_fn,
        epoch_num=epoch,
        best_so_far=best,
    )

Epoch 0: train:   0%|          | 0/204 [00:00<?, ?it/s]

Epoch 0: val:   0%|          | 0/51 [00:00<?, ?it/s]

Epoch 1: train:   0%|          | 0/204 [00:00<?, ?it/s]

Epoch 1: val:   0%|          | 0/51 [00:00<?, ?it/s]

Epoch 2: train:   0%|          | 0/204 [00:00<?, ?it/s]

Epoch 2: val:   0%|          | 0/51 [00:00<?, ?it/s]

Epoch 3: train:   0%|          | 0/204 [00:00<?, ?it/s]

Epoch 3: val:   0%|          | 0/51 [00:00<?, ?it/s]

Epoch 4: train:   0%|          | 0/204 [00:00<?, ?it/s]

Epoch 4: val:   0%|          | 0/51 [00:00<?, ?it/s]

Epoch 5: train:   0%|          | 0/204 [00:00<?, ?it/s]

Epoch 5: val:   0%|          | 0/51 [00:00<?, ?it/s]

Epoch 6: train:   0%|          | 0/204 [00:00<?, ?it/s]

Epoch 6: val:   0%|          | 0/51 [00:00<?, ?it/s]

Epoch 7: train:   0%|          | 0/204 [00:00<?, ?it/s]

Epoch 7: val:   0%|          | 0/51 [00:00<?, ?it/s]

Epoch 8: train:   0%|          | 0/204 [00:00<?, ?it/s]

Epoch 8: val:   0%|          | 0/51 [00:00<?, ?it/s]

Epoch 9: train:   0%|          | 0/204 [00:00<?, ?it/s]

Epoch 9: val:   0%|          | 0/51 [00:00<?, ?it/s]