In [2]:
import os
import random

import numpy as np
import pandas as pd
import torch
from pytorch_lightning import Trainer, LightningModule
from sklearn.metrics import classification_report, f1_score
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
from transformers import (
    AdamW, 
    AutoTokenizer, 
    RobertaForSequenceClassification, 
    get_linear_schedule_with_warmup
)

# Pin every random number generator the notebook touches (PyTorch, the
# standard-library `random` module, and NumPy's legacy global RNG) to seed 0.
for _seed_rng in (torch.manual_seed, random.seed, np.random.seed):
    _seed_rng(0)

### Setup

In [3]:
# A notebook has no real `__file__`; the previous `os.path.dirname("__file__")`
# took the dirname of a *string literal* and therefore always evaluated to ''.
# Resolve paths against the kernel's current working directory explicitly
# instead (the same location the relative checkpoint paths below rely on).
WORKING_DIR = os.getcwd()
DATA_DIR = os.path.join(WORKING_DIR, "..", "data")

# Hugging Face checkpoint used for both the tokenizer and the classifier.
MODEL_NAME = "roberta-base"

# arXiv primary category -> integer class id used as the training label.
CATEGORY_TO_ID = {
    "stat.AP": 0,
    "stat.CO": 1,
    "stat.ME": 2,
    "stat.ML": 3,
    "stat.TH": 4
}
# Inverse lookup: integer class id -> arXiv primary category.
ID_TO_CATEGORY = {
    class_id: category for category, class_id in CATEGORY_TO_ID.items()
}
N_CLASSES = len(CATEGORY_TO_ID)

train_df = pd.read_csv(os.path.join(DATA_DIR, "train.csv"))
test_df = pd.read_csv(os.path.join(DATA_DIR, "test.csv"))
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

In [4]:
# Set up data stuff

class ArXivDataset(Dataset):
    """Torch dataset that turns title/abstract rows into classifier inputs.

    Each item is the row's title and abstract joined by the tokenizer's
    separator token, tokenized to a fixed length, together with the integer
    id of the row's primary category.

    Args:
        data: DataFrame with "Title", "Abstract" and "Primary Category"
            columns.
        tokenizer: Callable tokenizer exposing ``sep_token``. It is called
            with the joined text and must return a mapping holding
            "input_ids" and "attention_mask" tensors with a leading batch
            dimension of 1.
        max_length: Length every sequence is padded or truncated to.
        category_to_id: Optional mapping from category name to integer
            label. Defaults to the notebook-level ``CATEGORY_TO_ID``.

    Raises:
        ValueError: If any row's primary category is missing from the
            mapping (previously this surfaced only later, as a NaN label).
    """

    def __init__(self, data, tokenizer, max_length=256, category_to_id=None):
        if category_to_id is None:
            category_to_id = CATEGORY_TO_ID

        categories = data["Primary Category"]
        labels = categories.map(category_to_id)

        # `.map` silently yields NaN for unknown keys; fail fast and name them.
        unmapped = labels.isna().to_numpy()
        if unmapped.any():
            unknown = sorted(categories[unmapped].astype(str).unique())
            raise ValueError(f"Unknown primary categories: {unknown}")

        self.titles = data["Title"]
        self.abstracts = data["Abstract"]
        self.labels = labels.astype("int64")
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows in the dataset."""
        return len(self.titles)

    def encode_text(self, idx):
        """Tokenize row ``idx`` as ``title + sep_token + abstract``.

        Returns the tokenizer's output, padded/truncated to ``max_length``
        and requested as PyTorch tensors.
        """
        text = self.titles.iloc[idx] + self.tokenizer.sep_token + self.abstracts.iloc[idx]

        # Calling the tokenizer directly replaces the deprecated `encode_plus`.
        return self.tokenizer(
            text,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )

    def __getitem__(self, idx):
        """Return the input ids, attention mask and label for row ``idx``."""
        encoding = self.encode_text(idx)
        # Drop the batch dimension added by `return_tensors="pt"`; the
        # DataLoader re-batches the items itself.
        input_ids = encoding["input_ids"].squeeze(0)
        attention_mask = encoding["attention_mask"].squeeze(0)
        label = torch.tensor(int(self.labels.iloc[idx]), dtype=torch.long)

        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "label": label
        }

In [5]:
# Wrap both splits in the tokenizing dataset, then batch them. Only the
# training loader shuffles.
train_dataset = ArXivDataset(data=train_df, tokenizer=tokenizer)
test_dataset = ArXivDataset(data=test_df, tokenizer=tokenizer)

train_loader = DataLoader(
    train_dataset,
    batch_size=16,
    shuffle=True,
    num_workers=3,
)
test_loader = DataLoader(
    test_dataset,
    batch_size=16,
    num_workers=3,
)

### RoBERTa Baseline

(i.e. train only the classification head while keeping the pretrained RoBERTa encoder weights frozen)

In [6]:
class RobertaBaseline(LightningModule):
    """RoBERTa sequence classifier whose pretrained backbone is frozen.

    Loads ``RobertaForSequenceClassification`` for ``MODEL_NAME`` with
    ``N_CLASSES`` labels and disables gradients on ``self.model.roberta``,
    so only the remaining parameters receive updates.

    Args:
        epochs: Number of training epochs. Stored as ``self.epochs`` for
            backward compatibility; the learning-rate schedule length is
            taken from the attached Trainer, so this should match the
            Trainer's ``max_epochs``.
    """

    def __init__(self, epochs=10):
        super().__init__()
        self.model = RobertaForSequenceClassification.from_pretrained(
            MODEL_NAME,
            num_labels=N_CLASSES
        )

        # Freeze pretrained part
        for param in self.model.roberta.parameters():
            param.requires_grad = False

        self.epochs = epochs

    def forward(self, input_ids, attention_mask, labels=None):
        """Run the wrapped model; its output carries a loss when ``labels`` is given."""
        return self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )

    def _shared_step(self, batch, stage):
        """Compute the loss for one batch and log it as ``{stage}_loss``."""
        outputs = self(batch["input_ids"], batch["attention_mask"], batch["label"])
        loss = outputs.loss
        self.log(
            f"{stage}_loss",
            loss,
            on_step=True,
            on_epoch=True,
            prog_bar=True,
            logger=True,
            sync_dist=True
        )

        return loss

    def training_step(self, batch, batch_idx):
        """Return the training loss for ``batch`` (logged as ``train_loss``)."""
        return self._shared_step(batch, "train")

    def configure_optimizers(self):
        """Build AdamW plus a per-step linear-decay schedule without warmup.

        Returns:
            A ``([optimizer], [scheduler_config])`` pair in the format
            Lightning expects.
        """
        # `transformers.AdamW` is deprecated in favour of the PyTorch
        # implementation; eps/weight_decay are pinned to the old defaults
        # so the optimisation behaviour is unchanged.
        optimizer = torch.optim.AdamW(
            self.parameters(),
            lr=1e-3,
            eps=1e-6,
            weight_decay=0.0
        )
        # Ask the Trainer for the number of optimizer steps instead of
        # reading a notebook-level `train_loader`: this removes the hidden
        # global dependency and stays correct when the data is sharded
        # across devices or gradients are accumulated.
        total_steps = self.trainer.estimated_stepping_batches
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=0,
            num_training_steps=total_steps
        )
        scheduler_config = {'scheduler': scheduler, 'interval': 'step', 'frequency': 1}

        return [optimizer], [scheduler_config]

    def test_step(self, batch, batch_idx):
        """Return the test loss for ``batch`` (logged as ``test_loss``)."""
        return self._shared_step(batch, "test")

In [7]:
# Training: the same epoch count is handed to both the model and the Trainer.
num_epochs = 10
model = RobertaBaseline(epochs=num_epochs)

# devices=-1 requests every available CUDA device.
trainer = Trainer(
    accelerator="cuda",
    devices=-1,
    max_epochs=num_epochs,
    enable_progress_bar=True,
)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.out_proj.bias', 'classifier.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
/home/cpsc452_eh665/.conda/envs/nlp-arxiv/lib/python3.10/site-packages/lightning_fabric/plugins/environments/slurm.py:204: The `srun` command is available on your system but is not used. HINT: If your intention is to run Lightning on SLURM, prepend your python command with `srun` like so: srun python /home/cpsc452_eh665/.conda/envs/nlp-arxiv/lib/python ...
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
/home/cpsc452_eh665/.conda/envs/nlp-arxiv/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/logger_connector/log

In [8]:
# Train on the training loader only; no validation loader is supplied.
trainer.fit(model, train_dataloaders=train_loader)

/home/cpsc452_eh665/.conda/envs/nlp-arxiv/lib/python3.10/site-packages/lightning_fabric/plugins/environments/slurm.py:204: The `srun` command is available on your system but is not used. HINT: If your intention is to run Lightning on SLURM, prepend your python command with `srun` like so: srun python /home/cpsc452_eh665/.conda/envs/nlp-arxiv/lib/python ...
/home/cpsc452_eh665/.conda/envs/nlp-arxiv/lib/python3.10/site-packages/lightning_fabric/plugins/environments/slurm.py:204: The `srun` command is available on your system but is not used. HINT: If your intention is to run Lightning on SLURM, prepend your python command with `srun` like so: srun python /home/cpsc452_eh665/.conda/envs/nlp-arxiv/lib/python ...
You are using a CUDA device ('NVIDIA A100-PCIE-40GB') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/gener

Epoch 9: 100%|██████████| 1327/1327 [01:57<00:00, 11.30it/s, v_num=24, train_loss_step=0.729, train_loss_epoch=0.862]

`Trainer.fit` stopped: `max_epochs=10` reached.


Epoch 9: 100%|██████████| 1327/1327 [01:58<00:00, 11.20it/s, v_num=24, train_loss_step=0.729, train_loss_epoch=0.862]


In [9]:
# Persist the trained baseline so the evaluation cell can reload it.
trainer.save_checkpoint(filepath="../models/roberta_baseline.ckpt")

In [12]:
# Evaluate: reload the saved baseline from disk and move it onto the GPU.
model_baseline = RobertaBaseline.load_from_checkpoint(
    checkpoint_path="../models/roberta_baseline.ckpt"
)
model_baseline.to("cuda")

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.out_proj.bias', 'classifier.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RobertaBaseline(
  (model): RobertaForSequenceClassification(
    (roberta): RobertaModel(
      (embeddings): RobertaEmbeddings(
        (word_embeddings): Embedding(50265, 768, padding_idx=1)
        (position_embeddings): Embedding(514, 768, padding_idx=1)
        (token_type_embeddings): Embedding(1, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): RobertaEncoder(
        (layer): ModuleList(
          (0-11): 12 x RobertaLayer(
            (attention): RobertaAttention(
              (self): RobertaSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): RobertaSelfOutput(
                (dense)

In [13]:
# Score the reloaded baseline on the test split: collect the argmax class per
# example, then print the per-class report and the macro-averaged F1.
model_baseline.eval()

# Follow the model's own device rather than hard-coding "cuda".
device = model_baseline.device

# Pass the class ids explicitly, in id order, so the report's rows always line
# up with their names. Without `labels`, classification_report raises when a
# class is absent from both y_true and y_pred.
label_ids = sorted(ID_TO_CATEGORY)
label_names = [ID_TO_CATEGORY[label_id] for label_id in label_ids]

y_true = []
y_pred = []

for batch in tqdm(test_loader, unit="batch"):
    input_ids = batch["input_ids"].to(device)
    attention_mask = batch["attention_mask"].to(device)

    # No labels are passed, so only the logits are used here.
    with torch.no_grad():
        outputs = model_baseline(input_ids, attention_mask=attention_mask)

    predicted_labels = torch.argmax(outputs.logits, dim=1).cpu().tolist()
    true_labels = batch["label"].cpu().tolist()

    y_true.extend(true_labels)
    y_pred.extend(predicted_labels)

print(classification_report(y_true, y_pred, labels=label_ids, target_names=label_names))
print(f"Macro-Averaged F1-Score: {f1_score(y_true, y_pred, average='macro')}")

100%|██████████| 1138/1138 [01:06<00:00, 17.21batch/s]


              precision    recall  f1-score   support

     stat.AP       0.73      0.49      0.58      2576
     stat.CO       0.49      0.42      0.45      1007
     stat.ME       0.56      0.75      0.64      5540
     stat.ML       0.76      0.73      0.74      4824
     stat.TH       0.74      0.61      0.67      4248

    accuracy                           0.66     18195
   macro avg       0.66      0.60      0.62     18195
weighted avg       0.67      0.66      0.66     18195

Macro-Averaged F1-Score: 0.6175602389198405
