In [1]:
import os
import random

import numpy as np
import pandas as pd
import torch
from pytorch_lightning import Trainer, LightningModule
from sklearn.metrics import classification_report, f1_score
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
from transformers import (
    AdamW, 
    AutoTokenizer, 
    RobertaForSequenceClassification, 
    get_linear_schedule_with_warmup
)

torch.manual_seed(0)
random.seed(0)
np.random.seed(0)

  from .autonotebook import tqdm as notebook_tqdm


### Setup

In [2]:
WORKING_DIR = os.path.dirname("__file__")
DATA_DIR = os.path.join(WORKING_DIR, "..", "data")
MODEL_NAME = "roberta-base"
CATEGORY_TO_ID = {
    "stat.AP": 0,
    "stat.CO": 1,
    "stat.ME": 2,
    "stat.ML": 3,
    "stat.TH": 4
}
ID_TO_CATEGORY = {
    v: k for k, v in CATEGORY_TO_ID.items()
}
N_CLASSES = len(CATEGORY_TO_ID)

train_df = pd.read_csv(os.path.join(DATA_DIR, "train.csv"))
test_df = pd.read_csv(os.path.join(DATA_DIR, "test.csv"))
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

In [3]:
# Set up data stuff

class ArXivDataset(Dataset):
    def __init__(self, data, tokenizer, max_length=512):
        self.titles = data["Title"]
        self.abstracts = data["Abstract"]
        self.labels = data["Primary Category"].map(CATEGORY_TO_ID)
        self.tokenizer = tokenizer
        self.max_length = max_length
    
    def __len__(self):
        return len(self.titles)
    
    def encode_text(self, idx):
        text = self.titles.iloc[idx] + "\n" + self.abstracts.iloc[idx]

        return self.tokenizer.encode_plus(
            text,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )

    def __getitem__(self, idx):
        encoding = self.encode_text(idx)
        input_ids = encoding["input_ids"].squeeze(0)
        attention_mask = encoding["attention_mask"].squeeze(0)
        label = torch.tensor(self.labels.iloc[idx], dtype=torch.long)

        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "label": label
        }

In [4]:
train_dataset = ArXivDataset(train_df, tokenizer)
test_dataset = ArXivDataset(test_df, tokenizer)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True, num_workers=3)
test_loader = DataLoader(test_dataset, batch_size=8, num_workers=3)

### Baseline: RoBERTa without Fine-Tuning

In [7]:
model = RobertaForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=N_CLASSES)
model.to("cuda")

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
             

In [9]:
model.eval()
y_true = []
y_pred = []

for batch in tqdm(test_loader, unit="batch"):
    input_ids = batch["input_ids"].to("cuda")
    attention_mask = batch["attention_mask"].to("cuda")

    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
    
    logits = outputs.logits
    predicted_labels = torch.argmax(logits, dim=1).cpu().numpy()
    true_labels = batch["label"].cpu().numpy()

    y_true.extend(true_labels)
    y_pred.extend(predicted_labels)

print(classification_report(y_true, y_pred, target_names=CATEGORY_TO_ID.keys()))
print(f"F1 Score: {f1_score(y_true, y_pred, average='macro')}")

100%|██████████| 2275/2275 [02:55<00:00, 12.94batch/s]

              precision    recall  f1-score   support

     stat.AP       0.00      0.00      0.00      2576
     stat.CO       0.00      0.00      0.00      1007
     stat.ME       0.00      0.00      0.00      5540
     stat.ML       0.27      1.00      0.42      4824
     stat.TH       0.00      0.00      0.00      4248

    accuracy                           0.27     18195
   macro avg       0.05      0.20      0.08     18195
weighted avg       0.07      0.27      0.11     18195

F1 Score: 0.08382640427472957



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


### RoBERTa Fine-Tuning

In [5]:
class ArXivStatClassifier(LightningModule):
    def __init__(self, epochs=10):
        super().__init__()
        self.model = RobertaForSequenceClassification.from_pretrained(
            MODEL_NAME, 
            num_labels=N_CLASSES
        )
        self.epochs = epochs

    def forward(self, input_ids, attention_mask, labels=None):
        return self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )

    def training_step(self, batch, batch_idx):
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        labels = batch["label"]
        outputs = self(input_ids, attention_mask, labels)
        loss = outputs.loss
        self.log("train_loss", loss, on_step=True, on_epoch=True, prog_bar=True, logger=True)

        return loss

    def configure_optimizers(self):
        optimizer = AdamW(self.parameters(), lr=5e-6)
        total_steps = len(train_loader) * self.epochs
        scheduler = get_linear_schedule_with_warmup(
            optimizer, 
            num_warmup_steps=0, 
            num_training_steps=total_steps
        )
        scheduler_config = {'scheduler': scheduler, 'interval': 'step', 'frequency': 1}

        return [optimizer], [scheduler_config]

    def test_step(self, batch, batch_idx):
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        labels = batch["label"]
        outputs = self(input_ids, attention_mask, labels)
        loss = outputs.loss
        self.log("test_loss", loss, on_step=True, on_epoch=True, prog_bar=True, logger=True, sync_dist=True)

        return loss

In [6]:
# Training
num_epochs = 30
model = ArXivStatClassifier(epochs=num_epochs)

trainer = Trainer(
    max_epochs=num_epochs,
    accelerator="cuda",
    devices=-1,
    enable_progress_bar=True
)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
/home/cpsc452_eh665/.conda/envs/nlp-arxiv/lib/python3.10/site-packages/lightning_fabric/plugins/environments/slurm.py:204: The `srun` command is available on your system but is not used. HINT: If your intention is to run Lightning on SLURM, prepend your python command with `srun` like so: srun python /home/cpsc452_eh665/.conda/envs/nlp-arxiv/lib/python ...
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
/home/cpsc452_eh665/.conda/envs/nlp-arxiv/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/logger_connector/log

In [7]:
trainer.fit(model, train_loader)

/home/cpsc452_eh665/.conda/envs/nlp-arxiv/lib/python3.10/site-packages/lightning_fabric/plugins/environments/slurm.py:204: The `srun` command is available on your system but is not used. HINT: If your intention is to run Lightning on SLURM, prepend your python command with `srun` like so: srun python /home/cpsc452_eh665/.conda/envs/nlp-arxiv/lib/python ...
/home/cpsc452_eh665/.conda/envs/nlp-arxiv/lib/python3.10/site-packages/lightning_fabric/plugins/environments/slurm.py:204: The `srun` command is available on your system but is not used. HINT: If your intention is to run Lightning on SLURM, prepend your python command with `srun` like so: srun python /home/cpsc452_eh665/.conda/envs/nlp-arxiv/lib/python ...
You are using a CUDA device ('NVIDIA RTX A5000') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/

Epoch 0: 100%|██████████| 2654/2654 [10:56<00:00,  4.04it/s, v_num=10, train_loss_step=0.278]

/home/cpsc452_eh665/.conda/envs/nlp-arxiv/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/logger_connector/result.py:441: It is recommended to use `self.log('train_loss', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices.


Epoch 29: 100%|██████████| 2654/2654 [10:56<00:00,  4.04it/s, v_num=10, train_loss_step=1.33e-5, train_loss_epoch=0.00372] 

`Trainer.fit` stopped: `max_epochs=30` reached.


Epoch 29: 100%|██████████| 2654/2654 [10:59<00:00,  4.03it/s, v_num=10, train_loss_step=1.33e-5, train_loss_epoch=0.00372]


In [8]:
trainer.save_checkpoint("../models/checkpoint_roberta_long.ckpt")

In [9]:
# Evaluate
model_finetuned = ArXivStatClassifier.load_from_checkpoint("../models/checkpoint_roberta_long.ckpt", epochs=num_epochs)
model_finetuned.to("cuda")

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


ArXivStatClassifier(
  (model): RobertaForSequenceClassification(
    (roberta): RobertaModel(
      (embeddings): RobertaEmbeddings(
        (word_embeddings): Embedding(50265, 768, padding_idx=1)
        (position_embeddings): Embedding(514, 768, padding_idx=1)
        (token_type_embeddings): Embedding(1, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): RobertaEncoder(
        (layer): ModuleList(
          (0-11): 12 x RobertaLayer(
            (attention): RobertaAttention(
              (self): RobertaSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): RobertaSelfOutput(
                (de

In [10]:
model_finetuned.eval()
y_true = []
y_pred = []

for batch in tqdm(test_loader, unit="batch"):
    input_ids = batch["input_ids"].to("cuda")
    attention_mask = batch["attention_mask"].to("cuda")

    with torch.no_grad():
        outputs = model_finetuned(input_ids, attention_mask=attention_mask)
    
    logits = outputs.logits
    predicted_labels = torch.argmax(logits, dim=1).cpu().numpy()
    true_labels = batch["label"].cpu().numpy()

    y_true.extend(true_labels)
    y_pred.extend(predicted_labels)

print(classification_report(y_true, y_pred, target_names=CATEGORY_TO_ID.keys()))
print(f"F1 Score: {f1_score(y_true, y_pred, average='macro')}")

100%|██████████| 2275/2275 [02:56<00:00, 12.92batch/s]

              precision    recall  f1-score   support

     stat.AP       0.66      0.59      0.62      2576
     stat.CO       0.49      0.50      0.50      1007
     stat.ME       0.63      0.68      0.65      5540
     stat.ML       0.81      0.75      0.78      4824
     stat.TH       0.69      0.73      0.71      4248

    accuracy                           0.69     18195
   macro avg       0.66      0.65      0.65     18195
weighted avg       0.69      0.69      0.69     18195

F1 Score: 0.6517949304043265



