In [1]:
# https://www.kaggle.com/datasets/spsayakpaul/arxiv-paper-abstracts/data
# predict category from title/abstract

In [2]:
import typing as t
from ast import literal_eval

from transformer.models.seq2seq import Seq2SeqLM
from transformer.dataloaders.seq2seq import Seq2SeqDataModule
from transformer.params import TransformerParams

import torch
import numpy as np
import pandas as pd
from lightning import Trainer
from lightning.pytorch.callbacks.early_stopping import EarlyStopping
from transformers import LlamaTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# load and preview data
data = pd.read_csv("data/arxiv.csv")
data.titles = data.titles.str.replace("\n", " ")
data.abstracts = data.abstracts.str.replace("\n", " ")
data.tail()

Unnamed: 0,terms,titles,abstracts
56176,"['cs.CV', 'cs.IR']",Mining Spatio-temporal Data on Industrializati...,Despite the growing availability of big data i...
56177,"['cs.LG', 'cs.AI', 'cs.CL', 'I.2.6; I.2.7']",Wav2Letter: an End-to-End ConvNet-based Speech...,This paper presents a simple end-to-end model ...
56178,['cs.LG'],Deep Reinforcement Learning with Double Q-lear...,The popular Q-learning algorithm is known to o...
56179,"['stat.ML', 'cs.LG', 'math.OC']",Generalized Low Rank Models,Principal components analysis (PCA) is a well-...
56180,"['cs.LG', 'cs.AI', 'stat.ML']",Chi-square Tests Driven Method for Learning th...,SDYNA is a general framework designed to addre...


In [4]:
# create data module
class ArxivSummarizationDataModule(Seq2SeqDataModule):
    def setup(self: t.Self, stage: str) -> None:
        self.data = data[["abstracts", "titles"]].to_numpy()
        super().setup(stage=stage)

In [5]:
# initialize pretrained tokenizer
# - llama does not add an EOS token by default, so override this
# - llama also does not use a padding token, so this needs to be added
tokenizer = LlamaTokenizer.from_pretrained(
    "huggyllama/llama-7b", add_eos_token=True, legacy=False
)
tokenizer.add_special_tokens({"pad_token": "<pad>"})

1

In [6]:
# initialize the transformer - note that for this seq2seq task, it is appropriate to use the same tokenizer for input and output
context_length = 64
model = Seq2SeqLM(
    config=TransformerParams(context_length=context_length),
    input_tokenizer=tokenizer,
    output_tokenizer=tokenizer,
)

In [7]:
# tokenize & encode data and prepare train/test splits
datamodule = ArxivSummarizationDataModule(
    input_tokenizer=tokenizer,
    output_tokenizer=tokenizer,
    context_length=context_length,
    batch_size=32,
    val_size=0.2,
    test_size=0.1,
    num_workers=9,
    persistent_workers=True,
    limit=None,
    random_state=1,
)

In [8]:
%%time
# train the model
trainer = Trainer(
    max_epochs=5,
    callbacks=EarlyStopping(monitor="val_loss", mode="min", patience=5),
    accelerator="gpu",
)
trainer.fit(model=model, datamodule=datamodule)

MisconfigurationException: No supported gpu backend found!

In [11]:
# calculate test metrics
trainer.test(model=model, datamodule=datamodule)

Testing DataLoader 0: 100%|██████████| 176/176 [00:08<00:00, 21.04it/s]
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
        test_loss            0.773556113243103
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────


[{'test_loss': 0.773556113243103}]

In [12]:
# view first batch of test set predictions
pred = trainer.predict(model=model, datamodule=datamodule)
pred[:10]

Predicting DataLoader 0: 100%|██████████| 176/176 [00:12<00:00, 14.02it/s]


[[('Tensor train rank minimization with nonlocal self-similarity for tensor completion',
   1,
   0),
  ('DyCo3D: Robust Instance Segmentation of 3D Point Clouds through Dynamic Convolution',
   0,
   0),
  ('Cloud Transformers: A Universal Approach To Point Cloud Processing Tasks',
   1,
   0),
  ('Attentional Local Contrast Networks for Infrared Small Target Detection',
   0,
   0),
  ('Single Image 3D Object Estimation with Primitive Graph Networks', 0, 0),
  ('Res-GCNN: A Lightweight Residual Graph Convolutional Neural Networks for Human Trajectory Forecasting',
   1,
   0),
  ('An Empirical Evaluation of the t-SNE Algorithm for Data Visualization in Structural Engineering',
   1,
   1),
  ('Density-Aware Convolutional Networks with Context Encoding for Airborne LiDAR Point Cloud Classification',
   0,
   0),
  ('Classification under Streaming Emerging New Classes: A Solution using Completely Random Trees',
   1,
   1),
  ('Line Artist: A Multiple Style Sketch to Painting Synthesis

In [13]:
# calculate accuracy
torch.tensor([x[1] == x[2] for batch in pred for x in batch]).float().mean()

tensor(0.6371)