In [None]:
# https://www.kaggle.com/datasets/spsayakpaul/arxiv-paper-abstracts/data
# task: predict a paper's primary category from its title (abstracts are in the dataset but are not used below)

In [20]:
import typing as t
from ast import literal_eval

import pandas as pd
from lightning import Trainer, seed_everything
from lightning.pytorch.callbacks.early_stopping import EarlyStopping
from sklearn.preprocessing import LabelEncoder
from transformers import LlamaTokenizer

from transformer.dataloaders.inference import InferenceDataModule
from transformer.models.classifier import ClassifierLM
from transformer.params import TransformerParams

In [8]:
# read the arXiv CSV into a DataFrame and display its last five rows
data = pd.read_csv(filepath_or_buffer="data/arxiv.csv")
data.tail(n=5)

Unnamed: 0,terms,titles,abstracts
56176,"['cs.CV', 'cs.IR']",Mining Spatio-temporal Data on Industrializati...,Despite the growing availability of big data i...
56177,"['cs.LG', 'cs.AI', 'cs.CL', 'I.2.6; I.2.7']",Wav2Letter: an End-to-End ConvNet-based Speech...,This paper presents a simple end-to-end model ...
56178,['cs.LG'],Deep Reinforcement Learning with Double Q-lear...,The popular Q-learning algorithm is known to o...
56179,"['stat.ML', 'cs.LG', 'math.OC']",Generalized Low Rank Models,Principal components analysis (PCA) is a well-...
56180,"['cs.LG', 'cs.AI', 'stat.ML']",Chi-square Tests Driven Method for Learning th...,SDYNA is a general framework designed to addre...


In [9]:
# inputs are the paper titles; the target is the first entry of each row's
# `terms` list (stored in the CSV as the string repr of a Python list)
X = data["titles"].tolist()
y = data["terms"].map(literal_eval).str[0].to_numpy()

In [13]:
# encode categories as integer class ids
# - the encoder is always fitted on the raw string categories taken from `data`
#   (first entry of each `terms` list), not on the current value of `y`; this
#   keeps the cell idempotent: re-running it can never re-fit the encoder on
#   already-encoded integers, which would replace the category names stored in
#   `label_encoder.classes_` with integers
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(data.terms.apply(literal_eval).str[0])

In [16]:
# data module wrapping the arXiv titles and encoded categories
class ArxivDataModule(InferenceDataModule):
    """``InferenceDataModule`` whose data comes from the notebook's ``X`` and ``y``."""

    def setup(self: t.Self, stage: str) -> None:
        """Attach the notebook-level ``X`` / ``y`` to this module, then run the parent ``setup``.

        Args:
            stage: Forwarded unchanged to ``InferenceDataModule.setup``.
        """
        # X and y are looked up in the notebook's global namespace when setup runs
        self.X = X
        self.y = y
        super().setup(stage=stage)

{0, 1, 2}

In [21]:
# load the pretrained llama tokenizer, with two adjustments:
# - add_eos_token=True, because llama does not append an EOS token by default
# - a "<pad>" token is registered, because llama does not use a padding token
tokenizer = LlamaTokenizer.from_pretrained(
    "huggyllama/llama-7b",
    legacy=False,
    add_eos_token=True,
)
tokenizer.add_special_tokens(dict(pad_token="<pad>"))

1

In [None]:
# initialize the transformer
# - seed Python, NumPy and PyTorch RNGs first, so that any random state used
#   while building and training the model is the same after a kernel restart
#   (workers=True also derives seeds for dataloader workers)
# - context_length is defined here because the data module cell reuses it
seed_everything(1, workers=True)
context_length = 64
model = ClassifierLM(
    config=TransformerParams(context_length=context_length),
    tokenizer=tokenizer,
)

In [None]:
# build the data module that tokenizes & encodes the data and prepares the
# train/test splits (keyword arguments grouped by concern)
datamodule = ArxivDataModule(
    # tokenization: same tokenizer and context length as the model
    tokenizer=tokenizer,
    context_length=context_length,
    # split sizes and the seed passed for them
    val_size=0.2,
    test_size=0.1,
    random_state=1,
    # batching and worker settings
    batch_size=32,
    num_workers=9,
    persistent_workers=True,
    # no limit is passed
    limit=None,
)

In [None]:
# fit the model on CPU
# - max_epochs=500 is only an upper bound: early stopping watches "val_loss"
#   (lower is better) and stops after 5 checks without improvement
trainer = Trainer(
    accelerator="cpu",
    max_epochs=500,
    callbacks=[EarlyStopping(monitor="val_loss", patience=5, mode="min")],
)
trainer.fit(datamodule=datamodule, model=model)

In [None]:
# evaluate the trained model with the data module and display the test metrics
trainer.test(datamodule=datamodule, model=model)

In [None]:
# view first batch of test set predictions
# - trainer.predict returns one entry per batch, so index 0 is the first batch;
#   the trailing expression is what makes the cell actually display it
pred = trainer.predict(model=model, datamodule=datamodule)
pred[0]