In [1]:
# https://www.kaggle.com/code/bcruise/adventures-of-florida-man/input

In [3]:
import typing as t

from transformer.models.causal import CausalLM
from transformer.dataloaders.teacher_forcing import TeacherForcingDataModule
from transformer.params import TransformerParams

import pandas as pd
from lightning import Trainer
from lightning.pytorch.callbacks.early_stopping import EarlyStopping
from transformers import LlamaTokenizer

In [4]:
# load and preview data
titles = pd.read_csv("data/florida_man.csv").title
titles.tail()

42768    Florida woman assaults boyfriend after he refu...
42769    Florida Woman Arrested After Dispute Over Moth...
42770    Law firm demands Florida man remove racist ‘co...
42771    Florida Man arrested for assaulting wife with ...
42772    Half of the articles linked in /r/FloridaMan d...
Name: title, dtype: object

In [5]:
# create data module
class FloridaManDataModule(TeacherForcingDataModule):
    def setup(self: t.Self, stage: str) -> None:
        # read titles with 200 or fewer characters from CSV
        self.data = titles.loc[titles.str.len() <= 200].to_list()
        super().setup(stage=stage)

In [7]:
# initialize pretrained tokenizer for causal language modelling
# - llama does not add an EOS token by default, so override this
# - llama also does not use a padding token, so this needs to be added
tokenizer = LlamaTokenizer.from_pretrained(
    "huggyllama/llama-7b", add_eos_token=True, legacy=False
)
tokenizer.add_special_tokens({"pad_token": "<pad>"})

1

In [8]:
# initialize the transformer
context_length = 64
model = CausalLM(
    config=TransformerParams(context_length=context_length),
    tokenizer=tokenizer,
)

In [9]:
# tokenize & encode data and prepare train/test splits
datamodule = FloridaManDataModule(
    tokenizer=tokenizer,
    context_length=context_length,
    batch_size=32,
    val_size=0.2,
    test_size=0.1,
    num_workers=9,
    persistent_workers=True,
    limit=None,
    random_state=1,
)

In [11]:
# train the model
trainer = Trainer(
    max_epochs=500,
    callbacks=EarlyStopping(monitor="val_loss", mode="min", patience=5),
    accelerator="cpu",
)
trainer.fit(model=model, datamodule=datamodule)

MisconfigurationException: No supported gpu backend found!

In [None]:
# calculate test metrics
trainer.test(model=model, datamodule=datamodule)

In [12]:
# view first batch of test set predictions
# note: these are still produced using teacher-forcing, so not purely generated
pred = trainer.predict(model=model, datamodule=datamodule)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Predicting DataLoader 0: 100%|██████████| 4/4 [00:01<00:00,  2.43it/s]


In [65]:
model.generate()

'bin різ go trip testimsaksakդTimestampTimestamp manSERVERSERVERSERVER'

In [30]:

model.generate("Florida man")

'Florida man Denkmallanglefern'