In [1]:
# https://www.kaggle.com/code/bcruise/adventures-of-florida-man/input

In [2]:
from __future__ import annotations

from transformer.models.causal import CausalLM
from transformer.dataloaders.teacher_forcing import TeacherForcingDataModule
from transformer.params import TransformerParams

from transformers import LlamaTokenizerFast

import pandas as pd
from lightning import Trainer
from lightning.pytorch.callbacks.early_stopping import EarlyStopping

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# load and analyse data
titles = pd.read_csv("data/florida_man.csv").title
titles.head()

0    Florida woman backing pickup out of driveway h...
1    Florida man accused of assaulting girl lured a...
2    Am available to fuck message me on snap :morga...
3    Florida Woman thought to have stuffed 93 year-...
4    Florida man strikes again, Brandon Duhaime abs...
Name: title, dtype: object

In [4]:
class FloridaManDataModule(TeacherForcingDataModule):
    def setup(self: FloridaManDataModule, stage: str) -> None:
        # read titles with 200 or fewer characters from CSV
        self.data = titles.loc[titles.str.len() <= 200].tolist()
        super().setup(stage=stage)

In [5]:
# initialize pretrained tokenizer for causal language modelling
# - llama does not add an EOS token by default, so override this
# - llama also does not use a padding token, so this needs to be added
tokenizer = LlamaTokenizerFast.from_pretrained(
    "huggyllama/llama-7b", add_eos_token=True, from_slow=True
)
tokenizer.add_special_tokens({"pad_token": "<pad>"})

You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 - if you loaded a llama tokenizer from a GGUF file you can ignore this message
You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama_fast.LlamaTokenizerFast'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggin

1

In [7]:
# initialize the transformer
context_length = 64
model = CausalLM(
    config=TransformerParams(context_length=context_length),
    tokenizer=tokenizer,
)

In [10]:
# tokenize & encode data and prepare train/test splits
datamodule = FloridaManDataModule(
    tokenizer=tokenizer,
    context_length=context_length,
    batch_size=32,
    val_size=0.2,
    test_size=0.1,
    num_workers=9,
    persistent_workers=True,
    limit=None,
    random_state=1,
)

In [66]:
# train the model
trainer = Trainer(
    max_epochs=500,
    callbacks=EarlyStopping(monitor="val_loss", mode="min", patience=5),
    accelerator="cpu",
)
trainer.fit(model=model, datamodule=datamodule)

GPU available: True (mps), used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/Users/edwinonuonga/env/llm-arm64/lib/python3.11/site-packages/lightning/pytorch/trainer/setup.py:177: GPU available but not used. You can set it by doing `Trainer(accelerator='gpu')`.

  | Name  | Type       | Params | Mode 
---------------------------------------------
0 | model | ModuleDict | 35.3 M | train
---------------------------------------------
35.3 M    Trainable params
0         Non-trainable params
35.3 M    Total params
141.158   Total estimated model params size (MB)


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

In [None]:
# calculate test metrics
trainer.test(model=model, datamodule=datamodule)

In [12]:
# view first batch of test set predictions
# note: these are still produced using teacher-forcing, so not purely generated
pred = trainer.predict(model=model, datamodule=datamodule)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Predicting DataLoader 0: 100%|██████████| 4/4 [00:01<00:00,  2.43it/s]


In [65]:
model.generate()

'bin різ go trip testimsaksakդTimestampTimestamp manSERVERSERVERSERVER'

In [30]:

model.generate("Florida man")

'Florida man Denkmallanglefern'