In [None]:
# https://www.kaggle.com/competitions/commonlitreadabilityprize
# predict ease of readability of passage

In [None]:
import typing as t
from ast import literal_eval

from transformer.models.regressor import RegressorLM
from transformer.dataloaders.inference import InferenceDataModule
from transformer.params import TransformerParams

import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from lightning import Trainer
from lightning.pytorch.callbacks.early_stopping import EarlyStopping
from transformers import LlamaTokenizer

In [None]:
# load and preview data
# - built as a single chain so `data` is bound once to an independent frame;
#   assigning a column onto a `.loc`-filtered frame afterwards can raise
#   SettingWithCopyWarning
# exclude:
# - excerpts outside [-3, 1] as these are unreliable
# - single outlier with zero easiness rating standard error
data = (
    pd.read_csv("data/commonlit.csv")
    .loc[
        lambda df: df["BT Easiness"].between(-3, 1) & (df["BT s.e."] > 0),
        ["Excerpt", "BT Easiness", "BT s.e."],
    ]
    # convert newlines to spaces (literal match, not a regular expression)
    .assign(Excerpt=lambda df: df["Excerpt"].str.replace("\n", " ", regex=False))
)
data.head()

In [None]:
# side-by-side histograms of the easiness rating and its standard error
fig, axs = plt.subplots(nrows=1, ncols=2, figsize=(10, 3))
panels = [("BT Easiness", "Easiness Rating"), ("BT s.e.", "Std. Error")]
for ax, (column, title) in zip(axs, panels):
    data[column].plot.hist(ax=ax, title=title)
fig.tight_layout()
plt.show()

In [None]:
# model inputs: the excerpts as a plain Python list of strings
X = data["Excerpt"].tolist()

In [None]:
# use min-max scaled readability score as output
# - the scaler is fitted on the fixed rating bounds -3 and 1 rather than on the
#   observed data
# - the scores are passed as a plain array: the scaler was fitted without
#   feature names, so transforming a DataFrame would make scikit-learn emit a
#   feature-name mismatch warning
scaler = MinMaxScaler()
scaler.fit(np.array([[-3.0], [1.0]]))
y = torch.from_numpy(scaler.transform(data[["BT Easiness"]].to_numpy())).float()

In [None]:
# loss function sample weights: one over the standard error of each rating,
# kept as a single-column float32 tensor
weights = torch.from_numpy(
    data.loc[:, ["BT s.e."]].pow(-1).to_numpy()
).to(torch.float32)

In [None]:
# create data module
class CommonlitReadabilityDataModule(InferenceDataModule):
    """Data module exposing the CommonLit excerpts, targets and sample weights.

    The data comes from the notebook-level ``X``, ``y`` and ``weights``
    variables, so the cells defining them must run before ``setup`` is called.
    """

    def setup(self: t.Self, stage: str) -> None:
        """Attach the notebook data to the module, then run the parent setup.

        Args:
            stage: stage name, forwarded unchanged to the parent ``setup``.
        """
        self.X = X
        self.y = y
        self.weights = weights
        # NOTE(review): presumably the parent setup reads these attributes to
        # encode and split the data — confirm against InferenceDataModule
        super().setup(stage=stage)

In [None]:
# load the pretrained llama tokenizer
# - an EOS token is not appended by default, hence add_eos_token=True
# - there is no padding token out of the box, so "<pad>" is registered as one
tokenizer = LlamaTokenizer.from_pretrained(
    "huggyllama/llama-7b",
    legacy=False,
    add_eos_token=True,
)
tokenizer.add_special_tokens(dict(pad_token="<pad>"))

In [None]:
# histogram of the number of tokens per excerpt
data["Excerpt"].apply(lambda text: len(tokenizer.tokenize(text))).plot.hist(bins=50)

In [None]:
# initialize the transformer
# - seed torch first so that any random weight initialization (and the training
#   run that follows) is repeatable after a kernel restart; the value matches
#   the random_state used for the data split
context_length = 300
torch.manual_seed(1)
model = RegressorLM(
    config=TransformerParams(context_length=context_length),
    tokenizer=tokenizer,
)

In [None]:
# build the data module that encodes the excerpts and serves the data splits
datamodule = CommonlitReadabilityDataModule(
    # encoding
    tokenizer=tokenizer,
    context_length=context_length,
    # splitting
    val_size=0.2,
    test_size=0.1,
    limit=None,
    random_state=1,
    # loading
    batch_size=32,
    num_workers=9,
    persistent_workers=True,
)

In [None]:
%%time
# fit the model on CPU for at most 50 epochs
# - early stopping watches "val_loss" (lower is better) with a patience of 5
trainer = Trainer(
    accelerator="cpu",
    max_epochs=50,
    callbacks=[EarlyStopping(monitor="val_loss", mode="min", patience=5)],
)
trainer.fit(model=model, datamodule=datamodule)

In [None]:
# view first batch of test set predictions
# - `pred` holds one entry per batch, so take the first entry; slicing `[:10]`
#   would display the first ten batches instead
pred = trainer.predict(model=model, datamodule=datamodule)
pred[0]

In [None]:
# calculate root-mean-squared error
# - the model is a regressor, so testing two continuous scores for exact
#   equality is almost never true and an "accuracy" is not informative
# - NOTE(review): assumes x[1] and x[2] are the predicted and target scores,
#   both on the min-max scaled range — confirm against the predict output
torch.tensor(
    [float(x[1]) - float(x[2]) for batch in pred for x in batch]
).pow(2).mean().sqrt()