In [None]:
!pip install transformers evaluate

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.29.2-py3-none-any.whl (7.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m71.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.0-py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m26.3 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m87.7 

In [None]:
# Number of full passes over the training split.
EPOCHS = 15
# Number of examples per mini-batch for the data loaders.
BATCH_SIZE = 256

In [None]:
import torch
from torch.utils.data import Dataset

class SpellingDataset(Dataset):
  """(source, target) word pairs for spelling correction, read from a text file.

  Every non-blank line is split on whitespace. The first token, with any ":"
  removed, is the correct spelling; each remaining token becomes a source
  variant that maps to it. The correct word is also paired with itself, so
  already-correct input is represented in the data.

  Args:
    path: Text file to read. Defaults to "spelling.txt" in the working directory.

  Attributes:
    raw_data: The lines of the file exactly as read.
    raw_dataset: List of {"src": ..., "trg": ...} dicts, one per example.
  """

  def __init__(self, path="spelling.txt"):
    # The context manager closes the handle as soon as the lines are read.
    with open(path, "r") as handle:
      self.raw_data = handle.readlines()
    self.raw_dataset = []

    for line in self.raw_data:
      self.create_raw_examples(line)

  def create_raw_examples(self, line):
    """Append the examples encoded by one line of the file to ``raw_dataset``.

    Blank or whitespace-only lines contribute nothing.
    """
    # split() with no argument collapses runs of whitespace, so doubled spaces
    # or a trailing newline never yield empty-string tokens.
    split_line = line.split()
    if not split_line:
      return

    correct = split_line[0].replace(":", "")
    self.raw_dataset.append({"src": correct, "trg": correct})
    for data in split_line[1:]:
      self.raw_dataset.append({"src": data, "trg": correct})

  def __len__(self):
    """Return the number of (source, target) examples."""
    return len(self.raw_dataset)

  def __getitem__(self, index):
    """Return the example at ``index`` as a ``(src, trg)`` tuple of strings."""
    example = self.raw_dataset[index]
    return example["src"], example["trg"]

In [None]:
from transformers import AutoTokenizer
# Load the tokenizer published with the LaMini-Flan-T5-77M checkpoint.
tokenizer = AutoTokenizer.from_pretrained(
    "MBZUAI/LaMini-Flan-T5-77M",
)

def tokenize_function(examples):
  """Run the module-level ``tokenizer`` on ``examples`` and return its output."""
  encoded = tokenizer(examples)
  return encoded

Downloading (…)okenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

Downloading spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

In [13]:
from torch.utils.data import random_split, DataLoader

ds = SpellingDataset()

# The seeded generator must be handed to random_split; otherwise the split is
# drawn from the global RNG and differs between runs.
generator = torch.Generator().manual_seed(42)
train_ds, val_ds, test_ds = random_split(ds, [0.8, 0.1, 0.1], generator=generator)

# Only the training loader shuffles; validation keeps a fixed order.
train_dl = DataLoader(train_ds, shuffle=True, batch_size=BATCH_SIZE)
val_dl = DataLoader(val_ds, batch_size=BATCH_SIZE)

In [None]:
from transformers import AutoModelForSeq2SeqLM
# Load the pretrained sequence-to-sequence weights that will be fine-tuned.
model = AutoModelForSeq2SeqLM.from_pretrained(
    "MBZUAI/LaMini-Flan-T5-77M",
)

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/308M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [None]:
from torch.optim import AdamW
from torch.nn import CrossEntropyLoss
from transformers import get_scheduler

# The scheduler is stepped once per batch, so the schedule has to cover every
# batch of every epoch.
n_training_steps = len(train_dl) * EPOCHS

optimizer = AdamW(model.parameters(), lr=1e-3)
lr_scheduler = get_scheduler(
    "linear",
    optimizer,
    num_warmup_steps=0,
    num_training_steps=n_training_steps,
)

In [None]:
# Use the GPU when one is available; otherwise stay on the CPU instead of raising.
model.to("cuda" if torch.cuda.is_available() else "cpu")

T5ForConditionalGeneration(
  (shared): Embedding(32128, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=384, bias=False)
              (k): Linear(in_features=512, out_features=384, bias=False)
              (v): Linear(in_features=512, out_features=384, bias=False)
              (o): Linear(in_features=384, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 6)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseGatedActDense(
              (wi_0): Linear(in_features=512, out_features=1024, bias=False)
              (wi_1): Linear(in_features=512, out_features=1024, bias=False)
              (wo): 

In [None]:
from tqdm.auto import tqdm
from torch.nn.functional import one_hot

model.train()
# Place batches wherever the model's parameters live instead of assuming a GPU.
device = model.device

for epoch in range(EPOCHS):
  print("Training epoch", epoch+1)

  # Wrapping the loader lets tqdm advance and close the bar by itself.
  progress_bar = tqdm(train_dl)
  for batch in progress_bar:
    sources, targets = batch

    tokenized_data = tokenizer(list(sources), return_tensors="pt", padding=True)
    input_ids = tokenized_data.input_ids.to(device)
    attention_mask = tokenized_data.attention_mask.to(device)

    tokenized_labels = tokenizer(list(targets), return_tensors="pt", padding=True).input_ids
    # Label positions set to -100 are excluded from the loss (per the
    # Transformers docs), so padding does not contribute to training.
    tokenized_labels[tokenized_labels == tokenizer.pad_token_id] = -100
    tokenized_labels = tokenized_labels.to(device)

    outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=tokenized_labels)
    loss = outputs.loss
    loss.backward()

    optimizer.step()
    lr_scheduler.step()
    optimizer.zero_grad()

    # Show the running loss so training progress is visible.
    progress_bar.set_postfix(loss=loss.item())
    

  0%|          | 0/2 [00:00<?, ?it/s]

Training epoch 1


  0%|          | 0/2 [00:00<?, ?it/s]

Training epoch 2


  0%|          | 0/2 [00:00<?, ?it/s]

Training epoch 3


  0%|          | 0/2 [00:00<?, ?it/s]

Training epoch 4


  0%|          | 0/2 [00:00<?, ?it/s]

Training epoch 5


  0%|          | 0/2 [00:00<?, ?it/s]

Training epoch 6


  0%|          | 0/2 [00:00<?, ?it/s]

Training epoch 7


  0%|          | 0/2 [00:00<?, ?it/s]

Training epoch 8


  0%|          | 0/2 [00:00<?, ?it/s]

Training epoch 9


  0%|          | 0/2 [00:00<?, ?it/s]

Training epoch 10


  0%|          | 0/2 [00:00<?, ?it/s]

Training epoch 11


  0%|          | 0/2 [00:00<?, ?it/s]

Training epoch 12


  0%|          | 0/2 [00:00<?, ?it/s]

Training epoch 13


  0%|          | 0/2 [00:00<?, ?it/s]

Training epoch 14


  0%|          | 0/2 [00:00<?, ?it/s]

Training epoch 15


In [None]:
import evaluate

metric = evaluate.load("exact_match")
model.eval()
# Place batches wherever the model's parameters live instead of assuming a GPU.
device = model.device

# `sources` (not `input`) so the builtin input() is not shadowed for the rest
# of the kernel session.
for sources, gold in val_dl:
  with torch.no_grad():
    tokenized_input = tokenizer(list(sources), return_tensors="pt", padding=True).to(device)
    # Greedy decoding (do_sample=False), capped at 25 new tokens.
    outputs = model.generate(
        input_ids=tokenized_input["input_ids"],
        attention_mask=tokenized_input["attention_mask"],
        max_new_tokens=25,
        do_sample=False
    )
    decoded_outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    metric.add_batch(predictions=decoded_outputs, references=list(gold))
    # Print only the first example of each batch as a qualitative spot check.
    print("Input:", sources[0].replace(" ", ""))
    print("Sys:", decoded_outputs[0].replace(" ", ""))
    print("Gold:", gold[0].replace(" ", ""))
    print()

print(metric.compute())

Downloading builder script:   0%|          | 0.00/5.67k [00:00<?, ?B/s]

Input: tak
Sys: here
Gold: take

Input: accruell
Sys: accrually
Gold: accrued

Input: get
Sys: get
Gold: any

Input: unfavourites
Sys: unfavorites
Gold: unfavourites

Input: vallery
Sys: vallas
Gold: valley

Input: sisscors
Sys: scissors
Gold: scissors

Input: varie
Sys: varie
Gold: vary

Input: quart
Sys: quart
Gold: quart

Input: Epistile
Sys: Episodetic
Gold: epistle

Input: annevarcery
Sys: anomaly
Gold: anniversary

Input: considerble
Sys: considered
Gold: considerable

Input: emiomy
Sys: elementary
Gold: enemy

Input: abserded
Sys: abserted
Gold: absurd

Input: nest
Sys: nest
Gold: nest

Input: father
Sys: father
Gold: farther

Input: million
Sys: million
Gold: million

Input: ordinarialy
Sys: ordinarily
Gold: ordinarily

Input: gorgious
Sys: magnificent
Gold: gorgeous

Input: refregarator
Sys: refrigerator
Gold: refrigerator

Input: thier
Sys: better
Gold: there

Input: sherk
Sys: sherk
Gold: searched

Input: ingesing
Sys: ingesting
Gold: increasing

Input: war
Sys: war
Gold: we