# Training GPT-Wee

If you want to use this notebook to train your own very small GPT-2 model, replace every ```/path/to/``` placeholder with your own local path.

### Tokenizer

Imports:

In [None]:
from tokenizers import (
    decoders,
    models,
    normalizers,
    pre_tokenizers,
    processors,
    trainers,
    Tokenizer)
from tqdm import tqdm 
from tokenizers.normalizers import Lowercase, Strip, StripAccents, NFD
from datasets import load_dataset
import torch

Initialize with BPE:

In [None]:
# Start from an empty BPE model; its vocabulary is learned by tokenizer.train() below.
tokenizer = Tokenizer(models.BPE())

Normalizer that applies Unicode NFD normalization, lowercases the text, and strips surrounding whitespace and accents

(explanations here: https://huggingface.co/docs/tokenizers/components)

In [None]:
# Applied in order: NFD decomposition, lowercasing, whitespace stripping,
# accent removal. NFD comes first so that accents are split off into separate
# marks that StripAccents can then remove.
normalizer = normalizers.Sequence([NFD(), Lowercase(), Strip(), StripAccents()])

Test:

In [None]:
# Sanity check: the result should be lower-cased and free of accents.
normalizer.normalize_str("Héllò hôw are ü?")

In [None]:
# Attach the normalizer to the tokenizer so it is applied to every input.
tokenizer.normalizer = normalizer

Pre-tokenization (division of text into tokens on which BPE can be performed):

In [None]:
# Byte-level pre-tokenizer; add_prefix_space=False means no space is
# prepended to the input text.
tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)

In [None]:
# Sanity check: show how a sample sentence is split before BPE is applied.
tokenizer.pre_tokenizer.pre_tokenize_str("Let's test pre-tokenization!")

In [None]:
# BPE trainer with a target vocabulary of 8000 entries. "<|endoftext|>" is
# reserved as a special token; it is later used as both BOS and EOS.
# NOTE: the name `trainer` is rebound to a transformers Trainer further down.
trainer = trainers.BpeTrainer(vocab_size=8000, special_tokens=["<|endoftext|>"])

Data from https://github.com/babylm/babylm.github.io/raw/main/babylm_data.zip:

In [None]:
# Training splits of the ten BabyLM 10M corpora used to fit the tokenizer.
tokenizer_corpus_dir = "/path/to/babylm_data/babylm_10M"
tokenizer_corpora = (
    "aochildes",
    "bnc_spoken",
    "cbt",
    "children_stories",
    "gutenberg",
    "open_subtitles",
    "qed",
    "simple_wikipedia",
    "switchboard",
    "wikipedia",
)
textfiles = [
    f"{tokenizer_corpus_dir}/{corpus}.train" for corpus in tokenizer_corpora
]

In [None]:
# Learn the BPE vocabulary from the corpus files listed above.
tokenizer.train(files = textfiles, trainer=trainer)

In [None]:
# Quick check: show how the trained tokenizer splits a sample sentence.
encoding = tokenizer.encode("Let's test this tokenizer.")
print(encoding.tokens)

By default, the ByteLevel BPE might include whitespaces in the produced tokens. If you don’t want the offsets to include these whitespaces, then this PostProcessor must be used:

(https://huggingface.co/docs/tokenizers/api/post-processors)

In [None]:
# trim_offsets=True: reported offsets exclude the whitespace that byte-level
# tokens may carry.
tokenizer.post_processor = processors.ByteLevel(trim_offsets=True)

In [None]:
# Check the trimmed offsets: take the (start, end) character span of the
# token at index 4 and slice it back out of the original sentence.
sentence = "Let's test this tokenizer."
encoding = tokenizer.encode(sentence)
start, end = encoding.offsets[4]
sentence[start:end]

In [None]:
# Display the tokenizer object to inspect what has been configured so far.
tokenizer

In [None]:
# Byte-level decoder, the counterpart of the byte-level pre-tokenizer; it is
# used when token ids are decoded back to text.
tokenizer.decoder = decoders.ByteLevel()

In [None]:
# Round trip: decode the ids of the sample sentence encoded above.
tokenizer.decode(encoding.ids)

Save it:

In [None]:
from transformers import PreTrainedTokenizerFast

# Wrap the trained tokenizer so it can be used with transformers.
# The same special token serves as both beginning- and end-of-sequence marker.
wrapped_tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=tokenizer,
    bos_token="<|endoftext|>",
    eos_token="<|endoftext|>")

In [None]:
# Write the wrapped tokenizer to disk; the training section reloads it from
# this directory.
wrapped_tokenizer.save_pretrained("/path/to/tokenizer/")

### Training 

Load tokenizer:

In [None]:
from transformers import AutoTokenizer
# Reload the saved tokenizer. This rebinds `tokenizer`, which previously held
# the raw tokenizers object, to the transformers tokenizer loaded from disk.
tokenizer = AutoTokenizer.from_pretrained("/path/to/tokenizer/")

#### For regular learning:

Load files:

In [None]:
# Train and dev splits of the ten BabyLM corpora. Both lists are derived from
# one tuple of corpus names so that every path uses the exact same
# "/path/to/" placeholder and the two splits stay aligned.
babylm_corpora = (
    "aochildes",
    "bnc_spoken",
    "cbt",
    "children_stories",
    "gutenberg",
    "open_subtitles",
    "qed",
    "simple_wikipedia",
    "switchboard",
    "wikipedia",
)

training_files = [
    f"/path/to/babylm_data/babylm_10M/{corpus}.train"
    for corpus in babylm_corpora
]

eval_files = [
    f"/path/to/babylm_data/babylm_dev/{corpus}.dev"
    for corpus in babylm_corpora
]

In [None]:
# Load the plain-text files into a "train" and a "validation" split.
regular_splits = {"train": training_files, "validation": eval_files}
raw_datasets = load_dataset("text", data_files=regular_splits)

#### For curriculum learning:

Load training data in ```streaming```-mode, so that it gets loaded progressively (quick and dirty implementation of curriculum ordering)

In [None]:
# Replaces the regular training list with the single curriculum-ordered file.
training_files = ["/path/to/babylm-curriculum/ordered_text.txt"]

The ordered text file is produced by ```sentence_scoring.ipynb```.

In [None]:
# streaming=True loads the data progressively, so the training text is
# consumed in the order in which it appears in the curriculum file.
# `eval_files` comes from the regular-learning cell, which must have been run.
curriculum_splits = {"train": training_files, "validation": eval_files}
raw_datasets = load_dataset(
    "text",
    data_files=curriculum_splits,
    streaming=True,
)

In [None]:
# Inspect the loaded splits.
raw_datasets

Tokenize the datasets in batches (each line is truncated to the context length):

In [None]:
# Maximum number of tokens kept per example; also passed to the model config.
context_length = 128

In [None]:
def tokenize(element):
    """Tokenize a batch of text lines, truncating each to the context length.

    Every string in ``element["text"]`` is encoded on its own and cut off
    after ``context_length`` tokens; the overflow is discarded
    (``return_overflowing_tokens=False``). No length filter is applied, so
    sequences shorter than ``context_length`` are kept as they are.

    Args:
        element: A batch as passed by ``map(..., batched=True)``; it must
            provide a ``"text"`` entry holding the raw strings.

    Returns:
        A dict with a single ``"input_ids"`` entry containing one list of
        token ids per input string.
    """
    outputs = tokenizer(
        element["text"],
        truncation=True,
        max_length=context_length,
        return_overflowing_tokens=False,
    )
    # All sequences are kept, so the ids can be copied over directly.
    return {"input_ids": list(outputs["input_ids"])}


# Tokenize all splits batch-wise. The original columns are dropped, so only
# the "input_ids" produced by tokenize() remain.
source_columns = raw_datasets["train"].column_names
tokenized_datasets = raw_datasets.map(
    tokenize,
    batched=True,
    remove_columns=source_columns,
)
tokenized_datasets

Initialize a new model (and specify the model architecture according to https://huggingface.co/docs/transformers/v4.30.0/en/model_doc/gpt2#transformers.GPT2Config):

In [None]:
from transformers import AutoTokenizer, GPT2LMHeadModel, AutoConfig

# Start from the stock "gpt2" configuration and override the fields below to
# get a much smaller model that matches the custom tokenizer.
# NOTE(review): the GPT2Config documentation names `n_positions` as the
# maximum sequence length; confirm that `n_ctx` still has the intended effect
# in the installed transformers version.
config = AutoConfig.from_pretrained(
    "gpt2",
    vocab_size=len(tokenizer),  # match the vocabulary of the loaded tokenizer
    n_ctx=context_length,
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
    n_embd = 128,  # embedding / hidden size
    n_layer = 2,  # number of transformer blocks
    n_head = 2,  # attention heads per block
)

Show how many parameters:

In [None]:
# Build a freshly initialised model from the config and report its size.
model = GPT2LMHeadModel(config)

# Add up the element counts of all parameter tensors.
model_size = 0
for weight in model.parameters():
    model_size += weight.numel()

print(f"GPT-2 size: {model_size/1000**2:.1f}M parameters")

In [None]:
from transformers import DataCollatorForLanguageModeling

# The tokenizer was created with BOS/EOS tokens only, so the EOS token is
# reused as the padding token.
tokenizer.pad_token = tokenizer.eos_token
# mlm=False: collate batches for causal language modeling rather than
# masked language modeling.
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

Training arguments (https://huggingface.co/docs/transformers/main_classes/trainer#transformers.TrainingArguments) can be optimized here:

In [None]:
from transformers import Trainer, TrainingArguments
from datasets import Dataset



# Training hyperparameters. With a per-device batch size of 32 and 8
# gradient-accumulation steps, one optimizer step covers 32 * 8 = 256
# sequences per device. Evaluation, logging and checkpointing all happen
# every 5,000 steps.
args = TrainingArguments(
    output_dir="toy_model_outputs",
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    evaluation_strategy="steps",
    eval_steps=5_000,
    logging_steps=5_000,
    gradient_accumulation_steps=8,
    num_train_epochs=10,
    weight_decay=0.1,
    warmup_steps=1_000,
    lr_scheduler_type="cosine",
    learning_rate=5e-4,
    save_steps=5_000,
    #use_mps_device=True, # enable when training on Mac with Apple Silicon
    # When set, max_steps takes precedence over num_train_epochs. It is needed
    # for the streamed curriculum dataset; remove it for regular training so
    # that num_train_epochs applies.
    max_steps = 44000 # enable for curriculum learning, disable for normal
)

# NOTE: this rebinds `trainer`, which held the BPE trainer in the tokenizer
# section.
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets['train'],#[:8000]['input_ids'],
    eval_dataset=tokenized_datasets['validation'],#[:2000]['input_ids'],

)

In [None]:
# Run training with the arguments configured above.
trainer.train()

In [None]:
# Save the trained model to the directory that the text-generation pipeline
# loads from. The path uses the same "/path/to/" placeholder as the rest of
# the notebook.
trainer.save_model('/path/to/babyGPTs/')

In [None]:
import pandas as pd

# Export everything the Trainer logged during training as a CSV table.
training_log = pd.DataFrame(trainer.state.log_history)
training_log.to_csv("/path/to/stats.csv")

In [None]:
import torch
from transformers import pipeline

# Load the saved model from disk into a text-generation pipeline.
pipe = pipeline("text-generation", model='/path/to/babyGPTs')

In [None]:
# Prompt that the model is asked to continue.
txt = "The lady"

In [None]:
# Generate 20 continuations of the prompt.
pipe(txt, num_return_sequences=20)