# GPT2 Language Model Training with Texts from Shakespeare
[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/fcakyon/gpt2-shakespeare/blob/main/gpt2-shakespeare.ipynb)

## 0. Install requirements

!pip install -U transformers datasets torch sentencepiece

## 1. Initialize Model and Tokenizer

- Import required modules:

In [1]:
import torch
import math
from transformers import GPT2Tokenizer, GPT2LMHeadModel, HfArgumentParser, TrainingArguments, Trainer, default_data_collator
from datasets import load_dataset

- Initialize a GPT2 model with a language modelling head:

In [2]:
 model = GPT2LMHeadModel.from_pretrained('sshleifer/tiny-gpt2')

- Initialize GPT2 tokenizer:

In [3]:
tokenizer = GPT2Tokenizer.from_pretrained('sshleifer/tiny-gpt2')

In [4]:
model.resize_token_embeddings(len(tokenizer))

Embedding(50257, 2)

## 2. Initialize Dataset

To fine-tune the dataset, we will need to run a utility (run_language_modeling.py) provided in the Hugging Face library. Load this utility using the following command:

In [5]:
# Downloading and loading a dataset from the hub.
datasets = load_dataset("tiny_shakespeare", cache_dir="lm_dataset/")
if "validation" not in datasets.keys():
    datasets["validation"] = load_dataset(
        data_args.dataset_name,
        data_args.dataset_config_name,
        split=f"train[:{data_args.validation_split_percentage}%]",
        cache_dir=model_args.cache_dir,
    )
    datasets["train"] = load_dataset(
        data_args.dataset_name,
        data_args.dataset_config_name,
        split=f"train[{data_args.validation_split_percentage}%:]",
        cache_dir=model_args.cache_dir,
    )

Using custom data configuration default


Downloading and preparing dataset tiny_shakespeare/default (download: 1.06 MiB, generated: 1.06 MiB, post-processed: Unknown size, total: 2.13 MiB) to lm_dataset/tiny_shakespeare/default/1.0.0/b5b13969f09fe8707337f6cb296314fbe06960bd9a868dca39e713e163d27b5e...


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=435071.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0), HTML(value=''…

HBox(children=(FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0), HTML(value=''…

HBox(children=(FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0), HTML(value=''…

Dataset tiny_shakespeare downloaded and prepared to lm_dataset/tiny_shakespeare/default/1.0.0/b5b13969f09fe8707337f6cb296314fbe06960bd9a868dca39e713e163d27b5e. Subsequent calls will reuse this data.


In [6]:
# Preprocessing the datasets.
# First we tokenize all the texts.
column_names = datasets["train"].column_names
text_column_name = "text" if "text" in column_names else column_names[0]

In [7]:
def tokenize_function(examples):
    # truncate dataset with max accepted size of the model
    output = tokenizer(examples[text_column_name])
    return output

# tokenize dataset
tokenized_datasets = datasets.map(
    tokenize_function,
    batched=True,
    remove_columns=column_names,
    desc="Running tokenizer on dataset",
)

HBox(children=(FloatProgress(value=0.0, description='Running tokenizer on dataset', max=1.0, style=ProgressSty…

Token indices sequence length is longer than the specified maximum sequence length for this model (301966 > 1024). Running this sequence through the model will result in indexing errors





HBox(children=(FloatProgress(value=0.0, description='Running tokenizer on dataset', max=1.0, style=ProgressSty…




HBox(children=(FloatProgress(value=0.0, description='Running tokenizer on dataset', max=1.0, style=ProgressSty…




In [8]:
# get block size (max input length of the model)
block_size = tokenizer.model_max_length
if block_size > 1024:
    block_size = 1024
    
def group_texts(examples):
    # Concatenate all texts.
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
    # customize this part to your needs.
    total_length = (total_length // block_size) * block_size
    # Split by chunks of max_len.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result

# split total dataset into smaller sets of length block_size
lm_datasets = tokenized_datasets.map(
    group_texts,
    batched=True,
    desc=f"Grouping texts in chunks of {block_size}",
)

HBox(children=(FloatProgress(value=0.0, description='Grouping texts in chunks of 1024', max=1.0, style=Progres…




HBox(children=(FloatProgress(value=0.0, description='Grouping texts in chunks of 1024', max=1.0, style=Progres…




HBox(children=(FloatProgress(value=0.0, description='Grouping texts in chunks of 1024', max=1.0, style=Progres…




In [9]:
train_dataset = lm_datasets["train"]
eval_dataset = lm_datasets["validation"]

## 3. Initialize Trainer

In [10]:
training_args = TrainingArguments(output_dir = "output/")

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    # Data collator will default to DataCollatorWithPadding, so we change it.
    data_collator=default_data_collator,
)

In [11]:
# perform training
train_result = trainer.train()

# saves the tokenizer
trainer.save_model()

# save training metrics
trainer.log_metrics("train", metrics)
trainer.save_metrics("train", metrics)

# save training state
trainer.save_state()

RuntimeError: CUDA out of memory. Tried to allocate 1.53 GiB (GPU 0; 5.80 GiB total capacity; 3.48 GiB already allocated; 522.62 MiB free; 3.48 GiB reserved in total by PyTorch)

In [None]:
# perform evaluation over validation data
metrics = trainer.evaluate()

# calculate perplexity
try:
    perplexity = math.exp(metrics["eval_loss"])
except OverflowError:
    perplexity = float("inf")
    
# save perplexity
metrics["perplexity"] = perplexity
trainer.log_metrics("eval", metrics)
trainer.save_metrics("eval", metrics)