# Tranformer Modeling with GPT2

Creating a model based on various criteria

A lot of the below is adapted from the gpt2 tutorial at https://huggingface.co/docs/transformers/v4.22.2/en/tasks/language_modeling

## Imports and Preliminaries

In [1]:
# data formatting for model
from datasets import Dataset, DatasetDict

# train/test split
from sklearn.model_selection import train_test_split

# tokenizer
from transformers import AutoTokenizer

# lm collator
from transformers import DataCollatorForLanguageModeling

# model and support
from transformers import TFAutoModelForCausalLM, create_optimizer, AdamWeightDecay

# other utilities
from itertools import chain
import os
import random
import re

In [2]:
# set the model we are using
MODELS = [
    'gpt', # original GPT
    'distilgpt2', # 84M features
    'gpt2', # 117M features
    'gpt2-medium', # 355M features
    'gpt2-large', # 744M features
    'ctrl',
    'transformerxl',
    'reformer',
    'xlnet'
]
    
model_type = 'gpt2'

In [3]:
# directories
MODEL_NAME = 'multi'
N_EPOCHS = 8
DIR_MODEL = '../models/'
DIR_DATA = '../data/'

In [4]:
# regexes
RE_SENTENCE = re.compile(r'\w.*?[.?!:;]', re.S)
RE_WHITESPACE = re.compile(r'\s+')
RE_BLANKLINE = re.compile(r'\n\n')

In [5]:
# other special constants
EOLINE_TOKEN = '<|eol|>'
EOSENTENCE_TOKEN = '<|eos|>'

## Load and Format Data

In [6]:
# load data
paths = [
    os.path.join(DIR_DATA, 'shakespeare-sonnets.clean.txt'),
    os.path.join(DIR_DATA, 'browning-sonnets.clean.txt'),
    os.path.join(DIR_DATA, 'daniel-constable-sonnets.clean.txt'),
    os.path.join(DIR_DATA, 'drayton-griffin-smith-sonnet-cycles.clean.txt'),
    os.path.join(DIR_DATA, 'farjeon-sonnets.clean.txt'),
    os.path.join(DIR_DATA, 'lovell-southey-sonnets.clean.txt')
]

text = list()

for path in paths:
    with open(path, 'r') as f:
        text.append([line.strip() for line in f.readlines()])

text = '\n'.join(chain(*text))

In [7]:
# Split into relevant subsets
# POEMS
poems = RE_BLANKLINE.split(text)

# LINES
lines = [line.strip() for line in text.split('\n')]

# SENTENCES
sentences = RE_SENTENCE.findall(text)
sentences = [RE_WHITESPACE.sub(' ', sentence) for sentence in sentences]

print(f'# Poems: {len(poems)}\n# Sentences: {len(sentences)}\n# Lines: {len(lines)}\n# Chars: {len(text)}')

# Poems: 585
# Sentences: 3836
# Lines: 8419
# Chars: 338875


In [8]:
# split train and test
lines_train, lines_test = train_test_split(sentences, test_size=0.05)
len(lines_train), len(lines_test)

(3644, 192)

## Cleaning and Data Preparation

In [9]:
# create Dataset and DatasetDict instances - I think this is needed for model
train_dataset = Dataset.from_dict({'text': lines_train})
test_dataset = Dataset.from_dict({'text': lines_test})
datasets = DatasetDict({'train': train_dataset, 'test': test_dataset})
datasets

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 3644
    })
    test: Dataset({
        features: ['text'],
        num_rows: 192
    })
})

In [10]:
# Create tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_type)

In [11]:
# Preprocessing function for tokenizer to use with map() method of datasetdict
def token_preproc(data):
    return tokenizer(data['text'])

In [12]:
# tokenize data
tokened_data = datasets.map(token_preproc, batched=True, num_proc=4, remove_columns=['text'])
tokened_data

      

#0:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#1:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#2:   0%|          | 0/1 [00:00<?, ?ba/s]

#3:   0%|          | 0/1 [00:00<?, ?ba/s]

      

#0:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#1:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#2:   0%|          | 0/1 [00:00<?, ?ba/s]

#3:   0%|          | 0/1 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 3644
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 192
    })
})

In [13]:
for _ in range(4):
    n = random.randint(0, len(tokened_data['train']))
    print(n, tokenizer.convert_ids_to_tokens(tokened_data['train'][n]['input_ids']), lines_train[n])

3478 ['And', 'Ġrather', 'Ġmake', 'Ġthem', 'Ġborn', 'Ġto', 'Ġour', 'Ġdesire', 'ĠThan', 'Ġthink', 'Ġthat', 'Ġwe', 'Ġbefore', 'Ġhave', 'Ġheard', 'Ġthem', 'Ġtold', '.'] And rather make them born to our desire Than think that we before have heard them told.
1480 ['All', 'Ġthis', 'Ġmy', 'Ġheart', 'Ġfrom', 'Ġlove', 'Ġcan', 'Ġnever', 'Ġmove', '.'] All this my heart from love can never move.
693 ['O', '!'] O!
1482 ['Oh', ',', 'Ġyes', '!'] Oh, yes!


In [14]:
# pad encodings and prep for modeling
collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False, return_tensors='tf')
collator

DataCollatorForLanguageModeling(tokenizer=PreTrainedTokenizerFast(name_or_path='gpt2', vocab_size=50257, model_max_len=1024, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>'}), mlm=False, mlm_probability=0.15, pad_to_multiple_of=None, tf_experimental_compile=False, return_tensors='tf')

## Modeling

In [15]:
# instantiate model
model_path = os.path.join(DIR_MODEL, f'{model_type}.{MODEL_NAME}.{str(N_EPOCHS)}')

if not os.path.exists(model_path):
    model = TFAutoModelForCausalLM.from_pretrained(model_type, pad_token_id = tokenizer.eos_token_id)
else:
    model = TFAutoModelForCausalLM.from_pretrained(model_path)
    
#model.resize_token_embeddings(len(tokenizer))

All model checkpoint layers were used when initializing TFGPT2LMHeadModel.

All the layers of TFGPT2LMHeadModel were initialized from the model checkpoint at gpt2.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


In [16]:
# convert data to special format for tf model
tokenizer.pad_token = tokenizer.eos_token
tf_train_set = model.prepare_tf_dataset(tokened_data['train'], shuffle=True, batch_size=32, collate_fn=collator)
tf_test_set = model.prepare_tf_dataset(tokened_data['test'], shuffle=False, batch_size=32, collate_fn=collator)
tf_train_set

You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


<PrefetchDataset element_spec=({'input_ids': TensorSpec(shape=(32, None), dtype=tf.int64, name=None), 'attention_mask': TensorSpec(shape=(32, None), dtype=tf.int64, name=None)}, TensorSpec(shape=(32, None), dtype=tf.int64, name=None))>

In [17]:
# compile model
optimizer = AdamWeightDecay(learning_rate=2e-5, weight_decay_rate=0.01)
model.compile(optimizer=optimizer)

No loss specified in compile() - the model's internal loss computation will be used as the loss. Don't panic - this is a common way to train TensorFlow models in Transformers! To disable this behaviour please pass a loss argument, or explicitly pass `loss=None` if you do not want your model to compute a loss.


In [18]:
# fit model (if pretrained does not exist)
if not os.path.exists(model_path):
    model.fit(tf_train_set, validation_data=tf_test_set, epochs=N_EPOCHS)

Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


In [22]:
# save model
if not os.path.exists(model_path):
    os.makedirs(model_path)
    model.save_pretrained(model_path)

## Test Model

In [23]:
# function to get predicted text
def test(text, max_new=50, temp=1, top_k=50, rep_penalty=1.5, len_penalty=0.75, n_seq=1):
    tokened = tokenizer(text, return_tensors='tf')
    output = model.generate(**tokened,
                            do_sample=True,
                            max_new_tokens=max_new, 
                            temperature=temp, 
                            top_k=top_k, 
                            repetition_penalty=rep_penalty,
                            length_penalty=len_penalty,
                            num_return_sequences=n_seq)
    return tokenizer.decode(output[0], skip_special_tokens=False)

In [21]:
test_lines = [lines[random.randint(0,len(lines) - 1)] for _ in range(10)]

for line in test_lines:
    print(f'Original: {line}')
    output = test(line,
                  temp=0.5,
                  max_new=100,
                  top_k=200,
                  rep_penalty=1.5,
                  len_penalty=0.75,
                  n_seq=1)
    print(f'Output: {output}\n')

Original: Whilst we both make the world admire at us,
Output: Whilst we both make the world admire at us, Which is of such worth as never was made before. For thy beauty thou must be remembered; for if it should remain so long In a dull state then grace would not suffice To show our beautys youth to maturity: And thus my verse doth tell The most perfect truth that ever I saw! So in this way shall you find your true love express—I call on thee now and speak unto me again When these two truths come together That no one knows better than myself do Know how they are

Original: By granting me thy favour to obtain.
Output: By granting me thy favour to obtain. Thou art my friend, and I am his guest; And when thou hast loved him so much the world can see My love in this way is not only made stronger by thee but more! When these two are reconciled they will be like unto one another: For it doth seem that truth gives true strength To prove what all men say about us? But how should we know if he 

InvalidArgumentError: cannot compute ConcatV2 as input #1(zero-based) was expected to be a float tensor but is a int32 tensor [Op:ConcatV2] name: concat