Test notebook to get the hang of using OpenAI's Transformers package, including GPT2

In [1]:
# A lot of the below is adapted from the gpt2 tutorial:
# https://huggingface.co/docs/transformers/v4.22.2/en/tasks/language_modeling

## Imports and Preliminaries

In [19]:
# data formatting for model
from datasets import Dataset, DatasetDict

# train/test split
from sklearn.model_selection import train_test_split

# tokenizer
from transformers import AutoTokenizer

# lm collator
from transformers import DataCollatorForLanguageModeling

# model and support
from transformers import TFAutoModelForCausalLM, create_optimizer, AdamWeightDecay

# other utilities
from itertools import chain
import os

In [3]:
# set the model we are using
MODELS = [
    'gpt',
    'gpt2',
    'ctrl',
    'transformerxl',
    'reformer',
    'xlnet'
]
    
model_type = 'gpt2-medium'

In [18]:
# directories
MODEL_DIR = './models/'

## Load and Format Data

In [4]:
# load data
paths = [
    './shakespeare-sonnets.clean.txt',
    './browning-sonnets.clean.txt',
    './daniel-constable-sonnets.clean.txt',
    './drayton-griffin-smith-sonnet-cycles.clean.txt',
    './seward-sonnets-and-odes.stripped.split.txt'
]
data = []
for path in paths:
    with open(path, 'r') as f:
        data.append([line.strip() for line in f.readlines() if line.strip()])
        print(f'# of lines: {len(data[-1])}\nFirst 5: {data[-1][:5]}')

lines = list(chain(*data))
print(f'TOTAL LINES: {len(lines)}')

# of lines: 2155
First 5: ['From fairest creatures we desire increase,', 'That thereby beautys rose might never die,', 'But as the riper should by time decease,', 'His tender heir might bear his memory:', 'But thou, contracted to thine own bright eyes,']
# of lines: 617
First 5: ['\ufeff', 'I thought once how Theocritus had sung', 'Of the sweet years, the dear and wished-for years,', 'Who each one in a gracious hand appears', 'To bear a gift for mortals, old or young:']
# of lines: 2046
First 5: ['\ufeff    Wonder of these, glory of other times,', 'O thou whom envy evn is forced tadmire!', 'Great Patroness of these my humble rhymes,', 'Which thou from out thy greatness dost inspire!', 'Since only thou has deigned to raise them higher,']
# of lines: 2524
First 5: ['\ufeff', 'Into these loves who but for passion looks,', 'At this first sight here let him lay them by,', 'And seek elsewhere in turning other books,', 'Which better may his labour satisfy.']
# of lines: 2579
First 5: ['\ufeff

In [5]:
# various functions to help with preprocessing

# combine 2 lines into 1
def double_lines(lines):
    return ['\n'.join(double) for double in zip(lines[0::2], lines[1::2])]

# return all lines concatenated
def full_text(lines):
    return '\n'.join(lines)

In [6]:
# split train and test
lines_train, lines_test = train_test_split(lines, test_size=0.05)
len(lines_train), len(lines_test)

(9424, 497)

## Cleaning and Data Preparation

In [7]:
# create Dataset and DatasetDict instances - I think this is needed for model
train_dataset = Dataset.from_dict({'text': lines_train})
test_dataset = Dataset.from_dict({'text': lines_test})
datasets = DatasetDict({'train': train_dataset, 'test': test_dataset})
datasets

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 9424
    })
    test: Dataset({
        features: ['text'],
        num_rows: 497
    })
})

In [8]:
# Create tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_type)

In [9]:
# Preprocessing function for tokenizer to use with map() method of datasetdict
def token_preproc(data):
    return tokenizer(data['text'])

In [10]:
# tokenize data
tokened_data = datasets.map(token_preproc, batched=True, num_proc=4, remove_columns=['text'])
tokened_data

      

#0:   0%|          | 0/3 [00:00<?, ?ba/s]

 

#1:   0%|          | 0/3 [00:00<?, ?ba/s]

 

#2:   0%|          | 0/3 [00:00<?, ?ba/s]

#3:   0%|          | 0/3 [00:00<?, ?ba/s]

      

#0:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#1:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#2:   0%|          | 0/1 [00:00<?, ?ba/s]

#3:   0%|          | 0/1 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 9424
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 497
    })
})

In [11]:
# pad encodings and prep for modeling
collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False, return_tensors='tf')
collator

DataCollatorForLanguageModeling(tokenizer=PreTrainedTokenizerFast(name_or_path='gpt2-medium', vocab_size=50257, model_max_len=1024, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>'}), mlm=False, mlm_probability=0.15, pad_to_multiple_of=None, tf_experimental_compile=False, return_tensors='tf')

## Modeling

In [12]:
# instantiate model
model = TFAutoModelForCausalLM.from_pretrained(model_type, pad_token_id = tokenizer.eos_token_id)

All model checkpoint layers were used when initializing TFGPT2LMHeadModel.

All the layers of TFGPT2LMHeadModel were initialized from the model checkpoint at gpt2-medium.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


In [13]:
# convert data to special format for tf model
tokenizer.pad_token = tokenizer.eos_token
tf_train_set = model.prepare_tf_dataset(tokened_data['train'], shuffle=True, batch_size=16, collate_fn=collator)
tf_test_set = model.prepare_tf_dataset(tokened_data['test'], shuffle=False, batch_size=16, collate_fn=collator)
tf_train_set

You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


<PrefetchDataset element_spec=({'input_ids': TensorSpec(shape=(16, None), dtype=tf.int64, name=None), 'attention_mask': TensorSpec(shape=(16, None), dtype=tf.int64, name=None)}, TensorSpec(shape=(16, None), dtype=tf.int64, name=None))>

In [14]:
# compile model
optimizer = AdamWeightDecay(learning_rate=2e-5, weight_decay_rate=0.01)
model.compile(optimizer=optimizer)

No loss specified in compile() - the model's internal loss computation will be used as the loss. Don't panic - this is a common way to train TensorFlow models in Transformers! To disable this behaviour please pass a loss argument, or explicitly pass `loss=None` if you do not want your model to compute a loss.


In [15]:
# fit model
model.fit(tf_train_set, validation_data=tf_test_set, epochs=4)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<keras.callbacks.History at 0x1699c5cc0>

In [20]:
# save model
if not os.path.exists(MODEL_DIR + model_type):
    os.makedirs(MODEL_DIR + model_type)

model.save_pretrained(MODEL_DIR + model_type)

## Test Model

In [16]:
# function to get predicted text
def test(text, max_length=50, temp=1, top_k=100):
    tokened = tokenizer(text, return_tensors='np')
    output = model.generate(**tokened, max_length=max_length, temperature=temp, top_k=top_k, repetition_penalty=1.5)
    return tokenizer.decode(output[0], skip_special_tokens=True)

In [17]:
test_lines = [
    'Tomorrow I will',
    'Yesterday we were',
    'For naught I may',
    'My love has been',
    'I am',
    'Thou art',
    'The little love-god lying once asleep',
    'In loving thee thou'
]

for line in test_lines:
    print(f'Original: {line}\nOutput: {test(line)}\n')

Original: Tomorrow I will
Output: Tomorrow I will not sleep, but must die.  And yet my love is so strong;--and still it is! O well then: thou art a saint!—O beauteous one!--I sing,—Ah me! how oft

Original: Yesterday we were
Output: Yesterday we were both young, and yet so old.  What a difference! I am thine; you are my love:--and then the story begins!--I was born in Fidessa's landscape Gardens Conservatory Gymnasium Building

Original: For naught I may
Output: For naught I may love thee, but that thou mightst know.  For this reason my heart is in strife; and yet I am not free:--and thus it lies. O! how much more doth she please me than you

Original: My love has been
Output: My love has been a fire, and my flame hath not died.  And now I know it is cold; yea colder than snow!  Then did I feel the chill of night: then was mine eye dry—now doth seem white.—

Original: I am
Output: I am not a man, but an angel.  And yet I love thee with such zeal;--and so do you: and thus we both!—so it