# Modeling on Lines
Creating a transformers model based on poem lines

A lot of the below is adapted from the gpt2 tutorial at https://huggingface.co/docs/transformers/v4.22.2/en/tasks/language_modeling

## Imports and Preliminaries

In [15]:
# data formatting for model
from datasets import Dataset, DatasetDict

# train/test split
from sklearn.model_selection import train_test_split

# tokenizer
from transformers import AutoTokenizer

# lm collator
from transformers import DataCollatorForLanguageModeling

# model and support
from transformers import TFAutoModelForCausalLM, create_optimizer, AdamWeightDecay

# other utilities
from itertools import chain
import os
import random

In [5]:
# set the model we are using
MODELS = [
    'gpt', # original GPT
    'distilgpt2', # 84M features
    'gpt2', # 117M features
    'gpt2-medium', # 355M features
    'gpt2-large', # 744M features
    'ctrl',
    'transformerxl',
    'reformer',
    'xlnet'
]
    
model_type = 'gpt2-medium'

In [1]:
# directories
DATA_FORMAT = 'lines'
DIR_MODEL = '../models/'
DIR_DATA = '../data/'

## Load and Format Data

In [7]:
# load data
paths = [
    os.path.join(DIR_DATA, 'shakespeare-sonnets.clean.txt'),
    os.path.join(DIR_DATA, 'browning-sonnets.clean.txt'),
    os.path.join(DIR_DATA, 'daniel-constable-sonnets.clean.txt'),
    os.path.join(DIR_DATA, 'drayton-griffin-smith-sonnet-cycles.clean.txt')
]
                 
data = []
for path in paths:
    with open(path, 'r') as f:
        data.append([line.strip() for line in f.readlines() if line.strip()])
        print(f'# of lines: {len(data[-1])}\nFirst 5: {data[-1][:5]}')

lines = list(chain(*data))
print(f'TOTAL LINES: {len(lines)}')

# of lines: 2155
First 5: ['From fairest creatures we desire increase,', 'That thereby beautys rose might never die,', 'But as the riper should by time decease,', 'His tender heir might bear his memory:', 'But thou, contracted to thine own bright eyes,']
# of lines: 617
First 5: ['\ufeff', 'I thought once how Theocritus had sung', 'Of the sweet years, the dear and wished-for years,', 'Who each one in a gracious hand appears', 'To bear a gift for mortals, old or young:']
# of lines: 2046
First 5: ['\ufeff    Wonder of these, glory of other times,', 'O thou whom envy evn is forced tadmire!', 'Great Patroness of these my humble rhymes,', 'Which thou from out thy greatness dost inspire!', 'Since only thou has deigned to raise them higher,']
# of lines: 2524
First 5: ['\ufeff', 'Into these loves who but for passion looks,', 'At this first sight here let him lay them by,', 'And seek elsewhere in turning other books,', 'Which better may his labour satisfy.']
TOTAL LINES: 7342


In [9]:
# split train and test
lines_train, lines_test = train_test_split(lines, test_size=0.05)
len(lines_train), len(lines_test)

(6974, 368)

## Cleaning and Data Preparation

In [10]:
# create Dataset and DatasetDict instances - I think this is needed for model
train_dataset = Dataset.from_dict({'text': lines_train})
test_dataset = Dataset.from_dict({'text': lines_test})
datasets = DatasetDict({'train': train_dataset, 'test': test_dataset})
datasets

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 6974
    })
    test: Dataset({
        features: ['text'],
        num_rows: 368
    })
})

In [11]:
# Create tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_type)

In [12]:
# Preprocessing function for tokenizer to use with map() method of datasetdict
def token_preproc(data):
    return tokenizer(data['text'])

In [13]:
# tokenize data
tokened_data = datasets.map(token_preproc, batched=True, num_proc=4, remove_columns=['text'])
tokened_data

      

#0:   0%|          | 0/2 [00:00<?, ?ba/s]

 

#1:   0%|          | 0/2 [00:00<?, ?ba/s]

 

#2:   0%|          | 0/2 [00:00<?, ?ba/s]

#3:   0%|          | 0/2 [00:00<?, ?ba/s]

      

#0:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#1:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#2:   0%|          | 0/1 [00:00<?, ?ba/s]

#3:   0%|          | 0/1 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 6974
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 368
    })
})

In [19]:
for _ in range(10):
    n = random.randint(0, len(tokened_data['train']))
    print(n, tokenizer.convert_ids_to_tokens(tokened_data['train'][n]['input_ids']), lines_train[n])

1198 ['Should', 'Ġin', 'Ġhis', 'Ġabsence', 'Ġbe', 'Ġto', 'Ġher', 'Ġso', 'Ġn', 'igh', '.'] Should in his absence be to her so nigh.
2779 ['My', 'self', 'Ġcorrupt', 'ing', ',', 'Ġsal', 'ving', 'Ġthy', 'Ġam', 'iss', ','] Myself corrupting, salving thy amiss,
919 ['And', 'Ġsaw', 'Ġno', 'Ġfootprint', ',', 'Ġheard', 'Ġthe', 'Ġsilence', 'Ġsink'] And saw no footprint, heard the silence sink
6117 ['I', 'Ġwill', 'Ġbe', 'Ġtrue', 'Ġdespite', 'Ġthy', 'Ġsc', 'y', 'the', 'Ġand', 'Ġthee', '.'] I will be true despite thy scythe and thee.
1134 ['So', 'Ġyou', 'Ġover', '-', 'green', 'Ġmy', 'Ġbad', ',', 'Ġmy', 'Ġgood', 'Ġallow', '?'] So you over-green my bad, my good allow?
2229 ['As', 'Ġthey', 'Ġwere', 'Ġwont', ',', 'Ġbut', 'Ġwhen', 'Ġthey', 'Ġhear', 'Ġme', 'Ġcry'] As they were wont, but when they hear me cry
3629 ['Where', 'Ġmost', 'ĠI', 'Ġlost', ',', 'Ġthere', 'Ġmost', 'Ġof', 'Ġall', 'ĠI', 'Ġwon', ';'] Where most I lost, there most of all I won;
1193 ['Her', 'Ġeyes', 'Ġrevive', 'Ġdecaying', 'Ġlife', 'Ġi

In [20]:
# pad encodings and prep for modeling
collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False, return_tensors='tf')
collator

DataCollatorForLanguageModeling(tokenizer=PreTrainedTokenizerFast(name_or_path='gpt2-medium', vocab_size=50257, model_max_len=1024, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>'}), mlm=False, mlm_probability=0.15, pad_to_multiple_of=None, tf_experimental_compile=False, return_tensors='tf')

## Modeling

In [21]:
# instantiate model
model_path = os.path.join(DIR_MODEL, f'{model_type}-{MODEL_FORMAT}')

if not os.path.exists(model_path):
    model = TFAutoModelForCausalLM.from_pretrained(model_type, pad_token_id = tokenizer.eos_token_id)
else:
    model = TFAutoModelForCausalLM.from_pretrained(model_path)

All model checkpoint layers were used when initializing TFGPT2LMHeadModel.

All the layers of TFGPT2LMHeadModel were initialized from the model checkpoint at ../models/gpt2-medium.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


In [22]:
# convert data to special format for tf model
tokenizer.pad_token = tokenizer.eos_token
tf_train_set = model.prepare_tf_dataset(tokened_data['train'], shuffle=True, batch_size=16, collate_fn=collator)
tf_test_set = model.prepare_tf_dataset(tokened_data['test'], shuffle=False, batch_size=16, collate_fn=collator)
tf_train_set

You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


<PrefetchDataset element_spec=({'input_ids': TensorSpec(shape=(16, None), dtype=tf.int64, name=None), 'attention_mask': TensorSpec(shape=(16, None), dtype=tf.int64, name=None)}, TensorSpec(shape=(16, None), dtype=tf.int64, name=None))>

In [23]:
# compile model
optimizer = AdamWeightDecay(learning_rate=2e-5, weight_decay_rate=0.01)
model.compile(optimizer=optimizer)

No loss specified in compile() - the model's internal loss computation will be used as the loss. Don't panic - this is a common way to train TensorFlow models in Transformers! To disable this behaviour please pass a loss argument, or explicitly pass `loss=None` if you do not want your model to compute a loss.


In [24]:
# fit model (if pretrained does not exist)
if not os.path.exists(model_path):
    model.fit(tf_train_set, validation_data=tf_test_set, epochs=4)

In [25]:
# save model
if not os.path.exists(model_path):
    os.makedirs(model_path)
    model.save_pretrained(model_path)

## Test Model

In [29]:
# function to get predicted text
def test(text, max_length=50, temp=1, top_k=100):
    tokened = tokenizer(text, return_tensors='np')
    output = model.generate(**tokened, max_length=max_length, temperature=temp, top_k=top_k, repetition_penalty=1.5)
    return tokenizer.decode(output[0], skip_special_tokens=True)

In [30]:
test_lines = [
    'Tomorrow I will',
    'Yesterday we were',
    'For naught I may',
    'My love has been',
    'I am',
    'Thou art',
    'The little love-god lying once asleep',
    'In loving thee thou'
]

for line in test_lines:
    print(f'Original: {line}\nOutput: {test(line, temp=0.5, max_length=100)}\n')

Original: Tomorrow I will
Output: Tomorrow I will not sleep, but must die.  And yet my love is so strong;--and still it is! O well then: thou art a saint!—O beauteous one!--I sing,—Ah me! how oft, and why —why? Ah, ah yes, what to say, when asked... Dear friend of mine, tell us more than ever you know, or can think (or see) ; for this we both need, dear boy, speak truth

Original: Yesterday we were
Output: Yesterday we were both young, and yet so old.  What a difference! I am thine; you are my love:--and then the story begins!--I was born in Fidessa's landscape Gardens Conservatory Gymnasium Building, where you live today (now) Sitting by me on grassy hillside Plain Backgrounded with green, I stand gazing, You look away from us Both looking at each other Looking back to one another, now gone Now past, never seen Before

Original: For naught I may
Output: For naught I may love thee, but that thou mightst know.  For this reason my heart is in strife; and yet I am not free:--and thus it l