# Modeling on Sentences

Creating a model based on sentences

A lot of the below is adapted from the gpt2 tutorial at https://huggingface.co/docs/transformers/v4.22.2/en/tasks/language_modeling

## Imports and Preliminaries

In [1]:
# data formatting for model
from datasets import Dataset, DatasetDict

# train/test split
from sklearn.model_selection import train_test_split

# tokenizer
from transformers import AutoTokenizer

# lm collator
from transformers import DataCollatorForLanguageModeling

# model and support
from transformers import TFAutoModelForCausalLM, create_optimizer, AdamWeightDecay

# other utilities
from itertools import chain
import os
import random
import re

In [2]:
# set the model we are using
MODELS = [
    'gpt', # original GPT
    'distilgpt2', # 84M features
    'gpt2', # 117M features
    'gpt2-medium', # 355M features
    'gpt2-large', # 744M features
    'ctrl',
    'transformerxl',
    'reformer',
    'xlnet'
]
    
model_type = 'gpt2'

In [3]:
# directories
MODEL_FORMAT = 'sentences'
DIR_MODEL = '../models/'
DIR_DATA = '../data/'

In [4]:
# regexes
RE_SENTENCE = re.compile(r'\w.*?[.?!]', re.S)
RE_WHITESPACE = re.compile(r'\s+')

## Load and Format Data

In [5]:
# load data
paths = [
    os.path.join(DIR_DATA, 'shakespeare-sonnets.clean.txt'),
    os.path.join(DIR_DATA, 'browning-sonnets.clean.txt'),
    os.path.join(DIR_DATA, 'daniel-constable-sonnets.clean.txt'),
    os.path.join(DIR_DATA, 'drayton-griffin-smith-sonnet-cycles.clean.txt'),
    os.path.join(DIR_DATA, 'farjeon-sonnets.clean.txt'),
    os.path.join(DIR_DATA, 'lovell-southey-sonnets.clean.txt')
    #os.path.join(DIR_DATA, 'shakespeareplays.txt')
]
                 
for path in paths:
    with open(path, 'r') as f:
        fulltext = f.read()

lines = RE_SENTENCE.findall(fulltext)
lines = [RE_WHITESPACE.sub(' ', line) for line in lines]
fulltext[:1000], lines[:8]

('\ufeff      ARISTE! soon to sojourn with the crowd,\n        In soul abstracted must thy minstrel go;\n        Mix in the giddy, fond, fantastic show,\n      Mix with the gay, the envious, and the proud.\n      I go: but still my soul remains with thee,\n        Still will the eye of fancy paint thy charms,\n      Still, lovely Maid, thy imaged form I see,\n        And every pulse will vibrate with alarms.\n      When scandal spreads abroad her odious tale,\n        When envy at a rivals beauty sighs,\n      When rancour prompts the female tongue to rail,\n        And rage and malice fire the gamesters eyes,\n      I turn my wearied soul to her for ease,\n    Who only names to praise, who only speaks to please.\n\n\n    Be his to court the Muse, whose humble breast\n      The glow of genius never could inspire;\n    Who never, by the future song possest,\n      Struck the bold strings, and waked the daring lyre.\n    Let him invoke the Muses from their grove,\n    Who never felt the 

In [6]:
# split train and test
lines_train, lines_test = train_test_split(lines, test_size=0.05)
len(lines_train), len(lines_test)

(57, 3)

## Cleaning and Data Preparation

In [7]:
# create Dataset and DatasetDict instances - I think this is needed for model
train_dataset = Dataset.from_dict({'text': lines_train})
test_dataset = Dataset.from_dict({'text': lines_test})
datasets = DatasetDict({'train': train_dataset, 'test': test_dataset})
datasets

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 57
    })
    test: Dataset({
        features: ['text'],
        num_rows: 3
    })
})

In [8]:
# Create tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_type)

In [9]:
# Preprocessing function for tokenizer to use with map() method of datasetdict
def token_preproc(data):
    return tokenizer(data['text'])

In [10]:
# tokenize data
tokened_data = datasets.map(token_preproc, batched=True, num_proc=4, remove_columns=['text'])
tokened_data

      

#0:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#1:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#2:   0%|          | 0/1 [00:00<?, ?ba/s]

#3:   0%|          | 0/1 [00:00<?, ?ba/s]

num_proc must be <= 3. Reducing num_proc to 3 for dataset of size 3.


     

#0:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#1:   0%|          | 0/1 [00:00<?, ?ba/s]

#2:   0%|          | 0/1 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 57
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 3
    })
})

In [11]:
for _ in range(10):
    n = random.randint(0, len(tokened_data['train']))
    print(n, tokenizer.convert_ids_to_tokens(tokened_data['train'][n]['input_ids']), lines_train[n])

32 ['was', 'Ġthat', 'Ġpoor', 'Ġwand', 'erers', 'Ġpride', '!'] was that poor wanderers pride!
32 ['was', 'Ġthat', 'Ġpoor', 'Ġwand', 'erers', 'Ġpride', '!'] was that poor wanderers pride!
31 ['Ill', 'Ġcourt', 'Ġthy', 'Ġlone', 'Ġbow', 'r', ',', 'ĠSens', 'ibility', '!'] Ill court thy lone bowr, Sensibility!
49 ['O', ',', 'Ġlost', 'Ġto', 'Ġlove', 'Ġand', 'Ġtruth', '!'] O, lost to love and truth!
37 ['Be', 'Ġhis', 'Ġto', 'Ġcourt', 'Ġthe', 'ĠMuse', ',', 'Ġwhose', 'Ġhumble', 'Ġbreast', 'ĠThe', 'Ġglow', 'Ġof', 'Ġgenius', 'Ġnever', 'Ġcould', 'Ġinspire', ';', 'ĠWho', 'Ġnever', ',', 'Ġby', 'Ġthe', 'Ġfuture', 'Ġsong', 'Ġposs', 'est', ',', 'ĠSt', 'ruck', 'Ġthe', 'Ġbold', 'Ġstrings', ',', 'Ġand', 'Ġw', 'aked', 'Ġthe', 'Ġdaring', 'Ġly', 're', '.'] Be his to court the Muse, whose humble breast The glow of genius never could inspire; Who never, by the future song possest, Struck the bold strings, and waked the daring lyre.
17 ['As', 'Ġslow', 'Ġand', 'Ġsolemn', 'Ġy', 'onder', 'Ġdeepening', 'Ġkn', 'ell', 

In [12]:
# pad encodings and prep for modeling
collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False, return_tensors='tf')
collator

DataCollatorForLanguageModeling(tokenizer=PreTrainedTokenizerFast(name_or_path='gpt2', vocab_size=50257, model_max_len=1024, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>'}), mlm=False, mlm_probability=0.15, pad_to_multiple_of=None, tf_experimental_compile=False, return_tensors='tf')

## Modeling

In [13]:
# instantiate model
model_path = os.path.join(DIR_MODEL, f'{model_type}.{MODEL_FORMAT}')

if not os.path.exists(model_path):
    model = TFAutoModelForCausalLM.from_pretrained(model_type, pad_token_id = tokenizer.eos_token_id)
else:
    model = TFAutoModelForCausalLM.from_pretrained(model_path)

All model checkpoint layers were used when initializing TFGPT2LMHeadModel.

All the layers of TFGPT2LMHeadModel were initialized from the model checkpoint at ../models/gpt2.sentences.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


In [14]:
# convert data to special format for tf model
tokenizer.pad_token = tokenizer.eos_token
tf_train_set = model.prepare_tf_dataset(tokened_data['train'], shuffle=True, batch_size=16, collate_fn=collator)
tf_test_set = model.prepare_tf_dataset(tokened_data['test'], shuffle=False, batch_size=16, collate_fn=collator)
tf_train_set

You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


<PrefetchDataset element_spec=({'input_ids': TensorSpec(shape=(16, None), dtype=tf.int64, name=None), 'attention_mask': TensorSpec(shape=(16, None), dtype=tf.int64, name=None)}, TensorSpec(shape=(16, None), dtype=tf.int64, name=None))>

In [15]:
# compile model
optimizer = AdamWeightDecay(learning_rate=2e-5, weight_decay_rate=0.01)
model.compile(optimizer=optimizer)

No loss specified in compile() - the model's internal loss computation will be used as the loss. Don't panic - this is a common way to train TensorFlow models in Transformers! To disable this behaviour please pass a loss argument, or explicitly pass `loss=None` if you do not want your model to compute a loss.


In [16]:
# fit model (if pretrained does not exist)
if not os.path.exists(model_path):
    model.fit(tf_train_set, validation_data=tf_test_set, epochs=8)

In [17]:
# save model
if not os.path.exists(model_path):
    os.makedirs(model_path)
    model.save_pretrained(model_path)

## Test Model

In [30]:
# function to get predicted text
def test(text, max_new=50, temp=1, top_k=50, rep_penalty=1.5, len_penalty=0.75, n_seq=1):
    tokened = tokenizer(text, return_tensors='tf')
    output = model.generate(**tokened,
                            do_sample=True,
                            max_new_tokens=max_new, 
                            temperature=temp, 
                            top_k=top_k, 
                            repetition_penalty=rep_penalty,
                            length_penalty=len_penalty,
                            num_return_sequences=n_seq)
    print(output)
    return tokenizer.decode(output[0], skip_special_tokens=True)

In [29]:
test_lines = [
    'Tomorrow I will',
    'Tomorrow',
    'Tomorrow I will',
    'Yesterday we were',
    'For naught I may',
    'My love has been',
    'I am',
    'Thou art',
    'The little love-god lying once asleep',
    'In loving thee thou'
]

for line in test_lines:
    output = test(line,
                  temp=0.5,
                  max_new=100,
                  top_k=200,
                  rep_penalty=1.5,
                  len_penalty=0.75,
                  n_seq=1)
    print(f'Original: {line}\nOutput: {output}\n')

Original: Tomorrow I will
Output: Tomorrow I will remember the glory of a world where men have made their home, and now they are scattered abroad. And how shall you please to know that my love is not lost when we live?
The soul has never been so hard on her breast as in its bosom; yet it always seeks after reason with all speed: It does nothing but wait for an opportunity which may bring nearer our sight! But if thy beauty be gone away from thee at last she must return again,—she sighs over this

Original: Tomorrow
Output: Tomorrow, my lord! I will not leave thee alone. Yet thou art so bright that withers the night; but thy eye still remains on me: and when we meet again our voice shall be heard in all its majesty—the sound of a man's heart doth turn to mourn over us through grief."
"If you wish," said he as they passed by him at last upon his way home from work,— "I take care now only for your sake.—But what is it then

Original: Tomorrow I will
Output: Tomorrow I will not speak, nor 