# Modeling on Sentences

Creating a model based on sentences

A lot of the below is adapted from the gpt2 tutorial at https://huggingface.co/docs/transformers/v4.22.2/en/tasks/language_modeling

## Imports and Preliminaries

In [1]:
# data formatting for model
from datasets import Dataset, DatasetDict

# train/test split
from sklearn.model_selection import train_test_split

# tokenizer
from transformers import AutoTokenizer

# lm collator
from transformers import DataCollatorForLanguageModeling

# model and support
from transformers import TFAutoModelForCausalLM, create_optimizer, AdamWeightDecay

# other utilities
from itertools import chain
import os
import random
import re

In [2]:
# set the model we are using
MODELS = [
    'gpt', # original GPT
    'distilgpt2', # 84M features
    'gpt2', # 117M features
    'gpt2-medium', # 355M features
    'gpt2-large', # 744M features
    'ctrl',
    'transformerxl',
    'reformer',
    'xlnet'
]
    
model_type = 'gpt2'

In [3]:
# directories
MODEL_FORMAT = 'sentences-2'
DIR_MODEL = '../models/'
DIR_DATA = '../data/'

In [4]:
# regexes
RE_SENTENCE = re.compile(r'\w.*?[.?!]', re.S)
RE_WHITESPACE = re.compile(r'\s+')

In [5]:
# other special constants
EOL_TOKEN = '<|eol|>'

## Load and Format Data

In [6]:
# load data
paths = [
    os.path.join(DIR_DATA, 'shakespeare-sonnets.clean.txt'),
    #os.path.join(DIR_DATA, 'browning-sonnets.clean.txt'),
    #os.path.join(DIR_DATA, 'daniel-constable-sonnets.clean.txt'),
    #os.path.join(DIR_DATA, 'drayton-griffin-smith-sonnet-cycles.clean.txt'),
    #os.path.join(DIR_DATA, 'farjeon-sonnets.clean.txt'),
    #os.path.join(DIR_DATA, 'lovell-southey-sonnets.clean.txt')
]

text = list()

for path in paths:
    with open(path, 'r') as f:
        text.append([line.strip() + EOL_TOKEN for line in f.readlines() if line.strip()])

text = ' '.join(chain(*text))
lines = RE_SENTENCE.findall(text)
lines = [RE_WHITESPACE.sub(' ', line) for line in lines]
len(text), len(lines), lines[0]

(108475,
 557,
 'From fairest creatures we desire increase,<|eol|> That thereby beautys rose might never die,<|eol|> But as the riper should by time decease,<|eol|> His tender heir might bear his memory:<|eol|> But thou, contracted to thine own bright eyes,<|eol|> Feedst thy lights flame with self-substantial fuel,<|eol|> Making a famine where abundance lies,<|eol|> Thyself thy foe, to thy sweet self too cruel:<|eol|> Thou that art now the worlds fresh ornament,<|eol|> And only herald to the gaudy spring,<|eol|> Within thine own bud buriest thy content,<|eol|> And tender churl makst waste in niggarding:<|eol|> Pity the world, or else this glutton be,<|eol|> To eat the worlds due, by the grave and thee.')

In [7]:
# split train and test
lines_train, lines_test = train_test_split(lines, test_size=0.05)
len(lines_train), len(lines_test)

(529, 28)

## Cleaning and Data Preparation

In [8]:
# create Dataset and DatasetDict instances - I think this is needed for model
train_dataset = Dataset.from_dict({'text': lines_train})
test_dataset = Dataset.from_dict({'text': lines_test})
datasets = DatasetDict({'train': train_dataset, 'test': test_dataset})
datasets

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 529
    })
    test: Dataset({
        features: ['text'],
        num_rows: 28
    })
})

In [10]:
# Create tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_type, additional_special_tokens=[EOL_TOKEN])

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [11]:
# Preprocessing function for tokenizer to use with map() method of datasetdict
def token_preproc(data):
    return tokenizer(data['text'])

In [12]:
# tokenize data
tokened_data = datasets.map(token_preproc, batched=True, num_proc=4, remove_columns=['text'])
tokened_data

      

#0:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#1:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#2:   0%|          | 0/1 [00:00<?, ?ba/s]

#3:   0%|          | 0/1 [00:00<?, ?ba/s]

      

#0:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#1:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#2:   0%|          | 0/1 [00:00<?, ?ba/s]

#3:   0%|          | 0/1 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 529
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 28
    })
})

In [13]:
for _ in range(10):
    n = random.randint(0, len(tokened_data['train']))
    print(n, tokenizer.convert_ids_to_tokens(tokened_data['train'][n]['input_ids']), lines_train[n])

161 ['that', 'Ġour', 'Ġnight', 'Ġof', 'Ġw', 'oe', 'Ġmight', 'Ġhave', 'Ġremembered', '<|eol|>', 'ĠMy', 'Ġdeepest', 'Ġsense', ',', 'Ġhow', 'Ġhard', 'Ġtrue', 'Ġsorrow', 'Ġhits', ',', '<|eol|>', 'ĠAnd', 'Ġsoon', 'Ġto', 'Ġyou', ',', 'Ġas', 'Ġyou', 'Ġto', 'Ġme', ',', 'Ġthen', 'Ġtend', 'ered', '<|eol|>', 'ĠThe', 'Ġhumble', 'Ġsal', 've', ',', 'Ġwhich', 'Ġwounded', 'Ġbos', 'oms', 'Ġfits', '!'] that our night of woe might have remembered<|eol|> My deepest sense, how hard true sorrow hits,<|eol|> And soon to you, as you to me, then tendered<|eol|> The humble salve, which wounded bosoms fits!
61 ['e', 'ol', '|', '>', 'ĠWhen', 'Ġin', 'Ġthe', 'Ġchron', 'icle', 'Ġof', 'Ġwasted', 'Ġtime', '<|eol|>', 'ĠI', 'Ġsee', 'Ġdescriptions', 'Ġof', 'Ġthe', 'Ġfaire', 'st', 'Ġw', 'ights', ',', '<|eol|>', 'ĠAnd', 'Ġbeauty', 'Ġmaking', 'Ġbeautiful', 'Ġold', 'Ġr', 'ime', ',', '<|eol|>', 'ĠIn', 'Ġpraise', 'Ġof', 'Ġladies', 'Ġdead', 'Ġand', 'Ġlovely', 'Ġknights', ',', '<|eol|>', 'ĠThen', ',', 'Ġin', 'Ġthe', 'Ġbl', 'azon

In [14]:
# pad encodings and prep for modeling
collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False, return_tensors='tf')
collator

DataCollatorForLanguageModeling(tokenizer=PreTrainedTokenizerFast(name_or_path='gpt2', vocab_size=50257, model_max_len=1024, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>', 'additional_special_tokens': ['<|eol|>']}), mlm=False, mlm_probability=0.15, pad_to_multiple_of=None, tf_experimental_compile=False, return_tensors='tf')

## Modeling

In [15]:
# instantiate model
model_path = os.path.join(DIR_MODEL, f'{model_type}.{MODEL_FORMAT}')

if not os.path.exists(model_path):
    model = TFAutoModelForCausalLM.from_pretrained(model_type, pad_token_id = tokenizer.eos_token_id)
else:
    model = TFAutoModelForCausalLM.from_pretrained(model_path)
    
model.resize_token_embeddings(len(tokenizer))

All model checkpoint layers were used when initializing TFGPT2LMHeadModel.

All the layers of TFGPT2LMHeadModel were initialized from the model checkpoint at gpt2.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


<transformers.modeling_tf_utils.TFSharedEmbeddings at 0x172613e20>

In [16]:
# convert data to special format for tf model
tokenizer.pad_token = tokenizer.eos_token
tf_train_set = model.prepare_tf_dataset(tokened_data['train'], shuffle=True, batch_size=16, collate_fn=collator)
tf_test_set = model.prepare_tf_dataset(tokened_data['test'], shuffle=False, batch_size=16, collate_fn=collator)
tf_train_set

You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


<PrefetchDataset element_spec=({'input_ids': TensorSpec(shape=(16, None), dtype=tf.int64, name=None), 'attention_mask': TensorSpec(shape=(16, None), dtype=tf.int64, name=None)}, TensorSpec(shape=(16, None), dtype=tf.int64, name=None))>

In [17]:
# compile model
optimizer = AdamWeightDecay(learning_rate=2e-5, weight_decay_rate=0.01)
model.compile(optimizer=optimizer)

No loss specified in compile() - the model's internal loss computation will be used as the loss. Don't panic - this is a common way to train TensorFlow models in Transformers! To disable this behaviour please pass a loss argument, or explicitly pass `loss=None` if you do not want your model to compute a loss.


In [None]:
# fit model (if pretrained does not exist)
if not os.path.exists(model_path):
    model.fit(tf_train_set, validation_data=tf_test_set, epochs=8)

Epoch 1/8

In [97]:
# save model
if not os.path.exists(model_path):
    os.makedirs(model_path)
    model.save_pretrained(model_path)

## Test Model

In [98]:
# function to get predicted text
def test(text, max_new=50, temp=1, top_k=50, rep_penalty=1.5, len_penalty=0.75, n_seq=1):
    tokened = tokenizer(text, return_tensors='tf')
    output = model.generate(**tokened,
                            do_sample=True,
                            max_new_tokens=max_new, 
                            temperature=temp, 
                            top_k=top_k, 
                            repetition_penalty=rep_penalty,
                            length_penalty=len_penalty,
                            num_return_sequences=n_seq)
    return tokenizer.decode(output[0], skip_special_tokens=True)

In [99]:
test_lines = [
    'Tomorrow I will',
    'Tomorrow',
    'Tomorrow I will',
    'Yesterday we were',
    'For naught I may',
    'My love has been',
    'I am',
    'Thou art',
    'The little love-god lying once asleep',
    'In loving thee thou'
]

for line in test_lines:
    output = test(line,
                  temp=0.5,
                  max_new=100,
                  top_k=200,
                  rep_penalty=1.5,
                  len_penalty=0.75,
                  n_seq=1)
    print(f'Original: {line}\nOutput: {output}\n')

tf.Tensor(
[[49488   314   481   407   307  1498 48920  1675  1064   616  1842   287
    262   995    11  4249  1683   284  2107    13  5896   318   329   502
    257  1517   286   645   779    26  1865   340   373   288   849   787
    523    25   960 23205  3607 17903   428  1204   290   326   543 14210
    266  2326  4425     0  4718 12311 11906   944   788   611   345   743
    475  3520   612   890  1576 34976   392   766   703  1290   534 18522
     82  2121   783  1165  2739    30   440  1309   514   467   319   588
   1450   355   356   423  3750   878 13402  1532   777  6066   466  3387
    294   500  4151   393  2612   484  1276]], shape=(1, 103), dtype=int32)
Original: Tomorrow I will
Output: Tomorrow I will not be able LINE To find my love in the world, nor ever to live. Love is for me a thing of no use; yet it was doth make so:—love gives thee this life and that which thou wilt lose! Relieve thyself then if you may but remain there long enough,—and see how far your griefs 