# Modeling on Sentences

Creating a model based on sentences

A lot of the below is adapted from the gpt2 tutorial at https://huggingface.co/docs/transformers/v4.22.2/en/tasks/language_modeling

## Imports and Preliminaries

In [1]:
# data formatting for model
from datasets import Dataset, DatasetDict

# train/test split
from sklearn.model_selection import train_test_split

# tokenizer
from transformers import AutoTokenizer

# lm collator
from transformers import DataCollatorForLanguageModeling

# model and support
from transformers import TFAutoModelForCausalLM, create_optimizer, AdamWeightDecay

# other utilities
from itertools import chain
import os
import random
import re

In [2]:
# set the model we are using
MODELS = [
    'gpt', # original GPT
    'distilgpt2', # 84M features
    'gpt2', # 117M features
    'gpt2-medium', # 355M features
    'gpt2-large', # 744M features
    'ctrl',
    'transformerxl',
    'reformer',
    'xlnet'
]
    
model_type = 'gpt2'

In [3]:
# directories
MODEL_FORMAT = 'sentences'
DIR_MODEL = '../models/'
DIR_DATA = '../data/'

In [4]:
# regexes
RE_SENTENCE = re.compile(r'\w.*?[.?!]', re.S)
RE_WHITESPACE = re.compile(r'\s+')

## Load and Format Data

In [5]:
# load data
paths = [
    os.path.join(DIR_DATA, 'shakespeare-sonnets.clean.txt'),
    os.path.join(DIR_DATA, 'browning-sonnets.clean.txt'),
    os.path.join(DIR_DATA, 'daniel-constable-sonnets.clean.txt'),
    os.path.join(DIR_DATA, 'drayton-griffin-smith-sonnet-cycles.clean.txt'),
    #os.path.join(DIR_DATA, 'shakespeareplays.txt')
]
                 
for path in paths:
    with open(path, 'r') as f:
        fulltext = f.read()

lines = RE_SENTENCE.findall(fulltext)
lines = [RE_WHITESPACE.sub(' ', line) for line in lines]
fulltext[:1000], lines[:8]

('\ufeff\n    Into these loves who but for passion looks,\n    At this first sight here let him lay them by,\n    And seek elsewhere in turning other books,\n    Which better may his labour satisfy.\n      No far-fetched sigh shall ever wound my breast;\n    Love from mine eye a tear shall never wring;\n    Nor in Ah mes! my whining sonnets drest,\n    A libertine fantasticly I sing.\n      My verse is the true image of my mind,\n    Ever in motion, still desiring change;\n    To choice of all variety inclined,\n    And in all humours sportively I range.\n      My muse is rightly of the English strain,\n      That cannot long one fashion entertain.\n\n    Like an adventurous sea-farer am I,\n    Who hath some long and dangrous voyage been,\n    And called to tell of his discovery,\n    How far he sailed, what countries he had seen,\n      Proceeding from the port whence he put forth,\n    Shows by his compass how his course he steered,\n    When east, when west, when south, and when by

In [6]:
# split train and test
lines_train, lines_test = train_test_split(lines, test_size=0.05)
len(lines_train), len(lines_test)

(772, 41)

## Cleaning and Data Preparation

In [7]:
# create Dataset and DatasetDict instances - I think this is needed for model
train_dataset = Dataset.from_dict({'text': lines_train})
test_dataset = Dataset.from_dict({'text': lines_test})
datasets = DatasetDict({'train': train_dataset, 'test': test_dataset})
datasets

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 772
    })
    test: Dataset({
        features: ['text'],
        num_rows: 41
    })
})

In [8]:
# Create tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_type)

In [9]:
# Preprocessing function for tokenizer to use with map() method of datasetdict
def token_preproc(data):
    return tokenizer(data['text'])

In [10]:
# tokenize data
tokened_data = datasets.map(token_preproc, batched=True, num_proc=4, remove_columns=['text'])
tokened_data

      

#0:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#1:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#2:   0%|          | 0/1 [00:00<?, ?ba/s]

#3:   0%|          | 0/1 [00:00<?, ?ba/s]

      

#0:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#1:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#2:   0%|          | 0/1 [00:00<?, ?ba/s]

#3:   0%|          | 0/1 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 772
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 41
    })
})

In [11]:
for _ in range(10):
    n = random.randint(0, len(tokened_data['train']))
    print(n, tokenizer.convert_ids_to_tokens(tokened_data['train'][n]['input_ids']), lines_train[n])

351 ['Else', 'Ġshould', 'Ġmy', 'Ġlines', 'Ġglide', 'Ġon', 'Ġthe', 'Ġwaves', 'Ġof', 'ĠRh', 'ine', ',', 'ĠAnd', 'Ġcrown', 'Ġthe', 'ĠPy', 'ren', 's', 'Ġwith', 'Ġmy', 'Ġliving', 'Ġsong', '.'] Else should my lines glide on the waves of Rhine, And crown the Pyrens with my living song.
519 ['And', 'Ġlet', 'Ġthe', 'Ġb', 'ards', 'Ġwithin', 'Ġthat', 'ĠIrish', 'Ġis', 'le', ',', 'ĠTo', 'Ġwhom', 'Ġmy', 'ĠMuse', 'Ġwith', 'Ġfiery', 'Ġwings', 'Ġshall', 'Ġpass', ',', 'ĠCall', 'Ġback', 'Ġthe', 'Ġstiff', '-', 'neck', 'ed', 'Ġrebels', 'Ġfrom', 'Ġexile', ',', 'ĠAnd', 'Ġm', 'oll', 'ify', 'Ġthe', 'Ġslaughter', 'ing', 'Ġgall', 'ow', 'glass', ';', 'ĠAnd', 'Ġwhen', 'Ġmy', 'Ġflowing', 'Ġnumbers', 'Ġthey', 'Ġrehe', 'arse', ',', 'ĠLet', 'Ġwolves', 'Ġand', 'Ġbears', 'Ġbe', 'Ġcharm', 'Ã¨', 'd', 'Ġwith', 'Ġmy', 'Ġverse', '.'] And let the bards within that Irish isle, To whom my Muse with fiery wings shall pass, Call back the stiff-necked rebels from exile, And mollify the slaughtering gallowglass; And when my flowing

In [12]:
# pad encodings and prep for modeling
collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False, return_tensors='tf')
collator

DataCollatorForLanguageModeling(tokenizer=PreTrainedTokenizerFast(name_or_path='gpt2', vocab_size=50257, model_max_len=1024, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>'}), mlm=False, mlm_probability=0.15, pad_to_multiple_of=None, tf_experimental_compile=False, return_tensors='tf')

## Modeling

In [13]:
# instantiate model
model_path = os.path.join(DIR_MODEL, f'{model_type}.{MODEL_FORMAT}')

if not os.path.exists(model_path):
    model = TFAutoModelForCausalLM.from_pretrained(model_type, pad_token_id = tokenizer.eos_token_id)
else:
    model = TFAutoModelForCausalLM.from_pretrained(model_path)

All model checkpoint layers were used when initializing TFGPT2LMHeadModel.

All the layers of TFGPT2LMHeadModel were initialized from the model checkpoint at gpt2.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


In [14]:
# convert data to special format for tf model
tokenizer.pad_token = tokenizer.eos_token
tf_train_set = model.prepare_tf_dataset(tokened_data['train'], shuffle=True, batch_size=16, collate_fn=collator)
tf_test_set = model.prepare_tf_dataset(tokened_data['test'], shuffle=False, batch_size=16, collate_fn=collator)
tf_train_set

You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


<PrefetchDataset element_spec=({'input_ids': TensorSpec(shape=(16, None), dtype=tf.int64, name=None), 'attention_mask': TensorSpec(shape=(16, None), dtype=tf.int64, name=None)}, TensorSpec(shape=(16, None), dtype=tf.int64, name=None))>

In [15]:
# compile model
optimizer = AdamWeightDecay(learning_rate=2e-5, weight_decay_rate=0.01)
model.compile(optimizer=optimizer)

No loss specified in compile() - the model's internal loss computation will be used as the loss. Don't panic - this is a common way to train TensorFlow models in Transformers! To disable this behaviour please pass a loss argument, or explicitly pass `loss=None` if you do not want your model to compute a loss.


In [16]:
# fit model (if pretrained does not exist)
if not os.path.exists(model_path):
    model.fit(tf_train_set, validation_data=tf_test_set, epochs=4)

Epoch 1/4
 6/48 [==>...........................] - ETA: 3:01 - loss: 5.7309

KeyboardInterrupt: 

In [None]:
# save model
if not os.path.exists(model_path):
    os.makedirs(model_path)
    model.save_pretrained(model_path)

## Test Model

In [None]:
# function to get predicted text
def test(text, max_new=50, temp=1, top_k=100, rep_penalty=1.5, len_penalty=1.25):
    tokened = tokenizer(text, return_tensors='np')
    output = model.generate(**tokened, 
                            max_new_tokens=max_new, 
                            temperature=temp, 
                            top_k=top_k, 
                            repetition_penalty=rep_penalty,
                            length_penalty=len_penalty)
    return tokenizer.decode(output[0], skip_special_tokens=False)

In [None]:
test_lines = [
    'Tomorrow I will',
    'Tomorrow',
    'Tomorrow I will',
    'Yesterday we were',
    'For naught I may',
    'My love has been',
    'I am',
    'Thou art',
    'The little love-god lying once asleep',
    'In loving thee thou'
]

for line in test_lines[:3]:
    output = test(line,
                  temp=0.1,
                  max_new=100,
                  top_k=200,
                  rep_penalty=1.5,
                  len_penalty=1.75)
    print(f'Original: {line}\nOutput: {output}\n')