# GA Capstone
## Causal Language Transformer Modeling with GPT2

Creating a model to generate text

A lot of the below is adapted from the gpt2 tutorial at https://huggingface.co/docs/transformers/v4.22.2/en/tasks/language_modeling

### Imports and Preliminaries

In [1]:
# data formatting for model
from datasets import Dataset, DatasetDict

# train/test split
from sklearn.model_selection import train_test_split

# tokenizer
from transformers import AutoTokenizer

# lm collator
from transformers import DataCollatorForLanguageModeling

# model and support
from transformers import TFAutoModelForCausalLM, create_optimizer, AdamWeightDecay

# other utilities
from itertools import chain
import os
import random
import re
import json
from utilities.utilities import load_config, get_dataset_from_config
from utilities.utilities import split_text_and_labels

In [2]:
CONFIG_FILE = 'config.json'

config_vars = load_config(CONFIG_FILE)
config_vars

{'MODEL_DIR': '../models/',
 'DATA_DIR': '../data/',
 'CAUSAL_N_EPOCHS': 4,
 'CLASS_N_EPOCHS': 8,
 'BATCH_SIZE': 16,
 'CAUSAL_MODEL': 'distilgpt2',
 'CLASS_MODEL': 'distilbert-base-uncased',
 'MODEL_NAME': 'shakespeare',
 'DATA_SHAKESPEARE': ['shakespeare-sonnets.clean.txt', 'shakespeareplays.txt'],
 'DATA_OTHER': ['belloc_hilaire-sonnets_and_verse.clean.txt',
  'blake_william-poems.clean.txt',
  'browning_elizabeth-sonnets_from_the_portuguese.clean.txt',
  'daniel_samuel_and_constable_henry-elizabethan_sonnet_cycles.clean.txt',
  'donne_john-poetry_vol_1.clean.txt',
  'drayton_michael_et_al-elizabethan_sonnet_cycles.clean.txt',
  'farjeon_eleanor-sonnets_and_poems.clean.txt',
  'keats_john-poems_1820.clean.txt',
  'lodge_thomas_and_fletcher_giles-elizabethan_sonnet_cycles.clean.txt',
  'lovell_robert_and_southey_robert-poems.clean.txt',
  'milton_john-poetical_works.clean.txt',
  'seward_anna-sonnets-and-odes.clean.txt',
  'shelley_percy-complete_poetic_works.clean.txt',
  'wilde_osca

In [3]:
# pretrained model designator
MODEL_TYPE = config_vars['CAUSAL_MODEL'] if 'CAUSAL_MODEL' in config_vars else 'distilgpt2'

# model batch size
BATCH_SIZE = config_vars['BATCH_SIZE'] if 'BATCH_SIZE' in config_vars else 16

# model num epochs
N_EPOCHS = config_vars['CAUSAL_N_EPOCHS'] if 'CAUSAL_N_EPOCHS' in config_vars else 8

# whether to downsample
SAMPLE = config_vars['N_SAMPLES'] if 'N_SAMPLES' in config_vars else 1

In [5]:
# directories and other constants, from config.json

# model name for saving
MODEL_NAME = config_vars['MODEL_NAME'] if 'MODEL_NAME' in config_vars else 'shakespeare'

# directory for saved models
MODEL_DIR = config_vars['MODEL_DIR'] if 'MODEL_DIR' in config_vars else '../models/'

# full model save path
MODEL_PATH = os.path.join(MODEL_DIR, f'{MODEL_NAME}.{MODEL_TYPE}.{str(N_EPOCHS)}')

### Load and Format Data

In [6]:
# load data - just load the shakespeare stuff
data = get_dataset_from_config(config_vars, limit=SAMPLE)[1]
len(data), data[:2]

(76578,
 [('From fairest creatures we desire increase, That thereby beauty’s rose might never die, But as the riper should by time decease, His tender heir might bear his memory:',
   1),
  ('But thou, contracted to thine own bright eyes, Feed’st thy light’s flame with self-substantial fuel, Making a famine where abundance lies, Thyself thy foe, to thy sweet self too cruel:',
   1)])

In [7]:
# split train and test - we don't need a test set here
data_train, data_val = train_test_split(data, test_size=0.01)

# this is the labeled dataset - split into text and label lists
data_train = split_text_and_labels(data_train)
data_val = split_text_and_labels(data_val)

# we don't need labels for causal LM
data_train = data_train['text']
data_val = data_val['text']

len(data_train), len(data_val)

(75812, 766)

### Cleaning and Data Preparation

In [8]:
# create Dataset and DatasetDict instances - I think this is needed for model
train_dataset = Dataset.from_dict({'text': data_train})
val_dataset = Dataset.from_dict({'text': data_val})
datasets = DatasetDict({'train': train_dataset, 'val': val_dataset})
datasets

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 75812
    })
    val: Dataset({
        features: ['text'],
        num_rows: 766
    })
})

In [9]:
# Create tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_TYPE)

In [10]:
# Preprocessing function for tokenizer to use with map() method of datasetdict
def token_preproc(data):
    return tokenizer(data['text'])

In [11]:
# tokenize data
tokened_data = datasets.map(token_preproc, batched=True, num_proc=4, remove_columns=['text'])
tokened_data

     

#0:   0%|          | 0/19 [00:00<?, ?ba/s]

 

#1:   0%|          | 0/19 [00:00<?, ?ba/s]

 

#2:   0%|          | 0/19 [00:00<?, ?ba/s]

 

#3:   0%|          | 0/19 [00:00<?, ?ba/s]

      

#0:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#1:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#2:   0%|          | 0/1 [00:00<?, ?ba/s]

#3:   0%|          | 0/1 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 75812
    })
    val: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 766
    })
})

In [12]:
# pad encodings and prep for modeling
collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False, return_tensors='tf')

### Modeling

In [13]:
# instantiate model

if not os.path.exists(MODEL_PATH):
    model = TFAutoModelForCausalLM.from_pretrained(MODEL_TYPE, pad_token_id = tokenizer.eos_token_id)
else:
    model = TFAutoModelForCausalLM.from_pretrained(MODEL_PATH)
    
#model.resize_token_embeddings(len(tokenizer))

All model checkpoint layers were used when initializing TFGPT2LMHeadModel.

All the layers of TFGPT2LMHeadModel were initialized from the model checkpoint at distilgpt2.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


In [14]:
# convert data to special format for tf model
tokenizer.pad_token = tokenizer.eos_token
tf_train_set = model.prepare_tf_dataset(tokened_data['train'], shuffle=True, batch_size=32, collate_fn=collator)
tf_val_set = model.prepare_tf_dataset(tokened_data['val'], shuffle=False, batch_size=32, collate_fn=collator)
tf_train_set

You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


<PrefetchDataset element_spec=({'input_ids': TensorSpec(shape=(32, None), dtype=tf.int64, name=None), 'attention_mask': TensorSpec(shape=(32, None), dtype=tf.int64, name=None)}, TensorSpec(shape=(32, None), dtype=tf.int64, name=None))>

In [15]:
# compile model
optimizer = AdamWeightDecay(learning_rate=2e-5, weight_decay_rate=0.01)
model.compile(optimizer=optimizer)

No loss specified in compile() - the model's internal loss computation will be used as the loss. Don't panic - this is a common way to train TensorFlow models in Transformers! To disable this behaviour please pass a loss argument, or explicitly pass `loss=None` if you do not want your model to compute a loss.


In [16]:
# fit model (if pretrained does not exist)
if not os.path.exists(MODEL_PATH):
    model.fit(tf_train_set, validation_data=tf_val_set, epochs=N_EPOCHS)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


In [17]:
model.fit(tf_train_set, validation_data=tf_val_set, epochs=N_EPOCHS)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<keras.callbacks.History at 0x175374b80>

In [18]:
# save model
if not os.path.exists(MODEL_PATH):
    os.makedirs(MODEL_PATH)
    model.save_pretrained(MODEL_PATH)

### Test Model

In [19]:
# function to get predicted text
def test(text, max_new=50, temp=1, top_k=50, rep_penalty=1.5, len_penalty=0.75, n_seq=1):
    tokened = tokenizer(text, return_tensors='tf')
    output = model.generate(**tokened,
                            do_sample=True,
                            max_new_tokens=max_new, 
                            temperature=temp, 
                            top_k=top_k, 
                            repetition_penalty=rep_penalty,
                            length_penalty=len_penalty,
                            num_return_sequences=n_seq)
    return tokenizer.decode(output[0], skip_special_tokens=False)

In [22]:
test_lines = [data_train[random.randint(0,len(data) - 1)] for _ in range(10)]
fragment_ratio = 0.4
print(test_lines)

for line in test_lines:
    words = line.split()
    stop_ix = int(len(words) * fragment_ratio) or 1
    fragment = ' '.join(words[0:stop_ix])
    print(f'Original: {fragment}')
    output = test(fragment,
                  temp=0.5,
                  max_new=100,
                  top_k=200,
                  rep_penalty=1.5,
                  len_penalty=0.75,
                  n_seq=1)
    print(f'Output: {output}\n')

['The pale moon shines by night, And when I wander here and there, I then do most go right.', 'Methinks these peers of France should smile at that.', 'So, love, be thou, although to-day thou fill Thy hungry eyes, even till they wink with fulness, To-morrow see again, and do not kill The spirit of love, with a perpetual dulness.', 'Believe me, I speak as my understanding instructs me and as mine honesty puts it to utterance.', 'But thou, ’gainst all proportion, didst bring in Wonder to wait on treason and on murder, And whatsoever cunning fiend it was That wrought upon thee so preposterously Hath got the voice in hell for excellence.', 'I’ll question her.', 'Then meet me forthwith at the notary’s.', 'Better Macbeth Than such an one to reign.', 'One scene of it comes near the circumstance Which I have told thee of my father’s death.', 'My lord, I warrant you we will play our part As he shall think by our true diligence He is no less than what we say he is.']
Original: The pale moon shine

### Conclusion

Text is generated that at many times sounds reasonably Shakespearian. Soon to be combined with classification model to test this out.