In [1]:
# A lot of the below is taken pretty verbatim from the gpt2 example notebook:
# https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/language_modeling-tf.ipynb

In [2]:
from transformers import AutoTokenizer, TFAutoModelForCausalLM
from transformers import create_optimizer, AdamWeightDecay
import tensorflow as tf
from datasets import Dataset, DatasetDict
import re
from utilities import extract_sentences, extract_words
from itertools import chain
import math
import random

# Name of the pretrained checkpoint; passed to both AutoTokenizer.from_pretrained
# and TFAutoModelForCausalLM.from_pretrained later in the notebook.
model_type = 'distilgpt2'

2022-10-06 10:19:29.582853: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-10-06 10:19:29.924642: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-10-06 10:19:29.924664: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2022-10-06 10:19:29.966672: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2022-10-06 10:19:30.841892: W tensorflow/stream_executor/platform/de

In [3]:
# get sample text (shakespeare's sonnets)
# The file contains typographic quotes (’ “ ”), i.e. non-ASCII characters, so
# decode it explicitly as UTF-8 instead of relying on the platform default
# encoding (which is not UTF-8 everywhere).
with open('./shakespeare-sonnets.stripped.split.txt', 'r', encoding='utf-8') as f:
    text = f.read()
# Preview the first 1000 characters.
text[:1000]

'From fairest creatures we desire increase,\n  That thereby beauty’s rose might never die,\n  But as the riper should by time decease,\n  His tender heir might bear his memory:\n  But thou, contracted to thine own bright eyes,\n  Feed’st thy light’s flame with self-substantial fuel,\n  Making a famine where abundance lies,\n  Thyself thy foe, to thy sweet self too cruel:\n  Thou that art now the world’s fresh ornament,\n  And only herald to the gaudy spring,\n  Within thine own bud buriest thy content,\n  And tender churl mak’st waste in niggarding:\n    Pity the world, or else this glutton be,\n    To eat the world’s due, by the grave and thee.\n\nWhen forty winters shall besiege thy brow,\n  And dig deep trenches in thy beauty’s field,\n  Thy youth’s proud livery so gazed on now,\n  Will be a tatter’d weed of small worth held:\n  Then being asked, where all thy beauty lies,\n  Where all the treasure of thy lusty days;\n  To say, within thine own deep sunken eyes,\n  Were an all-eatin

In [4]:
# I suspect the model does not cope well with lots of odd apostrophes, so the
# obvious Shakespearean contractions get their elided 'e' put back in place
# of the apostrophe (e.g. "tatter’d" -> "tattered", "mak’st" -> "makest").
RE_CONTRACT = re.compile(r'(r|d|k|l)’(d|st)')
# Any typographic quote mark, single or double, opening or closing.
RE_QUOTE = re.compile(r'(“|”|‘|’)')

# Expand the contractions first, then delete whatever quote marks remain.
text = RE_QUOTE.sub('', RE_CONTRACT.sub(r'\1e\2', text))

In [5]:
# Poems are separated by a blank (or whitespace-only) line: cut the text there.
RE_BLANKLINE = re.compile(r'\n\s*?\n')

poems = re.split(RE_BLANKLINE, text)
# Show the first poem as a check on the split.
poems[0]

'From fairest creatures we desire increase,\n  That thereby beautys rose might never die,\n  But as the riper should by time decease,\n  His tender heir might bear his memory:\n  But thou, contracted to thine own bright eyes,\n  Feedest thy lights flame with self-substantial fuel,\n  Making a famine where abundance lies,\n  Thyself thy foe, to thy sweet self too cruel:\n  Thou that art now the worlds fresh ornament,\n  And only herald to the gaudy spring,\n  Within thine own bud buriest thy content,\n  And tender churl makest waste in niggarding:\n    Pity the world, or else this glutton be,\n    To eat the worlds due, by the grave and thee.'

In [6]:
# Check on the split: the number of pieces should equal the number of sonnets.
len(poems) # 154 sonnets

154

In [7]:
# Hold out the final 10 poems as the test set; everything before them is train.
train, test = poems[:-10], poems[-10:]
# Display the poems on either side of the split boundary.
train[-1], test[0]

('Two loves I have of comfort and despair,\n  Which like two spirits do suggest me still:\n  The better angel is a man right fair,\n  The worser spirit a woman coloured ill.\n  To win me soon to hell, my female evil,\n  Tempteth my better angel from my side,\n  And would corrupt my saint to be a devil,\n  Wooing his purity with her foul pride.\n  And whether that my angel be turnd fiend,\n  Suspect I may, yet not directly tell;\n  But being both from me, both to each friend,\n  I guess one angel in anothers hell:\n    Yet this shall I neer know, but live in doubt,\n    Till my bad angel fire my good one out.',
 'Those lips that Loves own hand did make,\n  Breathed forth the sound that said ‘I hate,\n  To me that languishd for her sake:\n  But when she saw my woeful state,\n  Straight in her heart did mercy come,\n  Chiding that tongue that ever sweet\n  Was usd in giving gentle doom;\n  And taught it thus anew to greet;\n  ‘I hate she altered with an end,\n  That followed it as gentle 

In [8]:
def _as_text_dataset(texts):
    """Wrap each string in a {'text': ...} record and build a Dataset from them."""
    return Dataset.from_list([{'text': item} for item in texts])

train_src = _as_text_dataset(train)
test_src = _as_text_dataset(test)
# Disabled variant that keeps a separate validation split:
#datasets = DatasetDict({"train": train_src, "validation": test_src})
# Active variant: every poem (the held-out ones included) goes into one "train" split.
datasets = DatasetDict({"train": _as_text_dataset(poems)})
# Show the first record.
datasets['train'][0]

{'text': 'From fairest creatures we desire increase,\n  That thereby beautys rose might never die,\n  But as the riper should by time decease,\n  His tender heir might bear his memory:\n  But thou, contracted to thine own bright eyes,\n  Feedest thy lights flame with self-substantial fuel,\n  Making a famine where abundance lies,\n  Thyself thy foe, to thy sweet self too cruel:\n  Thou that art now the worlds fresh ornament,\n  And only herald to the gaudy spring,\n  Within thine own bud buriest thy content,\n  And tender churl makest waste in niggarding:\n    Pity the world, or else this glutton be,\n    To eat the worlds due, by the grave and thee.'}

In [9]:
# Tokenizer for the pretrained checkpoint named by model_type.
tokenizer = AutoTokenizer.from_pretrained(model_type)

def tokenize(src):
    """Run the tokenizer over the 'text' field of ``src`` and return its encoding."""
    encoded = tokenizer(src['text'])
    return encoded

# Encode every record and drop the raw 'text' column from the result.
tokened_data = datasets.map(tokenize, remove_columns=['text'])

  0%|          | 0/154 [00:00<?, ?ex/s]

In [10]:
# Display the tokenized DatasetDict (its splits, feature names and row counts).
tokened_data

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 154
    })
})

In [11]:
# size of a block of encodings
block_size = 128

# function to group encoded texts into a single long encoding per set
def group_texts(encoded_texts):
    """Concatenate a batch of encodings and re-cut it into fixed-size blocks.

    ``encoded_texts`` maps each column name to a list of per-example lists
    (the shape a batched ``map`` hands over). Every column is flattened into
    one long list and sliced into consecutive chunks of ``block_size`` items.

    The examples are joined back-to-back with nothing inserted between them,
    so a block can straddle the boundary between two texts. Trailing items
    that do not fill a whole block are dropped; if the whole batch is shorter
    than ``block_size``, every column comes back empty.

    Returns:
        dict with the same columns as the input plus ``'labels'``, which is a
        shallow copy of the ``'input_ids'`` blocks.

    Raises:
        KeyError: if the batch has no ``'input_ids'`` column.
    """
    # TODO(review): texts are joined without any separator token; consider
    # appending an end-of-text token per example before grouping.
    # chain.from_iterable flattens in linear time, whereas sum(lists, [])
    # re-copies the accumulated list for every example (quadratic).
    concat_texts = {
        k: list(chain.from_iterable(encoded_texts[k]))
        for k in encoded_texts.keys()
    }
    # All columns are assumed to have the same flattened length, so the first
    # one is used to measure it; round down to a whole number of blocks.
    total_len = len(concat_texts[list(encoded_texts.keys())[0]])
    total_len = (total_len // block_size) * block_size

    result = {
        k: [t[i : i + block_size] for i in range(0, total_len, block_size)]
        for k, t in concat_texts.items()
    }
    # The label blocks are the same lists as the input blocks (copy is shallow).
    result['labels'] = result['input_ids'].copy()
    return result

In [12]:
# Re-chunk the tokenized poems into block_size-long examples (batched map).
combo_data = tokened_data.map(group_texts, batched=True)

# Decode the second block back to text as a spot check of the chunking.
second_block_ids = combo_data['train'][1]['input_ids']
tokenizer.decode(second_block_ids)

  0%|          | 0/1 [00:00<?, ?ba/s]

'  And tender churl makest waste in niggarding:\n    Pity the world, or else this glutton be,\n    To eat the worlds due, by the grave and thee.When forty winters shall besiege thy brow,\n  And dig deep trenches in thy beautys field,\n  Thy youths proud livery so gazed on now,\n  Will be a tattered weed of small worth held:\n  Then being asked, where all thy beauty lies,\n  Where all the treasure of thy lusty days;\n  To say, within thine own deep sunken'

In [13]:
# Load the pretrained causal language model for the checkpoint named by model_type.
model = TFAutoModelForCausalLM.from_pretrained(model_type)

2022-10-06 10:19:34.348851: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2022-10-06 10:19:34.349028: W tensorflow/stream_executor/cuda/cuda_driver.cc:263] failed call to cuInit: UNKNOWN ERROR (303)
2022-10-06 10:19:34.349049: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (archzolam): /proc/driver/nvidia/version does not exist
2022-10-06 10:19:34.349494: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
All model checkpoint layers were used when initializing TFGPT2LMHeadModel.

All the layers of TFGPT2LMHeadModel w

In [14]:
# AdamWeightDecay (imported from transformers) with learning rate 2e-5 and
# weight decay rate 0.01.
optimizer = AdamWeightDecay(learning_rate=2e-5, weight_decay_rate=0.01)

In [15]:
# No loss is passed here, so (as the message printed by compile() explains) the
# model's internal loss computation is used. jit_compile=True asks Keras to
# compile the training step with XLA.
model.compile(optimizer=optimizer, jit_compile=True)

No loss specified in compile() - the model's internal loss computation will be used as the loss. Don't panic - this is a common way to train TensorFlow models in Transformers! To disable this behaviour please pass a loss argument, or explicitly pass `loss=None` if you do not want your model to compute a loss.


In [21]:
# Turn the grouped 'train' split into the shuffled, batched (32 blocks per
# batch) dataset that model.fit consumes.
train_set = model.prepare_tf_dataset(combo_data['train'],
                                     shuffle=True,
                                     batch_size=32)
# Validation counterpart, kept for reference only: it would need a
# 'validation' split in combo_data.
#validation_set = model.prepare_tf_dataset(combo_data['validation'],
#                                          shuffle=True,
#                                          batch_size=16)

In [None]:
# Fine-tune for a single epoch over the training batches.
model.fit(train_set, epochs=1)

2022-10-06 10:23:57.560105: W tensorflow/compiler/tf2xla/kernels/assert_op.cc:38] Ignoring Assert operator tfgpt2lm_head_model/sparse_categorical_crossentropy/SparseSoftmaxCrossEntropyWithLogits/assert_equal_1/Assert/Assert


In [18]:
# test model
# The prompt gets its own name: `test` already holds the held-out poems, and
# reusing it here would silently replace that list with a string.
prompt = "There is a very small bug"

tokenized = tokenizer(prompt, return_tensors="np")
# Pass pad_token_id explicitly (the tokenizer's EOS id, which is what generate()
# logs that it falls back to) so the call does not emit a warning.
outputs = model.generate(**tokenized, max_length=50,
                         pad_token_id=tokenizer.eos_token_id)
# Raw generated token ids, at most 50 per sequence.
print(outputs)

Setting `pad_token_id` to 50256 (first `eos_token_id`) to generate sequence


tf.Tensor(
[[1858  318  257  845 1402 5434   11  475  340  318  845 1593  284  760
   326  262 5434  318  407 5969   13  198  198  198  198  198  198  198
   198  198  198  198  198  198  198  198  198  198  198  198  198  198
   198  198  198  198  198  198  198  198]], shape=(1, 50), dtype=int32)


In [19]:
# Decode the first generated sequence back to text; strip() trims leading and
# trailing whitespace from the decoded string.
tokenizer.decode(outputs[0]).strip()

'There is a very small bug, but it is very important to know that the bug is not fixed.'

In [20]:
# test on lines of poem
# Pick one poem at random and use each of its lines as a separate prompt.
num = random.randrange(len(poems))
poem = poems[num]
# Strip the indentation and skip lines that end up empty: an empty prompt
# gives the model nothing to condition on.
stripped = (line.strip() for line in poem.split('\n'))
lines = [line for line in stripped if line]
for line in lines:
    t = tokenizer(line, return_tensors="np")
    # Explicit pad_token_id (the tokenizer's EOS id, the value generate() logs
    # that it falls back to) avoids a repeated warning on every call.
    out = model.generate(**t, max_length=100,
                         pad_token_id=tokenizer.eos_token_id)
    print(f'in: {line}\nout: {tokenizer.decode(out[0]).strip()}\n')

Setting `pad_token_id` to 50256 (first `eos_token_id`) to generate sequence
Setting `pad_token_id` to 50256 (first `eos_token_id`) to generate sequence


in: My tongue-tied Muse in manners holds her still,
out: My tongue-tied Muse in manners holds her still, and she is not a man, but a woman, and a woman, and a woman, and a woman, and a woman, and a woman, and a woman, and a woman, and a woman, and a woman, and a woman, and a woman, and a woman, and a woman, and a woman, and a woman, and a woman, and a woman, and a woman, and a woman, and



Setting `pad_token_id` to 50256 (first `eos_token_id`) to generate sequence


in: While comments of your praise richly compild,
out: While comments of your praise richly compild, and I will not be able to do so.



Setting `pad_token_id` to 50256 (first `eos_token_id`) to generate sequence


in: Reserve their character with golden quill,
out: Reserve their character with golden quill, and they will be the same.



Setting `pad_token_id` to 50256 (first `eos_token_id`) to generate sequence


in: And precious phrase by all the Muses fild.
out: And precious phrase by all the Muses fild.



Setting `pad_token_id` to 50256 (first `eos_token_id`) to generate sequence


in: I think good thoughts, whilst others write good words,
out: I think good thoughts, whilst others write good words, and I think that I am not a good man, but I am a good man, and I am a good man, and I am a good man, and I am a good man, and I am a good man, and I am a good man, and I am a good man, and I am a good man, and I am a good man, and I am a good man, and I am a good man, and



Setting `pad_token_id` to 50256 (first `eos_token_id`) to generate sequence


in: And like unlettered clerk still cry ‘Amen
out: And like unlettered clerk still cry ‘Amen!’



Setting `pad_token_id` to 50256 (first `eos_token_id`) to generate sequence


in: To every hymn that able spirit affords,
out: To every hymn that able spirit affords, and that the spirit of the world, which is the spirit of the world, which is the spirit of the world, which is the spirit of the world, which is the spirit of the world, which is the spirit of the world, which is the spirit of the world, which is the spirit of the world, which is the spirit of the world, which is the spirit of the world, which is the spirit of the world, which is



Setting `pad_token_id` to 50256 (first `eos_token_id`) to generate sequence


in: In polishd form of well-refined pen.
out: In polishd form of well-refined pen.



Setting `pad_token_id` to 50256 (first `eos_token_id`) to generate sequence


in: Hearing you praised, I say ‘tis so, tis true,
out: Hearing you praised, I say ‘tis so, tis true,’



Setting `pad_token_id` to 50256 (first `eos_token_id`) to generate sequence


in: And to the most of praise add something more;
out: And to the most of praise add something more;



Setting `pad_token_id` to 50256 (first `eos_token_id`) to generate sequence


in: But that is in my thought, whose love to you,
out: But that is in my thought, whose love to you, and my love to you, and my love to you, and my love to you, and my love to you, and my love to you, and my love to you, and my love to you, and my love to you, and my love to you, and my love to you, and my love to you, and my love to you, and my love to you, and my love to you, and my love to



Setting `pad_token_id` to 50256 (first `eos_token_id`) to generate sequence


in: Though words come hindmost, holds his rank before.
out: Though words come hindmost, holds his rank before.



Setting `pad_token_id` to 50256 (first `eos_token_id`) to generate sequence


in: Then others, for the breath of words respect,
out: Then others, for the breath of words respect, and for the love of the Lord, and for the love of the Lord, and for the love of the Lord, and for the love of the Lord, and for the love of the Lord, and for the love of the Lord, and for the love of the Lord, and for the love of the Lord, and for the love of the Lord, and for the love of the Lord, and for the love of the Lord, and for

in: Me for my dumb thoughts, speaking in effect.
out: Me for my dumb thoughts, speaking in effect.

