In [1]:
# A lot of the below is taken pretty verbatim from the gpt2 example notebook:
# https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/language_modeling-tf.ipynb

In [2]:
from transformers import AutoTokenizer, TFAutoModelForCausalLM
from transformers import create_optimizer, AdamWeightDecay
import tensorflow as tf
from datasets import Dataset, DatasetDict
import re
from utilities import extract_sentences, extract_words
from itertools import chain
import math
import random

# set the model type we w
model_type = 'distilgpt2'

In [3]:
# get sample text (shakespeare's sonnets)
with open('./shakespeare-sonnets.stripped.split.txt', 'r') as f:
    text = f.read()
text[:1000]

'From fairest creatures we desire increase,\n  That thereby beauty’s rose might never die,\n  But as the riper should by time decease,\n  His tender heir might bear his memory:\n  But thou, contracted to thine own bright eyes,\n  Feed’st thy light’s flame with self-substantial fuel,\n  Making a famine where abundance lies,\n  Thyself thy foe, to thy sweet self too cruel:\n  Thou that art now the world’s fresh ornament,\n  And only herald to the gaudy spring,\n  Within thine own bud buriest thy content,\n  And tender churl mak’st waste in niggarding:\n    Pity the world, or else this glutton be,\n    To eat the world’s due, by the grave and thee.\n\nWhen forty winters shall besiege thy brow,\n  And dig deep trenches in thy beauty’s field,\n  Thy youth’s proud livery so gazed on now,\n  Will be a tatter’d weed of small worth held:\n  Then being asked, where all thy beauty lies,\n  Where all the treasure of thy lusty days;\n  To say, within thine own deep sunken eyes,\n  Were an all-eatin

In [4]:
# because I think the model doesn't like lots of odd apostraphes, I'm
# subbing the obvious shakespearean contractions with an 'e'
RE_CONTRACT = re.compile(r'(r|d|k|l)’(d|st)')
RE_QUOTE = re.compile(r'(“|”|‘|’)')

text = RE_CONTRACT.sub(r'\1e\2', text)
text = RE_QUOTE.sub('', text)

In [5]:
# split into separate poems (separated by full blank line)
RE_BLANKLINE = re.compile(r'\n\s*?\n')

poems = RE_BLANKLINE.split(text)
poems[0]

'From fairest creatures we desire increase,\n  That thereby beautys rose might never die,\n  But as the riper should by time decease,\n  His tender heir might bear his memory:\n  But thou, contracted to thine own bright eyes,\n  Feedest thy lights flame with self-substantial fuel,\n  Making a famine where abundance lies,\n  Thyself thy foe, to thy sweet self too cruel:\n  Thou that art now the worlds fresh ornament,\n  And only herald to the gaudy spring,\n  Within thine own bud buriest thy content,\n  And tender churl makest waste in niggarding:\n    Pity the world, or else this glutton be,\n    To eat the worlds due, by the grave and thee.'

In [6]:
len(poems) # 154 sonnets

154

In [7]:
# use last 10 as test set
train = poems[:-10]
test = poems[-10:]
train[-1], test[0]

('Two loves I have of comfort and despair,\n  Which like two spirits do suggest me still:\n  The better angel is a man right fair,\n  The worser spirit a woman coloured ill.\n  To win me soon to hell, my female evil,\n  Tempteth my better angel from my side,\n  And would corrupt my saint to be a devil,\n  Wooing his purity with her foul pride.\n  And whether that my angel be turnd fiend,\n  Suspect I may, yet not directly tell;\n  But being both from me, both to each friend,\n  I guess one angel in anothers hell:\n    Yet this shall I neer know, but live in doubt,\n    Till my bad angel fire my good one out.',
 'Those lips that Loves own hand did make,\n  Breathed forth the sound that said I hate,\n  To me that languishd for her sake:\n  But when she saw my woeful state,\n  Straight in her heart did mercy come,\n  Chiding that tongue that ever sweet\n  Was usd in giving gentle doom;\n  And taught it thus anew to greet;\n  I hate she altered with an end,\n  That followed it as gentle da

In [8]:
train_src = Dataset.from_list([{'text': p} for p in train])
test_src = Dataset.from_list([{'text': p} for p in test])
#datasets = DatasetDict({"train": train_src, "validation": test_src})
datasets = DatasetDict({"train": Dataset.from_list([{'text': p} for p in poems])})
datasets['train'][0]

{'text': 'From fairest creatures we desire increase,\n  That thereby beautys rose might never die,\n  But as the riper should by time decease,\n  His tender heir might bear his memory:\n  But thou, contracted to thine own bright eyes,\n  Feedest thy lights flame with self-substantial fuel,\n  Making a famine where abundance lies,\n  Thyself thy foe, to thy sweet self too cruel:\n  Thou that art now the worlds fresh ornament,\n  And only herald to the gaudy spring,\n  Within thine own bud buriest thy content,\n  And tender churl makest waste in niggarding:\n    Pity the world, or else this glutton be,\n    To eat the worlds due, by the grave and thee.'}

In [9]:
tokenizer = AutoTokenizer.from_pretrained(model_type)

def tokenize(src):
    return tokenizer(src['text'])

#train_enc = tokenizer(train_src['text'])
#test_enc = tokenizer(test_src['text'])
#train_enc.keys()
tokened_data = datasets.map(tokenize, remove_columns=['text'])

Downloading:   0%|          | 0.00/762 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

  0%|          | 0/154 [00:00<?, ?ex/s]

In [10]:
tokened_data

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 154
    })
})

In [11]:
# size of a block of encodings
block_size = 128

# function to group encoded texts into a single long encoding per set
def group_texts(encoded_texts):
    #TODO: I think I need to rewrite this. It isn't working
    concat_texts = {k: sum(encoded_texts[k],[]) for k in encoded_texts.keys()}
    total_len = len(concat_texts[list(encoded_texts.keys())[0]])
    total_len = (total_len // block_size) * block_size
    
    result = {
        k: [t[i : i + block_size] for i in range(0, total_len, block_size)]
        for k, t in concat_texts.items()
    }
    result['labels'] = result['input_ids'].copy()
    return result

In [12]:
combo_data = tokened_data.map(group_texts, batched=True)
tokenizer.decode(combo_data['train'][1]['input_ids'])

  0%|          | 0/1 [00:00<?, ?ba/s]

'  And tender churl makest waste in niggarding:\n    Pity the world, or else this glutton be,\n    To eat the worlds due, by the grave and thee.When forty winters shall besiege thy brow,\n  And dig deep trenches in thy beautys field,\n  Thy youths proud livery so gazed on now,\n  Will be a tattered weed of small worth held:\n  Then being asked, where all thy beauty lies,\n  Where all the treasure of thy lusty days;\n  To say, within thine own deep sunken'

In [13]:
model = TFAutoModelForCausalLM.from_pretrained(model_type)

Downloading:   0%|          | 0.00/328M [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFGPT2LMHeadModel.

All the layers of TFGPT2LMHeadModel were initialized from the model checkpoint at distilgpt2.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


In [14]:
optimizer = AdamWeightDecay(learning_rate=2e-5, weight_decay_rate=0.01)

In [None]:
model.compile(optimizer=optimizer, jit_compile=True)

In [None]:
train_set = model.prepare_tf_dataset(combo_data['train'],
                                     shuffle=True,
                                     batch_size=32)
#validation_set = model.prepare_tf_dataset(combo_data['validation'],
#                                          shuffle=True,
#                                          batch_size=16)

In [None]:
model.fit(train_set, epochs=1)

In [None]:
# test model
test = "There is a very small bug"

tokenized = tokenizer(test, return_tensors="np")
outputs = model.generate(**tokenized, max_length=50)
print(outputs)

In [None]:
tokenizer.decode(outputs[0]).strip()

In [None]:
# test on lines of poem
num = random.randrange(len(poems))
poem = poems[num]
lines = [line.strip() for line in poem.split('\n')]
for line in lines:
    t = tokenizer(line, return_tensors="np")
    out = model.generate(**t, max_length=100)
    print(f'in: {line}\nout: {tokenizer.decode(out[0]).strip()}\n')