In [1]:
import tensorflow as tf
from transformers import TFGPT2LMHeadModel, GPT2TokenizerFast
from tensorflow.keras.preprocessing.sequence import pad_sequences
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Load the pretrained GPT-2 tokenizer and TF language-model head.
# Both are fetched under the same checkpoint name.
checkpoint_name = 'gpt2'
tokenizer = GPT2TokenizerFast.from_pretrained(checkpoint_name)
model = TFGPT2LMHeadModel.from_pretrained(checkpoint_name)

All model checkpoint layers were used when initializing TFGPT2LMHeadModel.

All the layers of TFGPT2LMHeadModel were initialized from the model checkpoint at gpt2.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


In [7]:
# Read the verses and draw a reproducible training sample.
SAMPLE_SIZE = 20000  # maximum number of verses used for fine-tuning
SAMPLE_SEED = 42     # fixed seed so a kernel restart reproduces the same sample

df = pd.read_csv('merged_data.csv')
# Missing verses would reach the tokenizer as float NaN, so drop them first.
df = df.dropna(subset=['Verse'])
# Cap the request at the number of available rows: DataFrame.sample raises
# ValueError when n exceeds the population and replace=False (the default).
df = df.sample(n=min(SAMPLE_SIZE, len(df)), random_state=SAMPLE_SEED)
lines = df['Verse'].astype(str).tolist()

In [43]:
# Tokenize the lines
# Reuse the EOS token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

# Encode every verse exactly once, in a single batched call and without
# padding, instead of encoding line-by-line just to measure lengths and then
# encoding the whole corpus a second time.
encoded_lines = (
    tokenizer(lines, add_special_tokens=True)['input_ids'] if lines else []
)

# Token count of the longest verse (0 when there are no verses).
max_length = max((len(ids) for ids in encoded_lines), default=0)

# Right-pad every sequence to max_length + 1. The extra slot guarantees that
# even the longest verse ends in at least one EOS/pad token, so after the
# one-step shift used for the labels every verse has an EOS target.
padded_length = max_length + 1
pad_id = tokenizer.pad_token_id
tokenized_lines = [
    ids + [pad_id] * (padded_length - len(ids))
    for ids in encoded_lines
]

In [44]:
# Token count of the longest verse; sequences are padded to one more than this.
max_length

19

In [45]:
# Peek at the first two verses of the sample.
lines[:2]

["but as a horseman checks the courser's speed",
 'and prides no longer in his beauteous form']

In [46]:
# Token ids of the first verse, including its trailing EOS padding.
tokenized_lines[0]

[4360,
 355,
 257,
 8223,
 805,
 8794,
 262,
 1093,
 2655,
 338,
 2866,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256]

In [47]:
# Index of the first EOS/pad id in the fifth sequence. This equals the verse's
# unpadded token count, assuming the verse text itself contains no EOS token.
tokenized_lines[4].index(tokenizer.eos_token_id)

7

In [56]:
# Build next-token prediction pairs: the input drops the final token and the
# label drops the first, so labels[i][t] is the token that follows
# input_sequences[i][t].
input_sequences, labels = [], []
for token_ids in tokenized_lines:
    input_sequences.append(token_ids[:-1])
    labels.append(token_ids[1:])

In [57]:
# Model input for the first verse: the token ids with the last position removed.
input_sequences[0]

[4360,
 355,
 257,
 8223,
 805,
 8794,
 262,
 1093,
 2655,
 338,
 2866,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256]

In [58]:
# Targets for the first verse: the same token ids shifted left by one position.
labels[0]

[355,
 257,
 8223,
 805,
 8794,
 262,
 1093,
 2655,
 338,
 2866,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256]

In [59]:
# Wrap inputs and labels as tf.data datasets, one element per verse.
input_dataset, labels_dataset = (
    tf.data.Dataset.from_tensor_slices(sequences)
    for sequences in (input_sequences, labels)
)

In [61]:
# Combine input and label datasets, adding per-token loss weights.
# Every sequence is right-padded with EOS, so without weights the loss is also
# averaged over positions that merely predict padding from padding.
pad_token_id = tokenizer.eos_token_id  # EOS doubles as the padding token


def _with_padding_weights(input_ids, label_ids):
    """Return (inputs, labels, weights) with zero weight on padded inputs.

    A position is kept when its *input* token is real text, so the last real
    token still learns to predict the first EOS, while positions whose input
    is already padding contribute nothing to the loss.
    """
    weights = tf.cast(tf.not_equal(input_ids, pad_token_id), tf.float32)
    return input_ids, label_ids, weights


# Keras treats the third element of each dataset item as sample weights.
# NOTE(review): confirm the installed transformers/Keras train_step forwards
# per-timestep sample weights to the compiled loss.
dataset = tf.data.Dataset.zip((input_dataset, labels_dataset)).map(_with_padding_weights)

In [62]:
# Shuffle with a buffer as large as the dataset, then group into mini-batches.
batch_size = 128
num_examples = len(input_sequences)
dataset = dataset.shuffle(buffer_size=num_examples).batch(batch_size)

In [63]:
# Compile the model with Adam and a sparse cross-entropy loss.
# The loss is built with from_logits=True, i.e. it is fed raw scores.
adam = tf.keras.optimizers.Adam(learning_rate=1e-5)
cross_entropy = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(optimizer=adam, loss=cross_entropy)

In [19]:
# Fine-tune on the batched verse dataset for a fixed number of passes.
num_epochs = 5
model.fit(dataset, epochs=num_epochs)


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x16561080908>

In [26]:
# Generate text from a one-word prompt.
seed_test = "happy"
input_ids = tokenizer.encode(seed_test, return_tensors='tf')

# Sampling settings forwarded to generate(); EOS is passed as the pad id.
sampling_options = dict(
    do_sample=True,
    max_length=100,
    top_k=0,
    top_p=0.9,
    temperature=1,
    num_return_sequences=5,
    pad_token_id=tokenizer.eos_token_id,
)
sample_outputs = model.generate(input_ids, **sampling_options)

# Print each returned sequence, decoded with special tokens stripped.
for i, output in enumerate(sample_outputs):
    print(f"Output {i + 1}: {tokenizer.decode(output, skip_special_tokens=True)}")
#  "chill'd the fair dawning of the yearin honour of th' auspicious morn",


Output 1: happy the birth of my redest lovewhat a matchless wind at their distant war
Output 2: happy every godly precept all the same lyreto whom all gladsome pleasure and paintoward that solemn solemn day
Output 3: happy britannia slain men his wonted herdson good the wise should say
Output 4: happy envy rewards on who oh no lie his yoke giveshe meets what haste and madness slumber
Output 5: happy a soul with no means for readingo and behold a shining light shine


In [22]:
# Persist the fine-tuned model and the tokenizer, each to its own directory.
model_dir = 'gpt2_fine_tuned_model_english'
tokenizer_dir = 'gpt2_tokenizer_fine_tuned_model_english'
model.save_pretrained(model_dir)
# Kept as the last expression so the cell displays what save_pretrained returns.
tokenizer.save_pretrained(tokenizer_dir)

('gpt2_tokenizer_fine_tuned_model_english\\tokenizer_config.json',
 'gpt2_tokenizer_fine_tuned_model_english\\special_tokens_map.json',
 'gpt2_tokenizer_fine_tuned_model_english\\vocab.json',
 'gpt2_tokenizer_fine_tuned_model_english\\merges.txt',
 'gpt2_tokenizer_fine_tuned_model_english\\added_tokens.json',
 'gpt2_tokenizer_fine_tuned_model_english\\tokenizer.json')