# Import Library

In [None]:
!pip install tokenizer
!pip install transformers

In [3]:
import pandas as pd
import tensorflow as tf
from transformers import GPT2Config, TFGPT2LMHeadModel, GPT2Tokenizer
from transformers import WEIGHTS_NAME, CONFIG_NAME
import os



# Data

In [4]:
data = pd.read_csv('data/PoetryFoundationData.csv')
data = data.dropna()
data = data['Poem'].str.lower()

In [5]:
string = ''
for x in data:
    string += x + "</s>"

In [6]:
with open('data/poetry.txt', 'w', encoding='utf-8') as f:
    f.write(string)

# Load tokenizer and model

In [7]:
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

In [8]:
configuration = GPT2Config(
    vocab_size=tokenizer.vocab_size,
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id
)
model = TFGPT2LMHeadModel(configuration)



# Load Dataset

In [9]:
textfile = open("data/poetry.txt", "r", encoding='utf-8')
text = textfile.read()
textfile.close()

# Tokenize data

In [10]:
string_tokenized = tokenizer.encode(text)
print("Done tokenizing")

Token indices sequence length is longer than the specified maximum sequence length for this model (6122430 > 1024). Running this sequence through the model will result in indexing errors


Done tokenizing


# Create dataset

In [11]:
examples = []
block_size = 100
BATCH_SIZE = 12
BUFFER_SIZE = 1000
for i in range(0, len(string_tokenized) - block_size + 1, block_size):
    examples.append(string_tokenized[i:i + block_size])
inputs, labels = [], []

In [12]:
for ex in examples:
    inputs.append(ex[:-1])
    labels.append(ex[1:])
dataset = tf.data.Dataset.from_tensor_slices((inputs, labels))
dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)
print("Done creating dataset")

Done creating dataset


# Create model

In [13]:
optimizer = tf.keras.optimizers.Adam(
    learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

In [14]:
model.compile(optimizer=optimizer, loss=loss)
model.fit(dataset, epochs=30)

Epoch 1/30
   5/5102 [..............................] - ETA: 8:11:52 - loss: 9.6142

KeyboardInterrupt: 

In [None]:
save_location = "models"
if not os.path.exists(save_location):
    os.makedirs(save_location)
model.save_pretrained(save_location)
tokenizer.save_pretrained(save_location)

In [None]:
text = "I used to love life"
input_ids = tokenizer.encode(text, return_tensors='tf')

beam_output = model.generate(
    input_ids,
    max_length=50,
    num_beams=5,
    temperature=0.7,
    no_repeat_ngram_size=2,
    num_return_sequences=5
)

print(tokenizer.decode(beam_output[0], skip_special_tokens=True))