In [1]:
# Specify your own home directory as needed! This notebook is designed to run in Google Colab.

# home_directory = ''
home_directory = 'drive/My Drive/Colab Notebooks/poe/'

from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install transformers



# Import Libraries & Load Data

In [3]:
import numpy as np
import pandas as pd 

import random
import time
import datetime

import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel, GPT2Config, AdamW, get_linear_schedule_with_warmup
from torch.utils.data import Dataset, random_split, DataLoader, RandomSampler, SequentialSampler

In [4]:
poem_stanza_df = pd.read_csv(home_directory + 'poe_poems_stanzas.csv')
poem_stanza_df = poem_stanza_df.fillna('')

# Text Generation - GPT-2
## Process Text and Create Dataset

http://jalammar.github.io/illustrated-gpt2/
https://medium.com/@stasinopoulos.dimitrios/a-beginners-guide-to-training-and-generating-text-using-gpt2-c2f2e1fbd10a
https://huggingface.co/transformers/model_doc/gpt2.html
https://towardsdatascience.com/step-by-step-guide-on-how-to-train-gpt-2-on-books-using-google-colab-b3c6fa15fef0
https://medium.com/swlh/fine-tuning-gpt-2-for-magic-the-gathering-flavour-text-generation-3bafd0f9bb93
https://colab.research.google.com/drive/16UTbQOhspQOF3XlxDFyI28S-0nAkTzk_#scrollTo=v4XhewaV93-_

In [5]:
RANDOM_SEED = 73
BATCH_SIZE = 2
EPOCHS = 8
MAX_LEN = 1024

In [6]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

special_tokens_dict = {'bos_token': '<BOS>', 'eos_token': '<EOS>', 'pad_token': '<PAD>'}
num_added_tokens = tokenizer.add_special_tokens(special_tokens_dict)

In [7]:
combined_poems = poem_stanza_df.groupby(['title'])['stanza_text'].transform(lambda x: ' /n /n '.join(x)).drop_duplicates().reset_index(drop=True)

In [8]:
max_poem_length = max([len(tokenizer.encode(poem)) for poem in combined_poems])
min_poem_length = min([len(tokenizer.encode(poem)) for poem in combined_poems])

Token indices sequence length is longer than the specified maximum sequence length for this model (1753 > 1024). Running this sequence through the model will result in indexing errors


In [9]:
print('Longest Edgar Allen Poe Poem:', max_poem_length, 'tokens long.')
print('Shortest Edgar Allen Poe Poem:', min_poem_length, 'tokens long.')

Longest Edgar Allen Poe Poem: 6465 tokens long.
Shortest Edgar Allen Poe Poem: 55 tokens long.


In [10]:
stanza_length = [len(tokenizer.encode(stanza)) for stanza in poem_stanza_df['stanza_text'].values]
max_stanza_length = max(stanza_length)
min_stanza_length = min(stanza_length)

In [11]:
print('Number of stanzas longer than max length (1024 tokens): ', sum([st_len > MAX_LEN for st_len in stanza_length])) 

Number of stanzas longer than max length (1024 tokens):  1


In [12]:
print('Longest Edgar Allen Poe Stanza:', max_stanza_length, 'tokens long.')
print('Shortest Edgar Allen Poe Stanza:', min_stanza_length, 'tokens long.')

Longest Edgar Allen Poe Stanza: 1948 tokens long.
Shortest Edgar Allen Poe Stanza: 15 tokens long.


In [13]:
class PoePoemDataset(Dataset):
    
    def __init__(self, data, tokenizer, gpt2_type='gpt2', max_length=MAX_LEN):
        self.tokenizer = tokenizer
        self.input_ids = []
        self.attn_masks = []
        
        for i in data:
            encodings_dict = tokenizer('<BOS>' + i + '<EOS>',
                                     truncation=True,
                                     max_length=max_length,
                                     padding='max_length'
                                    )

            self.input_ids.append(torch.tensor(encodings_dict['input_ids']))
            self.attn_masks.append(torch.tensor(encodings_dict['attention_mask']))

    def __len__(self):
        
        return len(self.input_ids)
    
    def __getitem__(self, idx):
        
        return self.input_ids[idx], self.attn_masks[idx]
        

In [14]:
poem_stanza_dataset = PoePoemDataset(poem_stanza_df['stanza_text'].values, tokenizer, max_length=MAX_LEN)

### Train/Validation Split

In [15]:
def train_val_split(split, dataset):
    train_size = int(split * len(dataset))
    val_size = len(dataset) - train_size
    return train_size, val_size

In [16]:
poem_stanza_train_size, poem_stanza_val_size = train_val_split(0.8, poem_stanza_dataset)

# random split imported from troch.utils
poem_stanza_train_dataset, poem_stanza_val_dataset = random_split(poem_stanza_dataset, [poem_stanza_train_size, poem_stanza_val_size])

### Apply Random Seeds

In [17]:
torch.cuda.manual_seed_all(RANDOM_SEED)
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

<torch._C.Generator at 0x7f2c6ee73600>

### Instantiate DataLoaders and Define Model Creation Function

In [18]:
poem_stanza_train_dataloader = DataLoader(poem_stanza_train_dataset,
                              sampler=RandomSampler(poem_stanza_train_dataset),
                              batch_size=BATCH_SIZE)

poem_stanza_val_dataloader = DataLoader(poem_stanza_val_dataset,
                            sampler=SequentialSampler(poem_stanza_val_dataset),
                            batch_size=BATCH_SIZE)

In [19]:
# helper function for logging time
def format_time(elapsed):
    return str(datetime.timedelta(seconds=int(round((elapsed)))))

# hyperparameters
learning_rate = 1e-4
eps = 1e-8
warmup_steps = 50

# create text generation seed prompt
device = torch.device('cuda')

prompt = "<BOS>"
generated = torch.tensor(tokenizer.encode(prompt)).unsqueeze(0)
generated = generated.to(device)

# Create Poem Stanza Model

In [20]:
configuration = GPT2Config(vocab_size=len(tokenizer), n_positions=MAX_LEN).from_pretrained('gpt2', output_hidden_states=True)

poem_stanza_model = GPT2LMHeadModel.from_pretrained('gpt2', config=configuration)
poem_stanza_model.resize_token_embeddings(len(tokenizer))

poem_stanza_model.cuda()
optimizer = AdamW(poem_stanza_model.parameters(), lr=learning_rate, eps=eps)

total_steps = len(poem_stanza_train_dataloader) * EPOCHS
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=warmup_steps,
                                            num_training_steps=total_steps)

start_time = time.time()
poem_stanza_model = poem_stanza_model.to(device)

for epoch_i in range(0, EPOCHS):

    print(f'Epoch {epoch_i + 1} of {EPOCHS}')

    t0 = time.time()
    total_train_loss = 0
    poem_stanza_model.train()

    for step, batch in enumerate(poem_stanza_train_dataloader):

        b_input_ids = batch[0].to(device)
        b_labels = batch[0].to(device)
        b_masks = batch[1].to(device)

        poem_stanza_model.zero_grad()        

        outputs = poem_stanza_model(b_input_ids,
                                    labels=b_labels,
                                    attention_mask=b_masks,
                                    token_type_ids=None)

        loss = outputs[0]  

        batch_loss = loss.item()
        total_train_loss += batch_loss

        loss.backward()
        optimizer.step()
        scheduler.step()

    avg_train_loss = total_train_loss / len(poem_stanza_train_dataloader)       
    training_time = format_time(time.time() - t0)

    print(f'Average Training Loss: {avg_train_loss}. Epoch Training Time: {training_time}')

    t0 = time.time()

    poem_stanza_model.eval()

    total_eval_loss = 0
    nb_eval_steps = 0

    for batch in poem_stanza_val_dataloader:
        b_input_ids = batch[0].to(device)
        b_labels = batch[0].to(device)
        b_masks = batch[1].to(device)

        with torch.no_grad():        

            outputs  = poem_stanza_model(b_input_ids,
                                         attention_mask=b_masks,
                                         labels=b_labels)

            loss = outputs[0]  

        batch_loss = loss.item()
        total_eval_loss += batch_loss        

    avg_val_loss = total_eval_loss / len(poem_stanza_val_dataloader)


    print(f'Average Validation Loss: {avg_val_loss}')

print(f'Total Training Time: {format_time(time.time()-start_time)}')

torch.save(poem_stanza_model.state_dict(), home_directory + 'poem_stanza_model.pth')


Epoch 1 of 8
Average Training Loss: 2.576231449495914. Epoch Training Time: 0:01:10
Average Validation Loss: 0.6122910502282056
Epoch 2 of 8
Average Training Loss: 0.6284494824534239. Epoch Training Time: 0:01:10
Average Validation Loss: 0.5452861582691019
Epoch 3 of 8
Average Training Loss: 0.5744865956001504. Epoch Training Time: 0:01:10
Average Validation Loss: 0.5252606712959029
Epoch 4 of 8
Average Training Loss: 0.5516689658858055. Epoch Training Time: 0:01:09
Average Validation Loss: 0.51527174833146
Epoch 5 of 8
Average Training Loss: 0.5309407965734948. Epoch Training Time: 0:01:10
Average Validation Loss: 0.511662708087401
Epoch 6 of 8
Average Training Loss: 0.5195481184610101. Epoch Training Time: 0:01:09
Average Validation Loss: 0.5067863213745031
Epoch 7 of 8
Average Training Loss: 0.5100360779568206. Epoch Training Time: 0:01:09
Average Validation Loss: 0.506346502087333
Epoch 8 of 8
Average Training Loss: 0.5045993163488632. Epoch Training Time: 0:01:09
Average Validatio

# Generate Poem Stanzas

In [25]:
poem_stanza_model.eval()

sample_outputs = poem_stanza_model.generate(
                                generated, 
                                do_sample=True,   
                                top_k=50, 
                                max_length=MAX_LEN,
                                top_p=0.95, 
                                num_return_sequences=3
                                )

for i, sample_output in enumerate(sample_outputs):
    print("{}: {}\n\n".format(i, tokenizer.decode(sample_output, skip_special_tokens=True)))

Setting `pad_token_id` to 50256 (first `eos_token_id`) to generate sequence


0: We must love thee with all the love and the tenderness of our hearts,
 In thy presence we feel thy anger,
 But not in thy strength thy beauty—and not in thy strength thy grief—
 And not in thy strength thine eyes— And not in thy strength thy love:
 But in thy strength thou art thy pride—
 Not in the power of thy own beauty, but in the power of thy soul—
 And thou art thy delight—
 To the sweetest of the kisses:

 Of these happy memories, when—and then——
 Upon the very face of a young girl,
 A beautiful name, uttered within her infancy—
 Like a song sung to a sad child,
 And from childhood to the hour of her death,
 I feel her love—to thee, then, my dear child,
 And me, whose name is still in me—
 
 Who feel my anger with pride—
 Love and reverence alone can help,
 In a life like that of this maiden,
 And whose name is still in me—
 Whom all love grows to bloom! their own their own sacred places;
 But my spirit has not left me, yet that I may be a young man,
 And in the time I have n