In [1]:
from google.colab import drive

# Mount Google Drive at /content/drive so the project folder is reachable.
drive.mount('/content/drive')

# Folder (relative to the working directory) that holds the input CSV and
# receives the saved model weights. Use '' to work in the current directory.
home_directory = 'drive/My Drive/Colab Notebooks/poe/'

Mounted at /content/drive


In [2]:
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/2c/4e/4f1ede0fd7a36278844a277f8d53c21f88f37f3754abf76a5d6224f76d4a/transformers-3.4.0-py3-none-any.whl (1.3MB)
[K     |████████████████████████████████| 1.3MB 6.4MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 38.8MB/s 
[?25hCollecting sentencepiece!=0.1.92
[?25l  Downloading https://files.pythonhosted.org/packages/e5/2d/6d4ca4bef9a67070fa1cac508606328329152b1df10bdf31fb6e4e727894/sentencepiece-0.1.94-cp36-cp36m-manylinux2014_x86_64.whl (1.1MB)
[K     |████████████████████████████████| 1.1MB 37.1MB/s 
Collecting tokenizers==0.9.2
[?25l  Downloading https://files.pythonhosted.org/packages/7c/a5/78be1a55b2ac8d6a956f0a211d372726e2b1dd2666bb537fea9b03abd62c/tokenizers-0.9.2-cp36-cp36m-manylinux1_x86_64.whl (2.9MB)
[

In [3]:
import nltk
# Fetch the NLTK data packages backing the `stopwords` corpus and the
# `word_tokenize` tokenizer that are imported in the next cell.
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

# Import Libraries & Load Data

In [4]:
import numpy as np
import pandas as pd 

import os
import re
import string
import random
import time
import datetime

from collections import Counter
import itertools
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import seaborn as sns
plt.style.use('bmh')

import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel, GPT2Config, AdamW, get_linear_schedule_with_warmup
from torch.utils.data import Dataset, random_split, DataLoader, RandomSampler, SequentialSampler

# Let displayed DataFrames show up to 100 rows before pandas truncates them.
pd.set_option('display.max_rows', 100)

In [5]:
# Read the poem lines and replace missing cells with empty strings so every
# value can later be concatenated with the <BOS>/<EOS> markers.
poem_line_df = pd.read_csv(home_directory + 'poe_poems_lines.csv').fillna('')

# Text Generation - GPT-2
## Process Text and Create Dataset

References:

- http://jalammar.github.io/illustrated-gpt2/
- https://medium.com/@stasinopoulos.dimitrios/a-beginners-guide-to-training-and-generating-text-using-gpt2-c2f2e1fbd10a
- https://huggingface.co/transformers/model_doc/gpt2.html
- https://towardsdatascience.com/step-by-step-guide-on-how-to-train-gpt-2-on-books-using-google-colab-b3c6fa15fef0
- https://medium.com/swlh/fine-tuning-gpt-2-for-magic-the-gathering-flavour-text-generation-3bafd0f9bb93
- https://colab.research.google.com/drive/16UTbQOhspQOF3XlxDFyI28S-0nAkTzk_#scrollTo=v4XhewaV93-_

In [46]:
# Seed handed to random / numpy / torch in the "Apply Random Seeds" cell.
RANDOM_SEED = 73
# Number of sequences per batch for both the training and validation loaders.
BATCH_SIZE = 4

# Number of full passes over the training loader in create_model.
EPOCHS = 4
# During training, print one generated sample every SAMPLE_EVERY steps.
SAMPLE_EVERY = 100

# Every encoded line is truncated/padded to exactly this many tokens.
MAX_SEQUENCE_LENGTH = 512

In [47]:
# Start from the pretrained GPT-2 tokenizer.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# Register explicit begin/end/padding markers. This grows len(tokenizer), which
# is why create_model resizes the model's token embeddings to len(tokenizer).
special_tokens_dict = {'bos_token': '<BOS>', 'eos_token': '<EOS>', 'pad_token': '<PAD>'}
num_added_tokens = tokenizer.add_special_tokens(special_tokens_dict)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1042301.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=456318.0, style=ProgressStyle(descripti…




In [55]:
class PoePoemDataset(Dataset):
    """Pre-tokenized poem lines exposed as (input_ids, attention_mask) pairs.

    Each text is wrapped in '<BOS>' ... '<EOS>', then truncated/padded by the
    tokenizer to exactly `max_length` tokens. All encoding happens once, in
    the constructor.

    Args:
        data: iterable of strings to encode.
        tokenizer: callable tokenizer returning 'input_ids' and 'attention_mask'.
        gpt2_type: not used by this class; kept so existing callers still work.
        max_length: fixed token length of every encoded example.
    """

    def __init__(self, data, tokenizer, gpt2_type='gpt2', max_length=MAX_SEQUENCE_LENGTH):
        self.tokenizer = tokenizer

        encoded_texts = [
            tokenizer(
                ''.join(('<BOS>', text, '<EOS>')),
                truncation=True,
                max_length=max_length,
                padding='max_length',
            )
            for text in data
        ]

        self.input_ids = [torch.tensor(enc['input_ids']) for enc in encoded_texts]
        self.attn_masks = [torch.tensor(enc['attention_mask']) for enc in encoded_texts]

    def __len__(self):
        """Number of encoded examples."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the (input_ids, attention_mask) tensors of example `idx`."""
        token_ids = self.input_ids[idx]
        mask = self.attn_masks[idx]
        return token_ids, mask
        

In [56]:
# Encode every poem line once, up front.
poem_line_dataset = PoePoemDataset(
    data=poem_line_df['line_text'].values,
    tokenizer=tokenizer,
    max_length=MAX_SEQUENCE_LENGTH,
)

### Train/Validation Split

In [57]:
def train_val_split(split, dataset):
    """Return (train_size, val_size) for a dataset split at fraction `split`.

    The training size is `split * len(dataset)` truncated to an int; the
    validation size is whatever remains, so the two always sum to len(dataset).
    """
    total = len(dataset)
    n_train = int(split * total)
    return n_train, total - n_train

In [58]:
poem_line_train_size, poem_line_val_size = train_val_split(0.8, poem_line_dataset)

# random_split is imported from torch.utils.data. This cell runs before the
# global seeds are applied, so give the split its own seeded generator;
# otherwise the train/validation membership changes on every fresh run.
poem_line_train_dataset, poem_line_val_dataset = random_split(
    poem_line_dataset,
    [poem_line_train_size, poem_line_val_size],
    generator=torch.Generator().manual_seed(RANDOM_SEED),
)

### Apply Random Seeds

In [59]:
# Seed every RNG the notebook draws from (CUDA, Python's `random`, NumPy, torch).
# These calls only influence randomness consumed after this cell runs, e.g.
# the RandomSampler shuffling and the sampling done by generate().
torch.cuda.manual_seed_all(RANDOM_SEED)
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

<torch._C.Generator at 0x7f1bf0317420>

### Instantiate DataLoaders and Define Model Creation Function

---



In [60]:
def create_dataloaders(train_dataset, val_dataset, bs):
    """Build the (train, validation) DataLoader pair with batch size `bs`.

    Training batches are drawn in random order; validation batches are read
    in dataset order.
    """
    shuffled_order = RandomSampler(train_dataset)
    fixed_order = SequentialSampler(val_dataset)

    loaders = (
        DataLoader(train_dataset, batch_size=bs, sampler=shuffled_order),
        DataLoader(val_dataset, batch_size=bs, sampler=fixed_order),
    )
    return loaders

In [61]:
# Wrap both splits in DataLoaders that share the configured batch size.
poem_line_train_dataloader, poem_line_val_dataloader = create_dataloaders(
    train_dataset=poem_line_train_dataset,
    val_dataset=poem_line_val_dataset,
    bs=BATCH_SIZE,
)

In [62]:
# from_pretrained is a classmethod: calling it on GPT2Config(vocab_size=...,
# n_positions=...) built a throwaway instance whose arguments were ignored.
# Load the pretrained 'gpt2' config directly; the vocabulary growth from the
# added special tokens is handled by resize_token_embeddings in create_model.
configuration = GPT2Config.from_pretrained('gpt2', output_hidden_states=True)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=665.0, style=ProgressStyle(description_…




In [63]:
def format_time(elapsed):
    """Format a duration in seconds as a timedelta string (e.g. '0:01:05') for logging.

    The duration is rounded to the nearest whole second first.
    """
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)

In [64]:
# hyperparameters

# Learning rate and epsilon passed to the AdamW optimizer in create_model.
learning_rate = 5e-4
eps = 1e-8
# Number of scheduler warm-up steps; a step count, so keep it an int (was 1e2).
warmup_steps = 100

In [65]:
# create text generation seed prompt
# Use the GPU when one is attached, otherwise fall back to CPU instead of
# failing on a hard-coded 'cuda' device.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
prompt = "<BOS>"
# Encode the prompt and add a leading batch dimension, then place it on the device.
generated = torch.tensor(tokenizer.encode(prompt)).unsqueeze(0)
generated = generated.to(device)

In [66]:
def _lm_batch_loss(model, batch):
    """Run one forward pass on `batch` and return the language-modelling loss.

    `batch` is an (input_ids, attention_mask) pair as produced by the
    DataLoaders. Labels are the input ids with every padded position
    (attention_mask == 0) set to -100 so that padding is excluded from the
    loss; otherwise the heavily padded sequences make the loss mostly about
    predicting <PAD>.
    """
    input_ids = batch[0].to(device)
    attention_mask = batch[1].to(device)
    labels = input_ids.masked_fill(attention_mask == 0, -100)

    outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
    return outputs[0]


def create_model(train_dataloader, val_dataloader, file_name):
    """Fine-tune GPT-2 on the given batches, save its weights, and return it.

    Uses these notebook-level names: configuration, tokenizer, device,
    generated, learning_rate, eps, warmup_steps, EPOCHS, SAMPLE_EVERY,
    home_directory and format_time.

    Args:
        train_dataloader: yields (input_ids, attention_mask) training batches.
        val_dataloader: yields (input_ids, attention_mask) validation batches;
            evaluated once at the end of every epoch.
        file_name: name of the file, appended to home_directory, that the
            model's state_dict is saved to.

    Returns:
        The fine-tuned GPT2LMHeadModel, on `device` and in eval mode.

    Note:
        Losses are averaged over real (non-padding) tokens only, so they are
        not comparable with values from runs that included padding.
    """
    model = GPT2LMHeadModel.from_pretrained('gpt2', config=configuration)
    # The tokenizer gained <BOS>/<EOS>/<PAD>; grow the embeddings to match.
    model.resize_token_embeddings(len(tokenizer))

    # The pretrained config still points at GPT-2's original special-token ids.
    # Align it with the tokenizer so generate() stops at <EOS> and pads with <PAD>.
    model.config.bos_token_id = tokenizer.bos_token_id
    model.config.eos_token_id = tokenizer.eos_token_id
    model.config.pad_token_id = tokenizer.pad_token_id

    # Place the model on the target device before the optimizer captures its parameters.
    model = model.to(device)

    optimizer = AdamW(model.parameters(), lr=learning_rate, eps=eps)

    total_steps = len(train_dataloader) * EPOCHS
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps=warmup_steps,
                                                num_training_steps=total_steps)

    total_t0 = time.time()

    for epoch_i in range(EPOCHS):

        print(f'Epoch {epoch_i + 1} of {EPOCHS}')

        t0 = time.time()
        total_train_loss = 0
        model.train()

        for step, batch in enumerate(train_dataloader):

            optimizer.zero_grad()

            loss = _lm_batch_loss(model, batch)
            total_train_loss += loss.item()

            loss.backward()
            optimizer.step()
            scheduler.step()

            # Sample only after the update so the autograd graph of this step
            # is already freed while generate() runs.
            if step % SAMPLE_EVERY == 0 and step != 0:

                model.eval()
                sample_outputs = model.generate(
                                        generated,
                                        do_sample=True,
                                        top_k=50,
                                        max_length=200,
                                        top_p=0.95,
                                        num_return_sequences=1,
                                        pad_token_id=tokenizer.pad_token_id,
                                        eos_token_id=tokenizer.eos_token_id
                                    )
                for sample_output in sample_outputs:
                    print(f'Example output: {tokenizer.decode(sample_output, skip_special_tokens=True)}')

                model.train()

        # max(..., 1) avoids a ZeroDivisionError when a loader is empty.
        avg_train_loss = total_train_loss / max(len(train_dataloader), 1)
        training_time = format_time(time.time() - t0)

        print(f'Average Training Loss: {avg_train_loss}. Epoch time: {training_time}')

        t0 = time.time()

        print('Evaluating Model')

        model.eval()

        total_eval_loss = 0

        with torch.no_grad():
            for batch in val_dataloader:
                total_eval_loss += _lm_batch_loss(model, batch).item()

        avg_val_loss = total_eval_loss / max(len(val_dataloader), 1)

        validation_time = format_time(time.time() - t0)

        print(f'Validation loss: {avg_val_loss}. Validation Time: {validation_time}')

    print(f'Total training took {format_time(time.time()-total_t0)}')

    torch.save(model.state_dict(), home_directory + file_name)
    return model

# Create Poem Line Model

In [67]:
# Fine-tune on the poem-line loaders and save the weights under home_directory.
poem_line_model = create_model(
    train_dataloader=poem_line_train_dataloader,
    val_dataloader=poem_line_val_dataloader,
    file_name='poem_line_model_4_epoch.pth',
)

# Generate Poem Lines

In [68]:
poem_line_model.eval()

# Sample 100 lines from the bare <BOS> prompt. Passing the tokenizer's own
# <EOS>/<PAD> ids makes each sequence stop at <EOS> (finished sequences are
# padded with <PAD>) instead of running on to the maximum length.
sample_outputs = poem_line_model.generate(
                                generated,
                                do_sample=True,
                                top_k=50,
                                max_length=MAX_SEQUENCE_LENGTH,
                                top_p=0.95,
                                num_return_sequences=100,
                                pad_token_id=tokenizer.pad_token_id,
                                eos_token_id=tokenizer.eos_token_id
                                )

for i, sample_output in enumerate(sample_outputs):
    print("{}: {}\n\n".format(i, tokenizer.decode(sample_output, skip_special_tokens=True)))