# Import Libraries & Load Data

In [22]:
import numpy as np
import pandas as pd 

import os
import re
import string
import random
import time
import datetime

from collections import Counter
import itertools

import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import seaborn as sns
plt.style.use('bmh')

import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel, GPT2Config, AdamW, get_linear_schedule_with_warmup
from torch.utils.data import Dataset, random_split, DataLoader, RandomSampler, SequentialSampler

pd.set_option('display.max_rows', 100)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/scottduda/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/scottduda/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
!pip install transformers

In [23]:
# home_directory = ''
# home_directory = 'drive/My Drive/Colab Notebooks/plot-generator/'

# from google.colab import drive
# drive.mount('/content/drive')

In [29]:
movie_df = pd.read_csv('20210305_movie_df.csv')
top_df = pd.read_csv('20210305_top100_df.csv')
bottom_df = pd.read_csv('20210305_bottom100_df.csv')

In [30]:
movie_df['plot'] = movie_df['plot'].apply(lambda x: x.split('::')[0])
top_df['plot'] = top_df['plot'].apply(lambda x: x.split('::')[0])
bottom_df['plot'] = bottom_df['plot'].apply(lambda x: x.split('::')[0])

In [31]:
movie_plots = movie_df['plot'].values.tolist() + top_df['plot'].values.tolist() + bottom_df['plot'].values.tolist()

In [32]:
movie_plot_lengths = [len(x.split(' ')) for x in movie_plots]

In [63]:
max(movie_plot_lengths)

431

# Movie Plot Generator

In [65]:
RANDOM_SEED = 73
BATCH_SIZE = 4

EPOCHS = 4
SAMPLE_EVERY = 100

MAX_SEQUENCE_LENGTH = 512

In [66]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

special_tokens_dict = {'bos_token': '<BOS>', 'eos_token': '<EOS>', 'pad_token': '<PAD>'}
num_added_tokens = tokenizer.add_special_tokens(special_tokens_dict)

HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=1042301.0), HTML(value='')))




HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=456318.0), HTML(value='')))




In [69]:
class MoviePlotDataset(Dataset):
    
    def __init__(self, data, tokenizer, gpt2_type='gpt2', max_length=MAX_SEQUENCE_LENGTH):
        self.tokenizer = tokenizer
        self.input_ids = []
        self.attn_masks = []
        
        for i in data:

            encodings_dict = tokenizer('<BOS>' + i + '<EOS>',
                                       truncation=True,
                                       max_length=max_length,
                                       padding='max_length'
                                      )

            self.input_ids.append(torch.tensor(encodings_dict['input_ids']))
            self.attn_masks.append(torch.tensor(encodings_dict['attention_mask']))

    def __len__(self):
        
        return len(self.input_ids)
    
    def __getitem__(self, idx):
        
        return self.input_ids[idx], self.attn_masks[idx]
        

In [76]:
plot_dataset = MoviePlotDataset(movie_plots, tokenizer, max_length=MAX_SEQUENCE_LENGTH)

In [77]:
# train/test split

def train_val_split(split, dataset):
    train_size = int(split * len(dataset))
    val_size = len(dataset) - train_size
    return train_size, val_size

plot_train_size, plot_val_size = train_val_split(0.8, movie_plot_dataset)

# random split imported from troch.utils
plot_train_dataset, plot_val_dataset = random_split(plot_dataset, [plot_train_size, plot_val_size])

In [79]:
# random seeds

torch.cuda.manual_seed_all(RANDOM_SEED)
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

<torch._C.Generator at 0x7ff35738ca70>

In [80]:
def create_dataloaders(train_dataset, val_dataset, bs):
    train_dataloader = DataLoader(train_dataset,
                                  sampler=RandomSampler(train_dataset),
                                  batch_size=bs)

    val_dataloader = DataLoader(val_dataset,
                                sampler=SequentialSampler(val_dataset),
                                batch_size=bs)
    
    return train_dataloader, val_dataloader

In [81]:
plot_train_dataloader, plot_val_dataloader = create_dataloaders(plot_train_dataset, plot_val_dataset, BATCH_SIZE)

In [82]:
configuration = GPT2Config(vocab_size=len(tokenizer), n_positions=MAX_SEQUENCE_LENGTH).from_pretrained('gpt2', output_hidden_states=True)

HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=665.0), HTML(value='')))




In [83]:
# helper function for logging time
def format_time(elapsed):
    return str(datetime.timedelta(seconds=int(round((elapsed)))))

In [84]:
# hyperparameters

learning_rate = 5e-4
eps = 1e-8
warmup_steps = 1e2

In [85]:
# create text generation seed prompt
device = torch.device('cuda')
# device = torch.device('cpu')
prompt = "<BOS>"
generated = torch.tensor(tokenizer.encode(prompt)).unsqueeze(0)
generated = generated.to(device)

In [86]:
def create_model(train_dataloader, val_dataloader, file_name):

    model = GPT2LMHeadModel.from_pretrained('gpt2', config=configuration)
    model.resize_token_embeddings(len(tokenizer))
    
    model.cuda()
    optimizer = AdamW(model.parameters(), lr=learning_rate, eps=eps)

    total_steps = len(train_dataloader) * EPOCHS
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps=warmup_steps,
                                                num_training_steps=total_steps)
    
    total_t0 = time.time()
    model = model.to(device)

    for epoch_i in range(0, EPOCHS):

        print(f'Epoch {epoch_i + 1} of {EPOCHS}')

        t0 = time.time()
        total_train_loss = 0
        model.train()

        for step, batch in enumerate(train_dataloader):

            b_input_ids = batch[0].to(device)
            b_labels = batch[0].to(device)
            b_masks = batch[1].to(device)

            model.zero_grad()        

            outputs = model(b_input_ids,
                            labels=b_labels, 
                            attention_mask=b_masks,
                            token_type_ids=None)

            loss = outputs[0]  

            batch_loss = loss.item()
            total_train_loss += batch_loss

            if step % SAMPLE_EVERY == 0 and step != 0:
                
                model.eval()
                sample_outputs = model.generate(
                                        generated,
                                        do_sample=True,   
                                        top_k=50, 
                                        max_length=200,
                                        top_p=0.95, 
                                        num_return_sequences=1
                                    )
                for i, sample_output in enumerate(sample_outputs):
                      print(f'Example output: {tokenizer.decode(sample_output, skip_special_tokens=True)}')

                model.train()

            loss.backward()
            optimizer.step()
            scheduler.step()

        avg_train_loss = total_train_loss / len(train_dataloader)       
        training_time = format_time(time.time() - t0)

        print(f'Average Training Loss: {avg_train_loss}. Epoch time: {training_time}')

        t0 = time.time()

        print('Evaluating Model')

        model.eval()

        total_eval_loss = 0
        nb_eval_steps = 0

        for batch in val_dataloader:
            b_input_ids = batch[0].to(device)
            b_labels = batch[0].to(device)
            b_masks = batch[1].to(device)

            with torch.no_grad():        

                outputs  = model(b_input_ids,  
                                 attention_mask=b_masks,
                                 labels=b_labels)

                loss = outputs[0]  

            batch_loss = loss.item()
            total_eval_loss += batch_loss        

        avg_val_loss = total_eval_loss / len(val_dataloader)

        validation_time = format_time(time.time() - t0)    

        print(f'Validation loss: {avg_val_loss}. Validation Time: {validation_time}')

    print(f'Total training took {format_time(time.time()-total_t0)}')

    torch.save(model.state_dict(), home_directory + file_name)
    return model

In [87]:
plot_model = create_model(plot_train_dataloader, plot_val_dataloader, 'plot_model.pth')

HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=548118077.0), HTML(value='')))




AssertionError: Torch not compiled with CUDA enabled