In [1]:
import os
import random
import time

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns

import torch
from torch import nn
from torch.nn import functional as F
from torch.nn import CrossEntropyLoss
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler, random_split
torch.manual_seed(42)

from transformers import pipelines
from transformers import GPT2LMHeadModel,  GPT2Tokenizer, GPT2Config, GPT2LMHeadModel
from transformers import AdamW, get_linear_schedule_with_warmup

from pymongo import MongoClient

from tqdm.notebook import tqdm_notebook as tqdm

import nltk
nltk.download('punkt')

import warnings
%matplotlib inline
warnings.filterwarnings('ignore')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Nero_\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
def time_calc(secs):
    '''Convert a duration in seconds to an ``HH:MM:SS`` string.

    Args:
        secs: Elapsed time in seconds (int or float); the fractional part
            is dropped by ``time.gmtime``.

    Returns:
        str: The duration formatted with ``%H:%M:%S``. The hour field is an
        hour-of-day value, so durations of 24 hours or more wrap around.
    '''
    # Bug fix: the body previously read the undefined name `sec`, which raised
    # NameError on every call; the parameter is `secs`.
    sec_time = time.gmtime(secs)
    return time.strftime("%H:%M:%S", sec_time)

In [10]:
# Open a MongoDB client; no arguments are passed, so the driver's defaults are used.
client = MongoClient()

In [11]:
# Select the 'reviewdata' database on that client.
db = client['reviewdata']

In [12]:
# Collection the review documents are read from further down.
# NOTE(review): 'reciewdata' looks like a typo of 'reviewdata' — confirm against the
# actual collection name in the database before renaming it.
collection = db['reciewdata']

In [16]:
# Scratch check: 50e6 written out as an integer. The value is only displayed, not stored.
int(50e6)

50000000

In [26]:
# Draw 30,000 distinct indices from [0, 100000) without replacement.
r_idx = random.sample(range(100000), 30000)

In [32]:
# Collect the text of documents 100000..129999 from the collection.
test = []  # review texts, in cursor order
ind = []   # positions (within the slice) of documents that have no 'reviewText' field

reviews = collection.find()

for position, document in enumerate(reviews[100000:130000]):
    if 'reviewText' in document:
        test.append('{}'.format(document['reviewText']))
    else:
        ind.append(position)

In [30]:
test1 = reviews[
test1

{'_id': ObjectId('5fb2d5d189d5902ec8a73614'),
 'overall': 3,
 'verified': False,
 'reviewTime': '02 16, 2004',
 'reviewerID': 'A1UOXCPSW03GZ6',
 'asin': '0002005549',
 'style': {'Format:': ' Hardcover'},
 'reviewerName': 'CSL',
 'reviewText': 'Using facts of computer programming and multitudes of biological research, Crichton has collaborated another page-turner on the futuristic insights of human progress.  With all of the supporting evidence, one might say that this scenario could actually occur.  Already, scientists are cloning and there\'s nothing stopping them from making anything biologically dangerous out of genetic manipulation.\nThe idea of the book is that technology can be too much for us to handle.  Though geniuses we are, we still make stupid mistakes.  In this case: man lives, man creates, and man gets into trouble - or really just one man gets to clean up the mess.\nThis one man turns out to be the typical three-kids-day-care dad who knows how to program codes.  He strug

In [None]:
# Single-column frame holding one review text per row.
review_data = pd.DataFrame(test, columns=['reviews'])

In [None]:
# Show the review frame as this cell's output.
review_data

In [None]:
# Count missing values per column.
review_data.isna().sum()

In [None]:
# Copy the frame under the name `reviews`.
# NOTE: this rebinds `reviews`, which earlier referred to the Mongo cursor.
reviews = review_data.copy()

In [None]:
# Keep only the text column as a Series.
# Reading from `review_data` (instead of re-indexing `reviews` itself) makes this cell
# idempotent: previously a second run indexed the Series with 'reviews' and raised KeyError.
reviews = review_data['reviews'].copy()

In [None]:
# Number of NLTK word tokens in each review, as a NumPy array.
reviewlen = np.array([len(nltk.word_tokenize(review)) for review in tqdm(reviews)])

# Distribution of review lengths.
sns.distplot(reviewlen)

In [None]:
# Percentage of reviews longer than 768 word tokens.
long_reviews = reviewlen[reviewlen > 768]
len(long_reviews) / len(reviewlen) * 100

In [None]:
# Mean review length in word tokens, rounded to three decimals.
avg_review_len = round(np.average(reviewlen), 3)
print('Average review length: {} words.'.format(avg_review_len))

In [None]:
# Longest review, in word tokens.
max_review_len = reviewlen.max()
print('Max review length: {} words.'.format(max_review_len))

In [None]:
# Load the pretrained 'gpt2' tokenizer with custom begin / end / padding marker strings.
special_tokens = dict(bos_token='<|sot|>', eos_token='<|eot|>', pad_token='<|pad|>')
tokenizer = GPT2Tokenizer.from_pretrained('gpt2', **special_tokens)

In [None]:
# Report the tokenizer's maximum length and the ids assigned to the special tokens.
print("Max model length is {} for this model".format(tokenizer.model_max_length))

special_token_reports = (
    ("Beginning of sentence token {} token has the id {}", tokenizer.bos_token_id),
    ("End of sentence token {} has the id {}", tokenizer.eos_token_id),
    ("Padding token {} has the id {}", tokenizer.pad_token_id),
)
for template, token_id in special_token_reports:
    print(template.format(tokenizer.convert_ids_to_tokens(token_id), token_id))

In [None]:
class GPT_Finetune_Dataset(Dataset):
    """Pre-tokenized text samples for fine-tuning.

    Every text is wrapped as ``'<|sot|>' + text + '<|eot|>'`` and passed to the
    tokenizer with truncation and ``padding="max_length"``. The resulting
    ``input_ids`` and ``attention_mask`` are stored as tensors.

    Args:
        txt_list: Iterable of strings to encode.
        tokenizer: Callable returning a mapping with ``'input_ids'`` and
            ``'attention_mask'`` entries.
        gpt2_type: Accepted for interface compatibility; not used by this class.
        max_length: Value forwarded to the tokenizer as ``max_length``.
    """

    def __init__(self, txt_list, tokenizer, gpt2_type="gpt2", max_length=200):
        self.tokenizer = tokenizer
        self.input = []  # one input-id tensor per text
        self.attn = []   # the matching attention-mask tensors

        for text in tqdm(txt_list):
            ids, mask = self._encode(text, max_length)
            self.input.append(ids)
            self.attn.append(mask)

    def _encode(self, text, max_length):
        """Tokenize one wrapped text and return ``(input_ids, attention_mask)`` tensors."""
        wrapped = '<|sot|>' + text + '<|eot|>'
        encoded = self.tokenizer(wrapped,
                                 truncation=True,
                                 max_length=max_length,
                                 padding="max_length")
        return torch.tensor(encoded['input_ids']), torch.tensor(encoded['attention_mask'])

    def __len__(self):
        """Number of encoded samples."""
        return len(self.input)

    def __getitem__(self, idx):
        """Return the ``(input_ids, attention_mask)`` pair stored at ``idx``."""
        return self.input[idx], self.attn[idx]

In [None]:
# Tokenize every review up front; the class's default max_length of 200 applies
# because no max_length is passed here.
data = GPT_Finetune_Dataset(reviews, tokenizer)

In [None]:
# 70 / 30 random split; the test part takes whatever remains after rounding down.
n_samples = len(data)
train_size = int(n_samples * .7)
test_size = n_samples - train_size

train_set, test_set = random_split(data, [train_size, test_size])

In [None]:
# Report the size of each split, one line per split.
print('{} training samples'.format(train_size),
      '{} test samples'.format(test_size),
      sep='\n')

In [None]:
batch_size = 3  # kept small for memory reasons

# Training batches are drawn in random order.
train_sampler = RandomSampler(train_set)
train_dataloader = DataLoader(train_set, sampler=train_sampler, batch_size=batch_size)

# Evaluation batches are read in order; the order does not matter here.
test_sampler = SequentialSampler(test_set)
test_dataloader = DataLoader(test_set, sampler=test_sampler, batch_size=batch_size)

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
device

In [None]:
# Seed every RNG *before* the model is built: resizing the embedding matrix below
# initializes the rows for the added tokens randomly, so seeding afterwards (as this
# cell did before) left that initialization dependent on whatever ran earlier.
seed_val = 42
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

# Get config. The special-token ids are taken from the tokenizer so that generation
# and the saved config agree with the custom '<|sot|>' / '<|eot|>' / '<|pad|>' tokens
# instead of the stock GPT-2 end-of-text id.
configuration = GPT2Config.from_pretrained(
    'gpt2',
    output_hidden_states=False,
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.pad_token_id,
)

# Model instantiation
model = GPT2LMHeadModel.from_pretrained("gpt2", config=configuration)

# Necessary because of the custom tokens
model.resize_token_embeddings(len(tokenizer))

# Move the model to the selected device. `model.cuda()` was used here before, which
# raises on a machine without CUDA even though `device` falls back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

In [None]:
# Training hyperparameters
epochs = 5            # full passes over the training set
learning_rate = 5e-5  # peak learning rate handed to the optimizer
warmup_steps = 50     # warm-up steps handed to the LR scheduler

# A sample text is generated every `sample_every` training steps.
sample_every = 100

In [None]:
# Optimizer: AdamW (imported from transformers) over all model parameters.
# It performs the weight updates only; the learning-rate schedule is built separately
# with get_linear_schedule_with_warmup.
optimizer = AdamW(model.parameters(), lr=learning_rate)

In [None]:
# The scheduler is stepped once per batch, so the step budget is batches-per-epoch times epochs.
total_steps = epochs * len(train_dataloader)

# Linear schedule with warm-up, adjusting the learning rate as training proceeds.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=warmup_steps,
    num_training_steps=total_steps,
)

In [None]:
# Fine-tuning loop: for every epoch, one pass over the training batches followed by one
# evaluation pass over the held-out batches. Per-epoch losses and timings go into `stats`.
stats = []

model = model.to(device)

for epoch_i in range(0, epochs):

    # ---------------- Training ----------------
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))

    timestat = time.time()  # start of this epoch's training pass

    total_train_loss = 0

    model.train()

    for step, batch in enumerate(train_dataloader):

        b_input_ids = batch[0].to(device)
        b_masks = batch[1].to(device)
        # The targets are the inputs themselves. Positions that are padding
        # (attention mask == 0) are set to -100 so the loss ignores them; previously the
        # raw input ids were used as labels, so the loss was also computed on '<|pad|>'.
        b_labels = b_input_ids.masked_fill(b_masks == 0, -100)

        model.zero_grad()

        outputs = model(b_input_ids,
                        labels=b_labels,
                        attention_mask=b_masks,
                        token_type_ids=None)

        loss = outputs[0]

        batch_loss = loss.item()
        total_train_loss += batch_loss

        # Print progress and one generated sample every `sample_every` batches (not at step 0).
        if step % sample_every == 0 and not step == 0:

            elapsed = time.time() - timestat
            print('  Batch {:>5,}  of  {:>5,}. Loss: {:>5,}.   Elapsed: {:}.'.format(step, len(train_dataloader), batch_loss, elapsed))

            model.eval()

            sample_outputs = model.generate(
                # A random token id is used as the start token of the sample.
                bos_token_id=random.randint(1,30000),
                do_sample=True,
                top_k=50,
                max_length=200,
                top_p=0.95,
                num_return_sequences=1,
                # Stop at the tokenizer's '<|eot|>' and pad with '<|pad|>'; without these the
                # model's own configured ids are used, which need not match the custom tokens.
                eos_token_id=tokenizer.eos_token_id,
                pad_token_id=tokenizer.pad_token_id,
            )
            for i, sample_output in enumerate(sample_outputs):
                print("{}: {}".format(i, tokenizer.decode(sample_output, skip_special_tokens=True)))

            # Back to training mode before the backward pass / next batch.
            model.train()

        loss.backward()

        optimizer.step()

        scheduler.step()

    # Calculate the average loss over all of the batches.
    avg_train_loss = total_train_loss / len(train_dataloader)

    # Measure how long this epoch took.
    training_time = time.time() - timestat

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epoch took: {:}".format(training_time))

    # ---------------- Validation ----------------
    print("Running Validation...")

    t0 = time.time()  # start of the validation pass

    model.eval()

    total_eval_loss = 0

    # Evaluate data for one epoch
    for batch in test_dataloader:

        b_input_ids = batch[0].to(device)
        b_masks = batch[1].to(device)
        # Same padding-aware labels as in training so both losses are comparable.
        b_labels = b_input_ids.masked_fill(b_masks == 0, -100)

        with torch.no_grad():

            outputs = model(b_input_ids,
                            attention_mask=b_masks,
                            labels=b_labels)

            loss = outputs[0]

        batch_loss = loss.item()
        total_eval_loss += batch_loss

    avg_val_loss = total_eval_loss / len(test_dataloader)

    # Bug fix: measured from `t0`. It used to be measured from `timestat`, which made the
    # reported validation time include the whole training pass of the epoch.
    validation_time = time.time() - t0

    print("  Validation Loss: {0:.2f}".format(avg_val_loss))
    print("  Validation took: {:}".format(validation_time))

    # Record all statistics from this epoch.
    stats.append(
        {
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Training Time': training_time,
            'Validation Time': validation_time
        }
    )
torch.cuda.empty_cache()

In [None]:
# Per-epoch statistics as a table indexed by epoch number.
training_df = pd.DataFrame(data=stats).set_index('epoch')

training_df

In [None]:
# Training vs. held-out loss per epoch.
sns.set(font_scale=1.5)
plt.rcParams["figure.figsize"] = (12,6)
plt.plot(training_df['Training Loss'], 'r-o', label="Training")
plt.plot(training_df['Valid. Loss'], 'b-o', label="Testing")
plt.title("Training & Testing Loss")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.legend()
# One tick per recorded epoch. The ticks were hard-coded to [1, 2, 3, 4], which dropped
# the last epoch once `epochs` was set to 5.
plt.xticks(list(training_df.index))
plt.show()

In [None]:
# Materialize the model's (name, parameter) pairs as a list.
params = list(model.named_parameters())

In [None]:
# Print the name of every model parameter.
for name, _tensor in params:
    print(name)

In [None]:
# Directory the fine-tuned model and tokenizer are written to.
output_dir = './model_save/'

# `exist_ok=True` creates the directory when missing and is a no-op when it is already
# there, replacing the separate exists-check (which could race with another process).
os.makedirs(output_dir, exist_ok=True)

In [None]:
# Announce the save location.
print("Saving model to {}".format(output_dir))

In [None]:
# If the model is wrapped in something exposing `.module`, save the wrapped model;
# otherwise save the model itself. The tokenizer is stored alongside it.
model_to_save = getattr(model, 'module', model)
model_to_save.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

In [None]:
# Build generation prompts from the first four whitespace-separated words of the first
# ten reviews. Non-string entries are skipped explicitly; this replaces a bare
# `except: pass`, which silently hid every kind of error.
# NOTE(review): a space follows '<|sot|>' here, while the dataset class builds its
# training text as '<|sot|>' + txt with no space — confirm the difference is intended.
prompts = [
    "<|sot|> " + ' '.join(review.split()[:4])
    for review in reviews[:10]
    if isinstance(review, str)
]

In [None]:
# Show the prompts as this cell's output.
prompts

In [None]:
# Inference mode for generation.
model.eval()

# Encode each prompt, add a leading batch dimension and move it to the model's device.
gen_prompt = [
    torch.tensor(tokenizer.encode(prompt)).unsqueeze(0).to(device)
    for prompt in prompts
]

In [None]:
# Generate 20 sampled continuations for every encoded prompt, print them, and collect
# the decoded texts in `reviewlist`.
reviewlist = []
t = time.time()  # start of the whole generation run
for x, promp in enumerate(gen_prompt):
    t1 = time.time()  # start of this prompt's generation
    print('---------------------------------')
    print('''{}: The prompt is "{}"'''.format(x+1, prompts[x]))
    print('---------------------------------')
    sample_outputs = model.generate(
        promp,
        # The random `bos_token_id` passed here before was dropped: the prompt tensor
        # already supplies the start of the sequence, and the sampled id (up to 100000)
        # could lie outside the vocabulary.
        do_sample=True,
        top_k=30,
        min_length=20,
        max_length=500,
        top_p=0.95,
        num_return_sequences=20,
        # Stop at the tokenizer's '<|eot|>' and pad finished sequences with '<|pad|>';
        # without these the model's own configured ids are used, which need not match
        # the custom tokens.
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.pad_token_id,
    )

    for i, sample_output in enumerate(sample_outputs):
        # Decode once and reuse the text for both printing and collecting.
        generated_text = tokenizer.decode(sample_output, skip_special_tokens=True)
        print("{}: {}\n\n".format(i, generated_text))
        reviewlist.append(generated_text)
    time_per_gen = time.time() - t1
    print('This generation took {} seconds'.format(round(time_per_gen, 3)))
# Bug fix: elapsed time is now - start. The operands were reversed, producing a
# negative duration that was then handed to time_calc.
totes = time.time() - t
print('Total time was {}'.format(time_calc(totes)))
torch.cuda.empty_cache()