In [None]:
import json
import os
import time
import pickle

import seaborn as sns
import matplotlib.pylab as plt
import numpy as np
import pandas as pd

import importlib
import nltk

nltk.download('punkt')

In [None]:
!nvidia-smi

In [None]:
# Install necessary packages

! pip install datasets
! pip install sentencepiece
! pip install rouge_score
! pip install wandb
! pip install bert-score
! pip install evaluate
! pip install transformers -U
! pip install bert-score
! pip install bertviz

In [None]:
# Connect Code to Google Drive (if necessary)
# NOTE: this cell imports `google.colab`, so it only runs inside Google Colab.

from google.colab import drive
# force_remount=True is passed so the mount call is repeated even on re-runs of this cell.
drive.mount('/content/drive', force_remount=True)

# Project root on the mounted Drive. Keep the trailing slash: later cells build
# paths by plain string concatenation (e.g. `workdir + input_filename`).
gdrive_dir = '/content/drive/MyDrive/CS 282 Project/Checkpoint 2/cs282-project/'

# Configuration

In [None]:
# Root directory for the input file and the saved model checkpoints
workdir = gdrive_dir

# Name of the input JSON file, resolved relative to `workdir`
input_filename = 'expanded_all_data.json'

# Number of rows to sample from the input data; a falsy value (None / 0) disables sampling.
# NOTE(review): the previous comment stated that the full dataset has 15000 (review, paper)
# rows and that a 5000-row sub-sample is used, but the value below is 25 - presumably a
# quick smoke-test setting. Confirm the intended size before a real training run.
sample_size = 25 

# Random seed used when sub-sampling the input data
seed = 100

# Specify Extraction Method used to Generate Training Text from Papers
# Either: (intro, ce_extract, hybrid)
extraction_method = 'hybrid'

# Maximum Token Length of Paper Extracts Used To Train Model
max_paper_extract_length = 1024 # 1024 is the max input size of a BART model
# Maximum token length of the target review, and minimum token length used for
# both the filter defaults and generation `min_length`
max_review_length = 1024
min_text_length = 100

# Pre-Trained Hugging Face Seq2Seq Transformers Model
pre_trained_model_checkpoint = "facebook/bart-large-cnn"

# Summarization Task Configuration
# Passed to the model as `task_specific_params` when it is loaded.
summarization_params = {
    "summarization": {
        "early_stopping": True,
        "length_penalty": 2.0, # BART (favor longer sequences)
        "max_length": max_review_length,
        "min_length": min_text_length,
        "no_repeat_ngram_size": 3, # BART default
        "num_beams": 4 # BART default
    }
}

## Load DataSet

In [None]:
from datasets import concatenate_datasets, DatasetDict, Dataset, load_dataset
from sklearn.model_selection import train_test_split

# Read the pre-processed records (paper text already extracted / down-sampled)
# from the JSON file that lives under the working directory.
input_path = workdir + input_filename
all_input_df = pd.read_json(input_path, orient='records')

Load Tokenizer For Filtering

In [None]:
from transformers import AutoTokenizer

# Load the tokenizer that belongs to the configured pre-trained model checkpoint
tokenizer = AutoTokenizer.from_pretrained(pre_trained_model_checkpoint)

In [None]:
# Token-length filter. It is not applied at the moment because it shrinks the
# dataset available for the CE_extraction method.
def filter_on_token_length(df, tokenizer, text_col, text_min=min_text_length,
                           text_max=max_paper_extract_length, review_min=min_text_length,
                           review_max=max_review_length):
    """Keep only the rows whose input text and review fit the token-length bounds.

    Args:
        df: DataFrame containing `text_col` and a 'review' column.
        tokenizer: callable returning a mapping with an 'input_ids' entry.
        text_col: name of the column holding the model input text.
        text_min, text_max: inclusive token-count bounds for `text_col`.
        review_min, review_max: inclusive token-count bounds for 'review'.

    Returns:
        The subset of `df` for which both columns satisfy their bounds.
    """

    def _token_count(text):
        # Tokenize without truncation so the full length is measured.
        encoded = tokenizer(text, max_length=None, truncation=False)
        return len(encoded['input_ids'])

    text_ok = df[text_col].apply(lambda s: text_min <= _token_count(s) <= text_max)
    review_ok = df['review'].apply(lambda s: review_min <= _token_count(s) <= review_max)

    return df[text_ok & review_ok]

In [None]:
# Token-length filtering is currently disabled, so the full input frame is used.
# To enable it, replace the assignment below with a call such as:
#   filter_on_token_length(all_input_df, tokenizer, <text column>)
filt_input_df = all_input_df

# Optionally sub-sample. The sample size is capped at the number of available rows
# so a too-large `sample_size` does not make `DataFrame.sample` raise.
if sample_size:
    sample_df = filt_input_df.sample(min(sample_size, len(filt_input_df)), random_state=seed)
else:
    sample_df = filt_input_df

input_dataset = Dataset.from_pandas(sample_df)

# Seed the split as well, so the train/test partition is reproducible across runs.
input_dataset_dict = input_dataset.train_test_split(test_size=0.20, seed=seed)

In [None]:
input_dataset_dict

## Tokenize DataSet

In [None]:
def preprocess_function_for_review_task(paper_json, extraction_method='intro'):
    """Tokenize one record into model inputs and labels for review generation.

    Args:
        paper_json: mapping with a 'review' entry plus the text fields needed by
            the chosen method ('intro', 'ce_extract', or 'ce_extract' + 'abstract').
        extraction_method: one of 'intro', 'ce_extract' or 'hybrid'.

    Returns:
        The tokenizer output for the input text, with an added "labels" entry
        holding the token ids of the review.

    Raises:
        ValueError: if `extraction_method` is not one of the supported values.
    """
    if extraction_method == 'intro':
        input_text = paper_json['intro']
    elif extraction_method == 'ce_extract':
        input_text = paper_json['ce_extract']
    elif extraction_method == 'hybrid':
        # NOTE(review): the two texts are joined without a separator, so the last
        # word of the extract and the first word of the abstract may fuse. Left
        # unchanged to stay consistent with already-trained checkpoints - confirm.
        input_text = paper_json['ce_extract'] + paper_json['abstract']
    else:
        # Fail early with a clear message instead of hitting an unbound variable below.
        raise ValueError(
            f"Unknown extraction_method {extraction_method!r}; "
            "expected one of 'intro', 'ce_extract', 'hybrid'"
        )

    # Inputs longer than the configured maximum are truncated.
    model_inputs = tokenizer(
        input_text,
        max_length=max_paper_extract_length,
        truncation=True,
    )

    # Tokenize the review as the generation target.
    labels = tokenizer(
        text_target=paper_json["review"],
        max_length=max_review_length, truncation=True
    )
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [None]:
def _tokenize_example(example):
    """Tokenize a single record using the globally configured extraction method."""
    return preprocess_function_for_review_task(example, extraction_method=extraction_method)

# Apply the tokenization to every record of every split
tokenized_dataset_for_reviews = input_dataset_dict.map(_tokenize_example)

In [None]:
tokenized_dataset_for_reviews # validate train / test sizes

In [None]:
len(tokenized_dataset_for_reviews['train'][0]['input_ids']) # validate that tokenizer creates encoding of expected token length

# Model Training

## Load Pre-Trained Model

In [None]:
# Last Saved Checkpoint - if training model from previous saved state 
# model_checkpoint = workdir + 'outputmodel/review_generation/' + extraction_method

In [None]:
local_checkpoint = workdir + 'outputmodel/review_generation/' + extraction_method

In [None]:
from transformers import BartForConditionalGeneration

# Load the BART model to fine-tune.
# Resume from the locally saved model only when one actually exists on disk;
# otherwise start from the pre-trained Hugging Face checkpoint. A plain
# `if local_checkpoint:` is always true for a non-empty path string, which made a
# first run (no saved model yet) fail. The choice is kept in its own variable so
# `pre_trained_model_checkpoint` is not silently rebound for later cells.
has_local_checkpoint = bool(local_checkpoint) and os.path.isfile(
    os.path.join(local_checkpoint, 'config.json')
)
model_checkpoint = local_checkpoint if has_local_checkpoint else pre_trained_model_checkpoint

model = BartForConditionalGeneration.from_pretrained(model_checkpoint,
                                                     max_length=max_review_length,
                                                     min_length=min_text_length,
                                                     task_specific_params=summarization_params)

## Fine-Tune Bart Model using Tokenized, Extracted Training Set

### Evaluation Metrics

In [None]:
import numpy as np
import nltk
from nltk import sent_tokenize
import bert_score
import evaluate

rouge_score = evaluate.load("rouge")

# Compute Evaluation Metrics on Seq2Seq Predictions
def compute_metrics(eval_pred):
    """Compute ROUGE and BERTScore for a batch of generated reviews.

    Args:
        eval_pred: pair `(predictions, labels)` of token-id arrays.

    Returns:
        dict mapping metric name to value: the ROUGE results rounded to 4 decimals,
        plus the mean precision / recall / F1 of BERTScore, both raw
        (`Raw_BertScore_*_mean`) and baseline-rescaled (`Scaled_BertScore_*_mean`),
        as plain Python floats.
    """
    predictions, labels = eval_pred
    # NOTE(review): presumably predictions can arrive as a tuple with the generated
    # ids first (the Hugging Face example scripts guard for this) - harmless otherwise.
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # -100 cannot be decoded, so swap it for the pad token in both arrays.
    predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    # Decode generated and reference reviews into text
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # ROUGE expects a newline after each sentence
    decoded_preds = ["\n".join(sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(sent_tokenize(label.strip())) for label in decoded_labels]

    # Compute ROUGE scores
    rouge_results = rouge_score.compute(
        predictions=decoded_preds, references=decoded_labels, use_stemmer=True
    )
    results = {k: round(v, 4) for k, v in rouge_results.items()}

    # Compute BERT Scores, once raw and once rescaled with the baseline
    # https://github.com/Tiiiger/bert_score/blob/master/example/Demo.ipynb
    # https://arxiv.org/pdf/1904.09675.pdf
    for prefix, rescale in (('Raw', False), ('Scaled', True)):
        bs_prec, bs_recall, bs_f1 = bert_score.score(decoded_preds, decoded_labels, lang='en',
                                                     verbose=False, rescale_with_baseline=rescale)
        # `.item()` turns the 0-d tensor means into plain floats
        results.update({f'{prefix}_BertScore_F1_mean': bs_f1.mean().item(),
                        f'{prefix}_BertScore_Recall_mean': bs_recall.mean().item(),
                        f'{prefix}_BertScore_Precision_mean': bs_prec.mean().item(),
                        })

    return results

### Training Arguments

In [None]:
## Hyperparameters
# Selected based on Hugging Face Guide & Confirmed with validation on small samples
num_train_epochs = 1
learning_rate = 5.0e-5
weight_decay = 0.001

In [None]:
from transformers import Seq2SeqTrainingArguments

# Max Batch Size supported by GPU
batch_size = 1

# Show the training loss with every epoch. Clamp to at least 1 so a batch size
# larger than the training set cannot produce logging_steps == 0.
logging_steps = max(1, len(tokenized_dataset_for_reviews["train"]) // batch_size)

# Output location (relative to `workdir`) for checkpoints and the final model
model_name = 'outputmodel/review_generation/' + extraction_method

os.environ["WANDB_DISABLED"] = "true" # disable wandb
args = Seq2SeqTrainingArguments(
    output_dir= workdir + model_name,
    evaluation_strategy="epoch",
    learning_rate=learning_rate,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=weight_decay,
    save_strategy='epoch',
    save_total_limit=2,
    num_train_epochs=num_train_epochs,
    predict_with_generate=True,
    generation_max_length=max_review_length,
    logging_steps=logging_steps,
    # The string "none" disables all reporting integrations; Python `None` is
    # interpreted as "all" by transformers 4.x.
    report_to="none",
    # Reuse the notebook-wide seed so training is reproducible
    seed=seed,
)

### Training Function

In [None]:
# import torch
# import gc

# del trainer
# torch.cuda.empty_cache()
# gc.collect()
# torch.cuda.memory_summary(device=None, abbreviated=False)

In [None]:
from transformers import Seq2SeqTrainer
from transformers import DataCollatorForSeq2Seq

# Collator built from the tokenizer and the model being trained
seq2seq_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Assemble the trainer from the model, arguments, tokenized splits and metric function
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    train_dataset=tokenized_dataset_for_reviews["train"],
    eval_dataset=tokenized_dataset_for_reviews["test"],
    data_collator=seq2seq_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Fine-tune on the tokenized training split
trainer.train()

# Persist the resulting model to the configured output directory
trainer.save_model()

In [None]:
# tokenized_dataset_for_reviews['train'][0:5]

## Evaluate Model

In [None]:
txt = tokenized_dataset_for_reviews['train'][0][extraction_method]

In [None]:
from transformers import AutoTokenizer
from transformers import AutoModelForSeq2SeqLM

# Reload the tokenizer and model from the training output directory
saved_model_dir = workdir + model_name
tokenizer = AutoTokenizer.from_pretrained(saved_model_dir)
model = AutoModelForSeq2SeqLM.from_pretrained(saved_model_dir)

# Encode the sample text (with truncation) and generate from it
inputs = tokenizer(txt, return_tensors="pt", truncation=True).input_ids
outputs = model.generate(inputs)

# Concatenate the decoded text of every generated sequence
decoded_output = "".join(
    tokenizer.decode(output, skip_special_tokens=True) for output in outputs
)

In [None]:
print(decoded_output)

In [None]:
# preds = trainer.predict(tokenized_dataset_for_reviews['test'])

In [None]:
trainer.evaluate()

In [None]:
model = trainer.model

In [None]:
# Predict on the tokenized training split. The dataset produced by the tokenization
# step is named `tokenized_dataset_for_reviews`; `tokenized_dataset` is not defined.
preds = trainer.predict(tokenized_dataset_for_reviews['train'])

In [None]:
# `torch` is only imported in a later cell, so import it here to keep this cell
# runnable when the notebook is executed top to bottom.
import torch

# Wrap every top-level child module of `model` except the last one in a Sequential
newmodel = torch.nn.Sequential(*(list(model.children())[:-1]))
print(newmodel)

In [None]:
# Tokenize first, then inspect. The tokenization used to come after the two
# length checks, which fails on a fresh kernel.
tokenized_txt = tokenizer(txt)

# (number of tokens, attention-mask length) for the first encoding
len(tokenized_txt.encodings[0].tokens), len(tokenized_txt.encodings[0].attention_mask)

In [None]:
# input_text = input_df.head(1)['hybrid'].values[0]
# review =  input_df.head(1)['review'].values[0]
# model_inputs = tokenizer(
#         input_text,
#         max_length=max_paper_extract_length,
#         truncation=True,
#     )

# labels = tokenizer(
#     text_target=review, max_length=max_review_length, truncation=True
# )
# model_inputs["labels"] = labels["input_ids"]
# # print(len(model_inputs['labels']))
# # print(len(paper_json['text']))

In [None]:
from transformers import BartForConditionalGeneration
from transformers import LongformerConfig, LongformerModel

# Initializing a Longformer configuration
# (no arguments are passed, so the library's default configuration values are used)
configuration = LongformerConfig()

# Initializing a model from the configuration
# NOTE(review): this rebinds the notebook-level name `model`, which an earlier cell
# set to `trainer.model`. The model is built from a configuration only, not from a
# checkpoint - presumably an experiment; confirm it is still needed.
model = LongformerModel(configuration)

# Accessing the model configuration
configuration = model.config

Note that the token length exceeds 1024, which is the maximum input size for BART, so some of the input will be truncated. We were surprised that this was the case for the cross-entropy inputs, since we used the authors' default configuration for downsampling the paper to an input text and the authors state that they use BART for their Seq2Seq models. It's possible that they are using another tokenizer.

In any case, due to time constraints (of re-running cross-entropy extraction and/or determining a different tokenizer/vocabulary file to use), we truncate inputs for now and leave further investigation to the next steps. 

# Reproduce Results

In [None]:
# Linear Model On Top of Custom Model

In [None]:
# Aspect Scores

In [None]:
!pip install accelerate[torch]
!pip3 install torch torchvision

In [None]:
import os
import torch
from torch import nn
# from torchvision.datasets import CIFAR10
from torch.utils.data import DataLoader
from torchvision import transforms
from collections import OrderedDict
from accelerate import Accelerator

In [None]:
class CustomModel(nn.Module):
  """Pre-trained transformer body with a dropout + linear classification head.

  Args:
    checkpoint: name or path passed to `AutoModel.from_pretrained`.
    num_labels: number of output classes of the linear head.
  """
  def __init__(self, checkpoint, num_labels):
    super(CustomModel,self).__init__()
    # Imported here because no earlier cell brings these names into scope.
    from transformers import AutoConfig, AutoModel

    self.num_labels = num_labels

    #Load Model with given checkpoint and extract its body
    self.model = AutoModel.from_pretrained(checkpoint,config=AutoConfig.from_pretrained(checkpoint))
    # Size the head from the loaded model instead of assuming 768, so checkpoints
    # with a different hidden size also work.
    self.hidden_size = self.model.config.hidden_size
    self.dropout = nn.Dropout(0.1)
    self.classifier = nn.Linear(self.hidden_size, num_labels) # load and initialize weights

  def forward(self, input_ids=None, attention_mask=None,labels=None):
    """Run the body and the head; compute cross-entropy loss when `labels` is given.

    Returns:
      A `TokenClassifierOutput` with `loss` (None without labels), `logits`, and
      the body's `hidden_states` / `attentions` when the body output provides them.
    """
    from transformers.modeling_outputs import TokenClassifierOutput

    #Extract outputs from the body
    outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)

    #Add custom layers
    sequence_output = self.dropout(outputs[0]) #outputs[0]=last hidden state

    # Classify from the representation at the first sequence position
    logits = self.classifier(sequence_output[:,0,:].view(-1,self.hidden_size))

    loss = None
    if labels is not None:
      loss_fct = nn.CrossEntropyLoss()
      loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

    # Not every body output type exposes these attributes, so fall back to None.
    return TokenClassifierOutput(loss=loss, logits=logits,
                                 hidden_states=getattr(outputs, 'hidden_states', None),
                                 attentions=getattr(outputs, 'attentions', None))

In [None]:
def train_model(model, epochs=None, debug=False):
    """Train a model and collect per-epoch statistics.

    NOTE(review): `get_model_configuration` and `get_dataset` are not defined in
    the visible part of this notebook; they must be provided before calling this.

    Args:
        model: the module to train; called as `model(inputs)`.
        epochs: number of epochs; falls back to the configuration's "num_epochs".
        debug: when True, print the running mean loss after every mini-batch.

    Returns:
        Tuple `(model, entries_df, current_loss)`: the prepared/trained model, a
        DataFrame with one 'train' and one 'test' row per epoch, and the summed
        training loss of the last epoch (0.0 if no epoch was run).
    """
    config = get_model_configuration()
    loss_function = config.get("loss_function")()
    optimizer = config.get("optimizer")(model.parameters(), lr=1e-4)
    trainloader = get_dataset()
    accelerator = Accelerator()

    # Accelerate model
    model, optimizer, trainloader = accelerator.prepare(model, optimizer, trainloader)

    if epochs is None:
        epochs = config.get("num_epochs")

    entries = []
    # Defined up-front so the return statement also works when `epochs` is 0.
    current_loss = 0.0

    for epoch in range(epochs):
        print(f'Starting epoch {epoch+1}')

        # Reset the summed loss for this epoch
        current_loss = 0.0

        # Iterate over the DataLoader for training data
        st_time = time.time()
        for i, data in enumerate(trainloader, 0):
            inputs, targets = data

            # Zero the gradients
            optimizer.zero_grad()

            # Forward pass and loss
            outputs = model(inputs)
            loss = loss_function(outputs, targets)
            current_loss += loss.item()

            # Backward pass and optimization step
            accelerator.backward(loss)
            optimizer.step()

            if debug:
                # Running mean over the mini-batches seen so far in this epoch
                print('Loss after mini-batch %5d: %.3f' %
                    (i + 1, current_loss / (i + 1)))

        end_time = time.time()

        train_entry = {'type': 'train', 'epoch': epoch,
                       'loss': current_loss, 'time': round(end_time - st_time, 1)}

        print(f'Loss: {current_loss}')

        # No evaluation pass is implemented in this function, so there is no test
        # loss to report; record NaN to keep the train/test row layout.
        test_entry = {'type': 'test', 'epoch': epoch, 'loss': float('nan')}

        entries.extend([train_entry, test_entry])

    # Return trained model
    return model, pd.DataFrame(entries), current_loss

In [None]:
## Load Data 

# Create Attention Maps

In [None]:
from transformers import BartForConditionalGeneration
from transformers import AutoTokenizer, AutoModel
from transformers import AutoModelForSeq2SeqLM



```
# This is formatted as code
```

Load pre-trained model + tokenizer

In [None]:
local_checkpoint = workdir + 'outputmodel/review_generation/' + extraction_method

# Load Tokenizer used with the corresponding pre-trained model
tokenizer = AutoTokenizer.from_pretrained(pre_trained_model_checkpoint)

# Load the BART model: prefer the fine-tuned local checkpoint, but only when it
# actually exists on disk (a non-empty path string is always truthy, so the old
# `if local_checkpoint:` test could never fall back to the pre-trained model).
if os.path.isfile(os.path.join(local_checkpoint, 'config.json')):
    pre_trained_model_checkpoint = local_checkpoint

model = BartForConditionalGeneration.from_pretrained(pre_trained_model_checkpoint,
                                                     max_length=max_review_length,
                                                     min_length=min_text_length,
                                                     task_specific_params=summarization_params)

Use BertViz package to create attention maps

In [None]:
# Number of encoder tokens for the text excerpt used in the attention maps.
# Tokenized inline because `encoder_input_ids` is only created in a later cell.
len(tokenizer(txt[0:2000], return_tensors="pt", add_special_tokens=True).input_ids[0])

In [None]:
txt[0:2000]

In [None]:
# Prepare the inputs and compute attention

# tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-de")
# model = AutoModel.from_pretrained("Helsinki-NLP/opus-mt-en-de", output_attentions=True)

# Encoder side: the first 2000 characters of the sample text
encoder_input_ids = tokenizer(txt[0:2000], return_tensors="pt", add_special_tokens=True).input_ids
# Decoder side: the previously generated review, tokenized as target text.
# `text_target=` replaces the deprecated `as_target_tokenizer()` context manager.
decoder_input_ids = tokenizer(text_target=decoded_output, return_tensors="pt", add_special_tokens=True).input_ids

# Request attentions explicitly; the model was not loaded with output_attentions=True.
outputs = model(input_ids=encoder_input_ids, decoder_input_ids=decoder_input_ids,
                output_attentions=True)

# Token strings used as axis labels in the visualization
encoder_text = tokenizer.convert_ids_to_tokens(encoder_input_ids[0])
decoder_text = tokenizer.convert_ids_to_tokens(decoder_input_ids[0])

In [None]:
# Sample text: the configured extraction column of the first training record
txt = tokenized_dataset_for_reviews['train'][0][extraction_method]
inputs = tokenizer(txt, return_tensors="pt", truncation=True).input_ids

model_name = 'outputmodel/review_generation/' + extraction_method
model = AutoModelForSeq2SeqLM.from_pretrained(workdir + model_name)

# Keep the generated ids in their own variable: `outputs` holds the forward-pass
# result whose attentions the visualization cell reads, and must not be overwritten.
generated_ids = model.generate(inputs)

# Concatenate the decoded text of every generated sequence
decoded_output = "".join(
    tokenizer.decode(sequence_ids, skip_special_tokens=True) for sequence_ids in generated_ids
)

In [None]:
# Render the attention maps with BertViz (model_view here; head_view is the alternative)
from bertviz import model_view

model_view(
    encoder_attention=outputs.encoder_attentions,
    decoder_attention=outputs.decoder_attentions,
    cross_attention=outputs.cross_attentions,
    encoder_tokens=encoder_text,
    decoder_tokens=decoder_text,
)

In [None]:
from transformers import AutoTokenizer, AutoModel, utils
from bertviz import model_view

utils.logging.set_verbosity_error()  # Remove line to see warnings

# Tokenizer and model for BART fine-tuned for summarization on CNN/Daily Mail.
# output_attentions=True is required so the forward pass returns attention weights.
model_name = "facebook/bart-large-cnn"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, output_attentions=True)

# Example source sentence and its short summary
source_sentence = "The House Budget Committee voted Saturday to pass a $3.5 trillion spending bill"
summary_sentence = "The House Budget Committee passed a spending bill."

# Token ids for the encoder and decoder sides
encoder_input_ids = tokenizer(source_sentence, return_tensors="pt", add_special_tokens=True).input_ids
decoder_input_ids = tokenizer(summary_sentence, return_tensors="pt", add_special_tokens=True).input_ids

# Forward pass producing the attention tensors
outputs = model(input_ids=encoder_input_ids, decoder_input_ids=decoder_input_ids)

# Token strings used to label the visualization
encoder_text = tokenizer.convert_ids_to_tokens(encoder_input_ids[0])
decoder_text = tokenizer.convert_ids_to_tokens(decoder_input_ids[0])

model_view(
    encoder_attention=outputs.encoder_attentions,
    decoder_attention=outputs.decoder_attentions,
    cross_attention=outputs.cross_attentions,
    encoder_tokens=encoder_text,
    decoder_tokens=decoder_text,
)