In [2]:
import json
import os
import time
import pickle

import seaborn as sns
import matplotlib.pylab as plt
import numpy as np
import pandas as pd

import importlib
import nltk

nltk.download('punkt')

# !nvidia-smi

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/cynthiachen/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
# Install necessary packages

# ! pip install datasets
# ! pip install sentencepiece
# ! pip install rouge_score
# ! pip install wandb
# ! pip install bert-score
# ! pip install evaluate
# ! pip install transformers -U
# ! pip install bert-score
# ! pip install bertviz

from transformers import utils





Collecting transformers
  Downloading transformers-4.27.4-py3-none-any.whl (6.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
Installing collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.26.1
    Uninstalling transformers-4.26.1:
      Successfully uninstalled transformers-4.26.1
Successfully installed transformers-4.27.4




In [None]:
# # Connect code to Google Drive (if necessary)

# from google.colab import drive
# drive.mount('/content/drive', force_remount=True)

# gdrive_dir = '/content/drive/MyDrive/CS 282 Project/Checkpoint 2/cs282-project/'

# Configuration

In [4]:
workdir = gdrive_dir

input_filename = 'expanded_all_data.json'

# Full DataSet Size is 15000 (review, paper), but due to GPU cost + time constraints, we use a sub-sample of 5000 reviews
sample_size = 25 

seed = 100

# Specify Extraction Method used to Generate Training Text from Papers
# Either: (intro, ce_extract, hybrid)
extraction_method = 'hybrid'

# Maximum Token Length of Paper Extracts Used To Train Model
max_paper_extract_length = 1024 # 1024 is the max input size of a BART model
max_review_length = 1024
min_text_length = 100

# Pre-Trained Hugging Face Seq2Seq Transformers Model
pre_trained_model_checkpoint = "facebook/bart-large-cnn"

# Summarization Task Configuration
summarization_params = {
    "summarization": {
        "early_stopping": True,
        "length_penalty": 2.0, # BART (favor longer sequences)
        "max_length": max_review_length,
        "min_length": min_text_length,
        "no_repeat_ngram_size": 3, # BART default
        "num_beams": 4 # BART default
    }
}

NameError: name 'gdrive_dir' is not defined

## Load DataSet

In [None]:
from datasets import concatenate_datasets, DatasetDict, Dataset, load_dataset
from sklearn.model_selection import train_test_split

# Load input data (post-extraction and pre-processing to downsample paper text)
all_input_df = pd.read_json(workdir + input_filename, orient='records')

Load Tokenizer For Filtering

In [None]:
from transformers import AutoTokenizer

# Load Tokenizer used with the corresponding pre-trained mode
tokenizer = AutoTokenizer.from_pretrained(pre_trained_model_checkpoint)

In [None]:
# Filter on token length [not currently used because it decreases the available 
# dataset size for the CE_extraction method]
def filter_on_token_length(df, tokenizer, text_col, text_min=min_text_length, 
                           text_max=max_paper_extract_length, review_min=min_text_length, 
                           review_max=max_review_length):
    
    def test_length_constraints(txt, tokenizer, mn_length, mx_length):
        tokenized_txt = tokenizer(txt, max_length=None, truncation=False)
        num_tokens = len(tokenized_txt['input_ids'])

        return (num_tokens >= mn_length) and (num_tokens <= mx_length)
      
    return df[
              (df[text_col].apply(lambda s: test_length_constraints(s, tokenizer, text_min, text_max))) &
              (df['review'].apply(lambda s: test_length_constraints(s, tokenizer, review_min, review_max)))
           ]

In [None]:
filt_input_df = all_input_df # [not currently used] filter_on_token_length(df_exp, tokenizer, extraction_method)
if sample_size:
    sample_df = filt_input_df.sample(sample_size, random_state=seed)
input_dataset = Dataset.from_pandas(sample_df if sample_size else filt_input_df)
input_dataset_dict = input_dataset.train_test_split(test_size=0.20)

In [None]:
input_dataset_dict

## Tokenize DataSet

In [None]:
def preprocess_function_for_review_task(paper_json, extraction_method='intro'):
    if extraction_method == 'intro':
        input_text = paper_json['intro']
    elif extraction_method == 'ce_extract':
        input_text = paper_json['ce_extract']
    elif extraction_method == 'hybrid':
        input_text = paper_json['ce_extract'] + paper_json['abstract']

    model_inputs = tokenizer(
        input_text,
        max_length=max_paper_extract_length,
        truncation=True,
    )

    labels = tokenizer(
        text_target=paper_json["review"], 
        max_length=max_review_length, truncation=True
    )
    model_inputs["labels"] = labels["input_ids"]
    # print(len(model_inputs['labels']))
    # print(len(paper_json['text']))
    return model_inputs

In [None]:
tokenized_dataset_for_reviews = input_dataset_dict.map(lambda s: preprocess_function_for_review_task(s, extraction_method=extraction_method))

In [None]:
tokenized_dataset_for_reviews # validate train / test sizes

In [None]:
len(tokenized_dataset_for_reviews['train'][0]['input_ids']) # validate that tokenizer creates encoding of expected token length

Note that the token length exceeds 1024, which is the maximum input size for BART, so some of the input will be truncated. We were surprised that this was teh case for the cross-entropy inputs, since we used the authors default configuration for downsampling the paper to an input text and the authors state that they use BART for their Seq2Seq models. It's possible that they are using another tokenizer.

In any case, due to time constraints (of re-running cross-entropy extraction and/or determining a different tokenizer/vocabulary file to use), we truncate inputs for now and leave further investigation to the next steps. 

# Create Attention Maps

In [None]:
from transformers import BartForConditionalGeneration
from transformers import AutoTokenizer, AutoModel
from transformers import AutoModelForSeq2SeqLM
from bertviz import model_view
from bertviz import head_view

# Example: https://github.com/jessevig/bertviz/blob/master/notebooks/model_view_encoder_decoder.ipynb

Load pre-trained model + tokenizer

In [None]:
local_checkpoint = workdir + 'outputmodel/review_generation/' + extraction_method

# Load Tokenizer used with the corresponding pre-trained mode
tokenizer = AutoTokenizer.from_pretrained(pre_trained_model_checkpoint)

# Load Pre-TrainedModel from BART
if local_checkpoint:
    pre_trained_model_checkpoint = local_checkpoint

model = BartForConditionalGeneration.from_pretrained(pre_trained_model_checkpoint, 
                                                     max_length=max_review_length, # use max_new_tokens instead?
                                                     min_length=min_text_length,
                                                     task_specific_params=summarization_params,
                                                     output_attentions=True)

# model_name = 'outputmodel/review_generation/' + extraction_method
# model = AutoModelForSeq2SeqLM.from_pretrained(workdir + model_name)

Use BertViz package to create attention maps

In [None]:
txt = tokenized_dataset_for_reviews['train'][0][extraction_method]

In [None]:
# Select data

# get encoded input vectors
inputs = tokenizer(txt[0:50], return_tensors="pt", truncation=True)
encoder_input_ids = inputs.input_ids

# generate model outputs
model_outputs = model.generate(encoder_input_ids)
decoded_output = ""
for output in model_outputs:
  decoded_output += tokenizer.decode(output, skip_special_tokens=True)

# create ids of encoded input vectors
with tokenizer.as_target_tokenizer():
    decoder_input_ids = tokenizer(decoded_output, return_tensors="pt").input_ids

In [None]:
input_text = txt[0:50]
output_text = decoded_output

In [None]:
outputs = model(input_ids=encoder_input_ids, decoder_input_ids=decoder_input_ids)

encoder_text = tokenizer.convert_ids_to_tokens(encoder_input_ids[0])
decoder_text = tokenizer.convert_ids_to_tokens(decoder_input_ids[0])

In [None]:
# Display the visualization using either head_view or model_view
head_view(
    encoder_attention=outputs.encoder_attentions,
    decoder_attention=outputs.decoder_attentions,
    cross_attention=outputs.cross_attentions,
    encoder_tokens=encoder_text,
    decoder_tokens=decoder_text
)

In [None]:
# from transformers import AutoTokenizer, AutoModel, utils
# from bertviz import model_view

# utils.logging.set_verbosity_error()  # Remove line to see warnings

# # Initialize tokenizer and model. Be sure to set output_attentions=True.
# # Load BART fine-tuned for summarization on CNN/Daily Mail dataset
# model_name = "facebook/bart-large-cnn"
# tokenizer = AutoTokenizer.from_pretrained(model_name)
# model = AutoModel.from_pretrained(model_name, output_attentions=True) # LOOK AT THIS!!!!

# # get encoded input vectors
# encoder_input_ids = tokenizer(input_text, return_tensors="pt", add_special_tokens=True).input_ids

# # create ids of encoded input vectors
# decoder_input_ids = tokenizer(output_text, return_tensors="pt", add_special_tokens=True).input_ids

# outputs = model(input_ids=encoder_input_ids, decoder_input_ids=decoder_input_ids)

# encoder_text = tokenizer.convert_ids_to_tokens(encoder_input_ids[0])
# decoder_text = tokenizer.convert_ids_to_tokens(decoder_input_ids[0])

# model_view(
#     encoder_attention=outputs.encoder_attentions,
#     decoder_attention=outputs.decoder_attentions,
#     cross_attention=outputs.cross_attentions,
#     encoder_tokens= encoder_text,
#     decoder_tokens=decoder_text
# )

In [None]:
from transformers import AutoTokenizer, AutoModel, utils
from bertviz import model_view

utils.logging.set_verbosity_error()  # Remove line to see warnings

# Initialize tokenizer and model. Be sure to set output_attentions=True.
# Load BART fine-tuned for summarization on CNN/Daily Mail dataset
model_name = "facebook/bart-large-cnn"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, output_attentions=True) # LOOK AT THIS!!!!

# get encoded input vectors
encoder_input_ids = tokenizer("The House Budget Committee voted Saturday to pass a $3.5 trillion spending bill", return_tensors="pt", add_special_tokens=True).input_ids

# create ids of encoded input vectors
decoder_input_ids = tokenizer("The House Budget Committee passed a spending bill.", return_tensors="pt", add_special_tokens=True).input_ids

outputs = model(input_ids=encoder_input_ids, decoder_input_ids=decoder_input_ids)

encoder_text = tokenizer.convert_ids_to_tokens(encoder_input_ids[0])
decoder_text = tokenizer.convert_ids_to_tokens(decoder_input_ids[0])

model_view(
    encoder_attention=outputs.encoder_attentions,
    decoder_attention=outputs.decoder_attentions,
    cross_attention=outputs.cross_attentions,
    encoder_tokens= encoder_text,
    decoder_tokens=decoder_text
)