In [9]:
from transformers import BigBirdPegasusForConditionalGeneration, AutoTokenizer, AutoModelForSeq2SeqLM
import sys
import pickle
from rouge_score import rouge_scorer
from rouge_score import scoring
import random
import pandas as pd
import json

In [4]:
# imports

# delete research article 
class ResearchArticle:
    """
    Plain record holding one research article.

    Attributes (types as noted by the original author):
      article_id: str
      abstract_text: List[str]
      article_text: List[str]
    """

    def __init__(self, article_id, abstract_text, article_text):
        # Store the three fields unchanged; no validation or copying is done.
        self.article_id, self.abstract_text, self.article_text = (
            article_id,
            abstract_text,
            article_text,
        )


def load_data(path):
    """
    Unpickle and return the object stored in the file at `path`.

    path - location of the pickle file on disk

    returns - whatever object the pickle contains (the original notes say
              this is the data as ResearchArticle objects)
    """
    # NOTE: pickle can run arbitrary code while loading; only point this at
    # files from a trusted source.
    with open(path, mode='rb') as handle:
        # encoding='bytes' controls how Python 2 era 8-bit strings are decoded.
        return pickle.load(handle, encoding='bytes')

def shuffle_and_sample(data, n=600, seed=42):
    """
    Return a reproducible random sample of `data`.

    data - sequence of items to sample from (left unmodified)
    n    - maximum number of items to return (default 600)
    seed - seed for the shuffle (default 42)

    returns - list holding the first `n` items of a seeded shuffle of `data`;
              shorter than `n` when `data` has fewer items
    """
    # Shuffle a copy so the caller's list keeps its original order.
    shuffled = list(data)
    # A private generator yields the same permutation as seeding the global
    # `random` module with `seed`, without resetting global RNG state.
    random.Random(seed).shuffle(shuffled)
    return shuffled[:n]


def to_paragraph(text, sep=''):
    """
    Convert text made up of a list of sentences into a single paragraph.

    text - iterable of sentence strings
    sep  - string placed between consecutive sentences; the default ''
           concatenates them directly, so sentences need their own trailing
           whitespace if words should not run together

    returns - single continuous text string ('' for empty input)
    """
    # str.join builds the result in one pass instead of repeated +=.
    return sep.join(text)

def get_scores(hypothesis, reference):
    """
    Return the ROUGE score of a hypothesis against a reference, whether or
    not either one has already been joined into a single paragraph.

    hypothesis - list of sentences or a single paragraph string

    reference - list of sentences or a single paragraph string

    returns - whatever `Rouge().get_scores` returns for the two paragraphs
              (TODO: specify output)
    """
    # A str is already a paragraph; anything else is treated as a list of
    # sentences and joined. The previous `len(x) != 1` test left the
    # paragraph variables unassigned for any length-1 input.
    if isinstance(hypothesis, str):
        hypothesis_abstract = hypothesis
    else:
        hypothesis_abstract = to_paragraph(hypothesis)
    if isinstance(reference, str):
        reference_abstract = reference
    else:
        reference_abstract = to_paragraph(reference)
    # NOTE(review): `Rouge` is not imported in the visible part of this
    # notebook, so this line raises NameError unless it is defined elsewhere.
    # Confirm the intended package or switch to the imported `rouge_scorer`.
    rouge = Rouge()
    score = rouge.get_scores(hypothesis_abstract, reference_abstract)
    return score

### Load Data

In [11]:
# Parse the mini validation set JSON file and keep only the value stored
# under its top-level 'data' key.
with open('../data/mini_val_set_json.txt') as json_file:
    mini_val = json.load(json_file)['data']

In [3]:
# TODO: delete this cell (marked for removal by the original author).
# Loads the pickled validation set via load_data, then takes the shuffled
# sample produced by shuffle_and_sample.
validation_data = load_data('pudmed_val.pk.bin')
mini_val_data = shuffle_and_sample(validation_data)

In [None]:
# Baseline evaluation: summarize every article in `mini_val` with
# BigBird-Pegasus, score each summary against its abstract with ROUGE, and
# save per-document scores plus the reference/prediction pairs.

# initialize tokenizer, model, and scorer
tokenizer = AutoTokenizer.from_pretrained("google/bigbird-pegasus-large-pubmed")
model = BigBirdPegasusForConditionalGeneration.from_pretrained("google/bigbird-pegasus-large-pubmed")
scorer = rouge_scorer.RougeScorer(["rouge1", "rouge2", "rougeLsum"], use_stemmer=True)
aggregator = scoring.BootstrapAggregator()

results = {}        # article_id -> ROUGE scores for that document
summary_rows = []   # one {'reference', 'prediction'} record per document

for i, article in enumerate(mini_val):
    # `inputs` rather than `input`, to avoid shadowing the builtin.
    inputs = tokenizer(article['article_text'], is_split_into_words=True, return_tensors='pt', max_length=4096, truncation=True)
    generated_ids = model.generate(**inputs)
    # Drop special tokens so they do not leak into the saved summaries or
    # into the ROUGE token counts.
    prediction = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
    reference = to_paragraph(article['abstract_text'])
    score = scorer.score(reference, prediction)
    aggregator.add_scores(score)
    results[article['article_id']] = score
    # Collect rows in a list and build the frame once after the loop:
    # DataFrame.append copied the whole frame per row and no longer exists
    # in pandas >= 2.0.
    summary_rows.append({'reference': reference, 'prediction': prediction})
    if i % 10 == 0:
        print(f'Processed {i} documents')

# Bootstrap aggregation is only needed once, after all scores are added.
final_ag_score = aggregator.aggregate()
ag_score = final_ag_score
summaries_df = pd.DataFrame(summary_rows, columns=['reference', 'prediction'])
print('Completed document evaluation')
print(final_ag_score)
with open("trucated_doc_results.txt", 'w') as outfile:
    json.dump(results, outfile)
summaries_df.to_csv('bigbird_baseline_summaries.csv', index=False)

Processed 0 documents
Processed 10 documents


In [None]:
# Evaluate BigBird-Pegasus on the sampled pickled validation set, appending
# one record per paper to mini_val_records.txt and the final aggregate score
# to final_record.txt.

# import data
validation_data = load_data('pudmed_val.pk.bin')
mini_val_data = shuffle_and_sample(validation_data)

scorer = rouge_scorer.RougeScorer(["rouge1", "rouge2", "rougeLsum"], use_stemmer=True) # todo: evaluate if this when changed to l(not sum) is the same
aggregator = scoring.BootstrapAggregator()
tokenizer = AutoTokenizer.from_pretrained("google/bigbird-pegasus-large-pubmed")
model = BigBirdPegasusForConditionalGeneration.from_pretrained("google/bigbird-pegasus-large-pubmed")

# iterate over dev set
total_papers = 0
ag_score = None  # stays None when there are no papers to score
for i, paper in enumerate(mini_val_data):
    # `inputs` rather than `input`, to avoid shadowing the builtin;
    # 4096 is the max_length passed to the tokenizer for truncation.
    inputs = tokenizer(paper.article_text, return_tensors='pt', is_split_into_words=True, max_length=4096, truncation=True)
    generated_ids = model.generate(**inputs)
    # Drop special tokens so they do not leak into the record or the score.
    prediction = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]

    # evaluate summary compared to ground truth
    total_papers = i + 1
    reference = to_paragraph(paper.abstract_text)
    score = scorer.score(reference, prediction)
    aggregator.add_scores(score)
    # Running aggregate, stored with every record below.
    ag_score = aggregator.aggregate()

    record = {
        'index': i,
        # Plain id value; the earlier `{paper.article_id}` built a set.
        'article_id': paper.article_id,
        'predicted_summary': prediction,
        'ground_truth': reference,
        'score': score,
        'aggregate_score': ag_score,
    }
    # Re-open in append mode each iteration so finished records survive an
    # interrupted run; one record per line keeps them separable.
    with open("mini_val_records.txt", "a") as records_file:
        records_file.write(str(record) + "\n")

    if total_papers % 10 == 0:
        print(f'Processed {total_papers} papers')
        # todo: run again and save score, as well as the file name

# print final output
with open("final_record.txt", "a") as final_file:
    final_file.write(str({'mini_val_data_aggregate_score': ag_score}) + "\n")
print(f'Aggregate ROUGE Scores:\n {ag_score}')

To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor'). (Triggered internally at  /opt/conda/conda-bld/pytorch_1623448265233/work/aten/src/ATen/native/BinaryOps.cpp:467.)
  return torch.floor_divide(self, other)


Processed 10 papers
Processed 20 papers
Processed 30 papers
Processed 40 papers
Processed 50 papers


Attention type 'block_sparse' is not possible if sequence_length: 458 <= num global tokens: 2 * config.block_size + min. num sliding tokens: 3 * config.block_size + config.num_random_blocks * config.block_size + additional buffer: config.num_random_blocks * config.block_size = 704 with config.block_size = 64, config.num_random_blocks = 3. Changing attention type to 'original_full'...


Processed 60 papers
Processed 70 papers
Processed 80 papers
Processed 90 papers
Processed 100 papers
Processed 110 papers
Processed 120 papers
Processed 130 papers
Processed 140 papers
Processed 150 papers
Processed 160 papers
Processed 170 papers
Processed 180 papers
Processed 190 papers
Processed 200 papers
Processed 210 papers
Processed 220 papers
Processed 230 papers
Processed 240 papers
Processed 250 papers
Processed 260 papers
Processed 270 papers
Processed 280 papers
Processed 290 papers
Processed 300 papers
Processed 310 papers
Processed 320 papers
Processed 330 papers
Processed 340 papers
Processed 350 papers
Processed 360 papers
Processed 370 papers
Processed 380 papers
Processed 390 papers
Processed 400 papers
Processed 410 papers
Processed 420 papers
Processed 430 papers
Processed 440 papers
Processed 450 papers
Processed 460 papers
Processed 470 papers
Processed 480 papers
Processed 490 papers
Processed 500 papers
