In [1]:
import datasets
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import evaluate
from tqdm import tqdm
import torch
from itertools import islice
import matplotlib.pyplot as plt
import seaborn as sns
import json
import numpy as np

In [2]:
# Read the local BillSum US test JSONL through the generic "json" loader,
# then pick out the "train" entry of what the loader returns.
billsum_splits = datasets.load_dataset(
  "json",
  data_files="billsum_data/us_test_data_final_OFFICIAL.jsonl",
)
billsum_test = billsum_splits["train"]

# Bare expression so the notebook renders the dataset summary.
billsum_test

Dataset({
    features: ['text', 'summary', 'bill_id', 'title', 'text_len', 'sum_len'],
    num_rows: 3269
})

In [3]:
# Load the BigBird predictions from JSONL into a dict keyed by bill title.
# Each record contributes {"candidate": <pred_summary>}; records that share a
# title overwrite one another, so the count displayed below is the number of
# distinct titles.
bigbird_predictions = {}
# Explicit UTF-8 so the read does not depend on the platform's default locale.
with open("billsum_test_bigbird_tuned_pred.jsonl", "r", encoding="utf-8") as file:
  # Stream the file line by line instead of materialising it with readlines().
  for line in file:
    if not line.strip():
      # Skip blank lines (e.g. a trailing newline) rather than letting
      # json.loads fail on an empty string.
      continue
    record = json.loads(line)
    bigbird_predictions[record["title"]] = { "candidate": record["pred_summary"] }

# Bare expression so the notebook displays how many predictions were loaded.
len(bigbird_predictions)

3269

In [4]:
# Attach each gold summary to the prediction entry that shares its title.
# Only a genuinely missing prediction is reported; any other problem (for
# example a record without a "summary" field, or an interrupt) propagates
# instead of being hidden behind a catch-all handler.
for record in billsum_test:
  prediction = bigbird_predictions.get(record["title"])
  if prediction is None:
    print("Couldn't find prediction for", record["title"])
    continue
  prediction["reference"] = record["summary"]

In [5]:
# Sanity check: print the first five title / reference / candidate triples.
preview = islice(bigbird_predictions.items(), 5)
for i, (title, summaries) in enumerate(preview):
  print(f"Example #{ i + 1 }")
  print("Title:", title)
  print("Reference:", summaries["reference"])
  print("Candidate:", summaries["candidate"])
  print()

Example #1
Title: A bill to reauthorize the Appalachian Regional Development Act of 1965, and for other purposes.
Reference: Appalachian Regional Development Act Amendments of 2002 - Amends the Appalachian Regional Development Act of 1965 to include as functions of the Appalachian Regional Commission that it: (1) support local development districts; (2) encourage the use of eco-industrial development technologies and approaches; and (3) seek to coordinate economic development activities of, and the use of economic development resources by, Federal agencies in the Appalachian region.(Sec. 4) Directs the President to establish the Interagency Coordinating Council on Appalachia.(Sec. 5) Authorizes the Commission to provide technical assistance and make grants, enter into contracts, and otherwise provide funds to persons or entities in the region for projects to: (1) increase affordable access to advanced telecommunications, entrepreneurship, and management technologies or applications; (2

In [6]:
# Flatten the per-title dict into two parallel lists for the metric calls.
# Both comprehensions walk the same dict in the same order, so index k of each
# list refers to the same bill.
all_candidates = [entry["candidate"] for entry in bigbird_predictions.values()]
all_references = [entry["reference"] for entry in bigbird_predictions.values()]

In [7]:
# Score the candidate summaries against the references with ROUGE and print
# the dict of scores returned by compute().
rouge = evaluate.load('rouge')

results = rouge.compute(
  references=all_references,
  predictions=all_candidates,
)
print(results)

Downloading builder script: 0.00B [00:00, ?B/s]

{'rouge1': 0.24333959915398778, 'rouge2': 0.04087011590491582, 'rougeL': 0.153464556449777, 'rougeLsum': 0.17615902800211444}


In [8]:
# Score the same candidate/reference pairs with BERTScore.
# Note: this rebinds `results`, replacing the ROUGE scores stored under that name.
bertscore = evaluate.load("bertscore")

results = bertscore.compute(
  references=all_references,
  predictions=all_candidates,
  lang="en",
)
# The raw result is intentionally not printed here; it is summarised with
# np.mean over "precision", "recall" and "f1" in a later cell.

Downloading builder script: 0.00B [00:00, ?B/s]

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  return forward_call(*args, **kwargs)


In [9]:
# Report the mean of each BERTScore component held in `results`.
for metric in ("precision", "recall", "f1"):
  print(f'Average {metric}: {np.mean(results[metric])}')

Average precision: 0.8146519995895355
Average recall: 0.8004088837387876
Average f1: 0.8071631306025113
