# Model Evaluation on Bill 117 Dataset

## Setup

In [None]:
# Attach Google Drive to the Colab filesystem; later cells read and write
# files beneath this mount point.
from google.colab import drive

drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Mounted at /content/drive


In [None]:
!pip install -q evaluate transformers datasets rouge_score jiwer
!pip install -q git+https://github.com/google-research/bleurt.git

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.4/3.4 MB[0m [31m22.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for BLEURT (setup.py) ... [?25l[?25hdone


In [None]:
import os
from datasets import load_dataset
from evaluate import evaluator, load
import evaluate
from transformers import AutoModelForSeq2SeqLM, pipeline, AutoTokenizer
import pprint
import torch
import numpy as np

In [None]:
# Read the Hugging Face access token from Drive. Strip surrounding whitespace
# (e.g. a trailing newline left by a text editor) so the exported value is the
# bare token.
with open("/content/drive/MyDrive/colab-notebooks/w266/hf.txt", "r") as f:
    HF_TOKEN = f.read().strip()

# Export the token for libraries that look up HF_TOKEN in the environment.
os.environ["HF_TOKEN"] = HF_TOKEN

# Dataset on the Hugging Face Hub and the columns used as model input / reference.
DATASET = "jordanfan/processed_us_congress_117_bills_v3"
INPUT_COLUMN = "cur_text"
LABEL_COLUMN = "cleaned_summary"

# Checkpoint under evaluation.
MODEL = "etav22/pegasus-lora-legalease"
TOKENIZER = "etav22/pegasus-lora-legalease"
REVISION = "main"

# Sub-folder name under which the results file is saved at the end of the notebook.
RESULTS_NAME = "pegasus-baseline-128"

# `module_type` is the keyword `evaluate.load` uses to select the kind of module;
# the previous `model_type` spelling is not one of its parameters.
# NOTE(review): confirm that `checkpoint=` actually selects bleurt-base-512; the
# BLEURT module may choose its checkpoint from the config name instead.
bleurt = load('bleurt', module_type = 'metric', checkpoint = "bleurt-base-512")
rouge = load('rouge')

print(f"Training: {MODEL} | {REVISION} ")
print(f"Dataset: {DATASET} | {INPUT_COLUMN} | {LABEL_COLUMN}")

Downloading builder script:   0%|          | 0.00/5.20k [00:00<?, ?B/s]



Downloading data:   0%|          | 0.00/405M [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

Training: etav22/pegasus-lora-legalease | main 
Dataset: jordanfan/processed_us_congress_117_bills_v3 | cur_text | cleaned_summary


## Data + Model

In [None]:
# Load only the held-out test split of the configured dataset.
dataset = load_dataset(DATASET, split="test")

# Fail fast, before the model download and evaluation run, if the configured
# input/label columns are not present in the split.
missing_columns = {INPUT_COLUMN, LABEL_COLUMN} - set(dataset.column_names)
assert not missing_columns, f"Dataset is missing expected columns: {sorted(missing_columns)}"

dataset

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/1.67k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/133M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/132M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/78.5M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/11.7M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/11277 [00:00<?, ? examples/s]

Generating val split:   0%|          | 0/3388 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/377 [00:00<?, ? examples/s]

Dataset({
    features: ['Unnamed: 0', 'index', 'id', 'policy_areas', 'cur_summary', 'cur_text', 'title', 'titles_official', 'titles_short', 'sponsor_name', 'sponsor_party', 'sponsor_state', 'cleaned_summary', 'extracted_text', 'extracted_text_375', 'extracted_text_750', 'extracted_text_1000', 'bertsum_extracted_250', 'bertsum_extracted_375', 'bertsum_extracted_375_1000', 'bertsum_extracted_250_1000', 'bertsum_extracted_375_750', 'bertsum_extracted_250_750', 'bertsum_extracted_375_500', 'bertsum_extracted_250_500', 'bertsum_extracted_375_375', 'bertsum_extracted_250_375'],
    num_rows: 377
})

In [None]:
# Build the summarization pipeline for the configured checkpoint.
# - TOKENIZER and REVISION from the config cell are passed explicitly so the
#   values printed there are the ones actually loaded.
# - Use GPU 0 when CUDA is available; otherwise fall back to CPU (-1) instead of
#   failing on a runtime without a GPU.
# - max_length / num_beams are the generation settings used for every summary.
pipe = pipeline(
    "summarization",
    model=MODEL,
    tokenizer=TOKENIZER,
    revision=REVISION,
    device=0 if torch.cuda.is_available() else -1,
    max_length=128,
    num_beams=4,
)

config.json:   0%|          | 0.00/1.48k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.28G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/224 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/20.1k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/1.91M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.11k [00:00<?, ?B/s]

## Evaluate

In [None]:
# Score the pipeline on the test split with BLEURT and ROUGE in a single pass.
task_evaluator = evaluator("summarization")
combined_metric = evaluate.combine([bleurt, rouge])

compute_kwargs = {
    "model_or_pipeline": pipe,
    "data": dataset,
    "metric": combined_metric,
    "input_column": INPUT_COLUMN,
    "label_column": LABEL_COLUMN,
}
eval_results = task_evaluator.compute(**compute_kwargs)

Your max_length is set to 128, but your input_length is only 124. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=62)
Your max_length is set to 128, but your input_length is only 101. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=50)


## Results

In [None]:
# Summarise the per-example BLEURT scores stored under "scores" with their
# mean, median and standard deviation, added as new entries in eval_results.
bleurt_scores = eval_results["scores"]
eval_results.update(
    bleurt_mean=np.mean(bleurt_scores),
    bleurt_median=np.median(bleurt_scores),
    bleurt_stdev=np.std(bleurt_scores),
)

In [None]:
# Rebuild the results dict with its keys in alphabetical order, then display it.
eval_results = {name: eval_results[name] for name in sorted(eval_results)}
eval_results

{'bleurt_mean': -0.5505242058784955,
 'bleurt_median': -0.555473268032074,
 'bleurt_stdev': 0.35921470143826484,
 'latency_in_seconds': 1.6438627283978784,
 'rouge1': 0.3686569165361465,
 'rouge2': 0.20197448541213253,
 'rougeL': 0.28862510119852935,
 'rougeLsum': 0.2888425742430287,
 'samples_per_second': 0.6083233001910129,
 'scores': [-0.5790255665779114,
  -0.1826440840959549,
  -0.29163143038749695,
  -0.8051545023918152,
  -0.7117620706558228,
  -0.8771706819534302,
  -1.1546409130096436,
  -0.8705845475196838,
  -0.8754370212554932,
  0.5887635350227356,
  -0.5686818957328796,
  0.4399973750114441,
  -0.438571035861969,
  -1.2362953424453735,
  0.16201892495155334,
  -1.2433019876480103,
  -0.3879150152206421,
  -0.6426333785057068,
  -0.41556477546691895,
  -0.6426333785057068,
  -0.451993465423584,
  -0.6861805319786072,
  -0.6229086518287659,
  -0.29654520750045776,
  0.2679579555988312,
  -0.9930290579795837,
  -0.4440684914588928,
  0.23773741722106934,
  -0.623382151126861

In [None]:
# Write every entry of eval_results to the results folder for this run;
# the cell output shows the path returned by evaluate.save.
results_dir = f'/content/drive/MyDrive/colab-notebooks/w266/results/reval/{RESULTS_NAME}'
evaluate.save(results_dir, **eval_results)

PosixPath('/content/drive/MyDrive/colab-notebooks/w266/results/reval/pegasus-baseline-128/result-2024_04_05-21_39_24.json')