# Model Evaluation

## Setup

In [None]:
# Mount Google Drive inside the Colab filesystem at /content/drive; a later
# cell reads the Hugging Face token file from under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install -q evaluate transformers datasets rouge_score jiwer textstat
!pip install -q git+https://github.com/google-research/bleurt.git

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m11.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m105.1/105.1 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.4/3.4 MB[0m [31m30.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m47.7 MB

In [None]:
import os
from datasets import load_dataset
from evaluate import evaluator, load
import evaluate
from transformers import AutoModelForSeq2SeqLM, pipeline, AutoTokenizer, AutoModelForSequenceClassification
import pprint
import torch
import numpy as np
from nltk.tokenize import RegexpTokenizer
import pandas as pd
import textstat

# Word tokenizer used for the word-count statistics: the pattern r'\w+' matches
# runs of word characters, so punctuation and whitespace never become tokens.
_tokenizer = RegexpTokenizer(r'\w+')

In [None]:
# Read the Hugging Face access token from a text file on the mounted Drive.
with open("/content/drive/MyDrive/colab-notebooks/w266/hf.txt", "r") as f:
    # Strip surrounding whitespace: a trailing newline left by an editor would
    # otherwise become part of the token value.
    HF_TOKEN = f.read().strip()

# Publish the token through the HF_TOKEN environment variable.
# NOTE(review): presumably picked up by huggingface_hub for authentication --
# confirm for the installed version.
os.environ["HF_TOKEN"] = HF_TOKEN

# Dataset repository and the columns used for model input, reference summary
# and the generated summary that is added during evaluation.
DATASET = "jordanfan/processed_us_congress_117_bills_v3"
INPUT_COLUMN = "extracted_text_1000"
LABEL_COLUMN = "cleaned_summary"
NEW_COLUMN = "generated_summary"

# Checkpoint under evaluation, its tokenizer repository and the Hub revision.
MODEL = "jordanfan/bart_extractive_1024_1000"
TOKENIZER = "jordanfan/bart_extractive_1024_1000"
REVISION = "main"

# File the final metrics table is written to.
RESULTS_NAME = "results.csv"

# Metrics. `bleu` actually holds the BLEURT metric; the name is kept because
# later cells refer to it.
# `evaluate.load` selects the BLEURT checkpoint through the config name (second
# positional argument) and the module kind through `module_type`; keywords such
# as `model_type=` / `checkpoint=` are not parameters of `evaluate.load`.
# NOTE(review): scores from runs that relied on those keywords may have come
# from the metric's default checkpoint -- confirm before comparing numbers.
bleu = load('bleurt', 'bleurt-base-512', module_type='metric')
rouge = load('rouge')

print(f"Training: {MODEL} | {REVISION} ")
print(f"Dataset: {DATASET} | {INPUT_COLUMN} | {LABEL_COLUMN}")

Downloading builder script:   0%|          | 0.00/5.20k [00:00<?, ?B/s]



Downloading data:   0%|          | 0.00/405M [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

Training: jordanfan/bart_extractive_1024_1000 | main 
Dataset: jordanfan/processed_us_congress_117_bills_v3 | extracted_text_1000 | cleaned_summary


## Data + Model

In [None]:
# Load every split of the configured Hub dataset (a DatasetDict with
# train / val / test splits, as shown in the cell output).
dataset = load_dataset(DATASET)
# Bare last expression so the notebook displays the splits and their columns.
dataset

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/1.67k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/133M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/132M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/78.5M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/11.7M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/11277 [00:00<?, ? examples/s]

Generating val split:   0%|          | 0/3388 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/377 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'index', 'id', 'policy_areas', 'cur_summary', 'cur_text', 'title', 'titles_official', 'titles_short', 'sponsor_name', 'sponsor_party', 'sponsor_state', 'cleaned_summary', 'extracted_text', 'extracted_text_375', 'extracted_text_750', 'extracted_text_1000', 'bertsum_extracted_250', 'bertsum_extracted_375', 'bertsum_extracted_375_1000', 'bertsum_extracted_250_1000', 'bertsum_extracted_375_750', 'bertsum_extracted_250_750', 'bertsum_extracted_375_500', 'bertsum_extracted_250_500', 'bertsum_extracted_375_375', 'bertsum_extracted_250_375'],
        num_rows: 11277
    })
    val: Dataset({
        features: ['Unnamed: 0', 'index', 'id', 'policy_areas', 'cur_summary', 'cur_text', 'title', 'titles_official', 'titles_short', 'sponsor_name', 'sponsor_party', 'sponsor_state', 'cleaned_summary', 'extracted_text', 'extracted_text_375', 'extracted_text_750', 'extracted_text_1000', 'bertsum_extracted_250', 'bertsum_extracted_375', 'b

In [None]:
def count_num_words(text):
  """Return the number of word tokens in one example's model input.

  Args:
    text: a single dataset example (mapping of column name to value), despite
      the parameter name; only the INPUT_COLUMN entry is read. The value is
      passed through `str()` before tokenizing.

  Returns:
    A dict with the single key "word_count", which `Dataset.map` adds to the
    example as a new column.
  """
  words = _tokenizer.tokenize(str(text[INPUT_COLUMN]))
  return dict(word_count=len(words))

# Applied one example at a time to every split of the DatasetDict.
dataset = dataset.map(count_num_words, batched=False)

Map:   0%|          | 0/11277 [00:00<?, ? examples/s]

Map:   0%|          | 0/3388 [00:00<?, ? examples/s]

Map:   0%|          | 0/377 [00:00<?, ? examples/s]

In [None]:
def calculate_word_statistics(split: str):
  """Summarise the input word counts of one split, per policy area.

  Reads the 'policy_areas' and 'word_count' columns of `dataset[split]` and
  groups them in a single pandas pass, instead of running one full
  `Dataset.filter` scan per category.

  Args:
    split: name of the split in the global `dataset` (e.g. 'train', 'test').

  Returns:
    A dict mapping each policy area to
    {'median', 'mean', 'stdev', 'count'} of its word counts, ordered by
    ascending 'count'. 'stdev' uses `np.std` with its default settings.
    Rows whose policy area is missing are left out (pandas groupby default).
  """
  split_data = dataset[split]
  # Only the two small columns are materialised; the long text columns are
  # never touched.
  frame = pd.DataFrame({
      'policy_areas': list(split_data['policy_areas']),
      'word_count': list(split_data['word_count']),
  })

  category_statistics = {}
  # sort=True yields categories in sorted order, so the stable sort below
  # breaks count ties by category name.
  for category, word_counts in frame.groupby('policy_areas', sort=True)['word_count']:
    values = word_counts.to_numpy()
    category_statistics[category] = {
        'median': np.median(values),
        'mean': np.mean(values),
        'stdev': np.std(values),
        'count': len(values),
    }

  # Smallest categories first.
  return dict(sorted(category_statistics.items(), key=lambda item: item[1]['count']))

In [None]:
# Per-policy-area word-count statistics for the train and test splits; the
# train statistics are used below to bucket categories by size.
train_word_statistics = calculate_word_statistics('train')
test_word_statistics = calculate_word_statistics('test')

Filter:   0%|          | 0/11277 [00:00<?, ? examples/s]

Filter:   0%|          | 0/11277 [00:00<?, ? examples/s]

Filter:   0%|          | 0/11277 [00:00<?, ? examples/s]

Filter:   0%|          | 0/11277 [00:00<?, ? examples/s]

Filter:   0%|          | 0/11277 [00:00<?, ? examples/s]

Filter:   0%|          | 0/11277 [00:00<?, ? examples/s]

Filter:   0%|          | 0/11277 [00:00<?, ? examples/s]

Filter:   0%|          | 0/11277 [00:00<?, ? examples/s]

Filter:   0%|          | 0/11277 [00:00<?, ? examples/s]

Filter:   0%|          | 0/11277 [00:00<?, ? examples/s]

Filter:   0%|          | 0/11277 [00:00<?, ? examples/s]

Filter:   0%|          | 0/11277 [00:00<?, ? examples/s]

Filter:   0%|          | 0/11277 [00:00<?, ? examples/s]

Filter:   0%|          | 0/11277 [00:00<?, ? examples/s]

Filter:   0%|          | 0/11277 [00:00<?, ? examples/s]

Filter:   0%|          | 0/11277 [00:00<?, ? examples/s]

Filter:   0%|          | 0/11277 [00:00<?, ? examples/s]

Filter:   0%|          | 0/11277 [00:00<?, ? examples/s]

Filter:   0%|          | 0/11277 [00:00<?, ? examples/s]

Filter:   0%|          | 0/11277 [00:00<?, ? examples/s]

Filter:   0%|          | 0/11277 [00:00<?, ? examples/s]

Filter:   0%|          | 0/11277 [00:00<?, ? examples/s]

Filter:   0%|          | 0/11277 [00:00<?, ? examples/s]

Filter:   0%|          | 0/11277 [00:00<?, ? examples/s]

Filter:   0%|          | 0/11277 [00:00<?, ? examples/s]

Filter:   0%|          | 0/11277 [00:00<?, ? examples/s]

Filter:   0%|          | 0/11277 [00:00<?, ? examples/s]

Filter:   0%|          | 0/11277 [00:00<?, ? examples/s]

Filter:   0%|          | 0/11277 [00:00<?, ? examples/s]

Filter:   0%|          | 0/11277 [00:00<?, ? examples/s]

Filter:   0%|          | 0/11277 [00:00<?, ? examples/s]

Filter:   0%|          | 0/11277 [00:00<?, ? examples/s]

Filter:   0%|          | 0/377 [00:00<?, ? examples/s]

Filter:   0%|          | 0/377 [00:00<?, ? examples/s]

Filter:   0%|          | 0/377 [00:00<?, ? examples/s]

Filter:   0%|          | 0/377 [00:00<?, ? examples/s]

Filter:   0%|          | 0/377 [00:00<?, ? examples/s]

Filter:   0%|          | 0/377 [00:00<?, ? examples/s]

Filter:   0%|          | 0/377 [00:00<?, ? examples/s]

Filter:   0%|          | 0/377 [00:00<?, ? examples/s]

Filter:   0%|          | 0/377 [00:00<?, ? examples/s]

Filter:   0%|          | 0/377 [00:00<?, ? examples/s]

Filter:   0%|          | 0/377 [00:00<?, ? examples/s]

Filter:   0%|          | 0/377 [00:00<?, ? examples/s]

Filter:   0%|          | 0/377 [00:00<?, ? examples/s]

Filter:   0%|          | 0/377 [00:00<?, ? examples/s]

Filter:   0%|          | 0/377 [00:00<?, ? examples/s]

Filter:   0%|          | 0/377 [00:00<?, ? examples/s]

Filter:   0%|          | 0/377 [00:00<?, ? examples/s]

Filter:   0%|          | 0/377 [00:00<?, ? examples/s]

Filter:   0%|          | 0/377 [00:00<?, ? examples/s]

Filter:   0%|          | 0/377 [00:00<?, ? examples/s]

Filter:   0%|          | 0/377 [00:00<?, ? examples/s]

Filter:   0%|          | 0/377 [00:00<?, ? examples/s]

Filter:   0%|          | 0/377 [00:00<?, ? examples/s]

Filter:   0%|          | 0/377 [00:00<?, ? examples/s]

Filter:   0%|          | 0/377 [00:00<?, ? examples/s]

Filter:   0%|          | 0/377 [00:00<?, ? examples/s]

Filter:   0%|          | 0/377 [00:00<?, ? examples/s]

Filter:   0%|          | 0/377 [00:00<?, ? examples/s]

Filter:   0%|          | 0/377 [00:00<?, ? examples/s]

Filter:   0%|          | 0/377 [00:00<?, ? examples/s]

In [None]:
def group_categories(categories_dict):
    """Bucket categories into low / medium / high tiers by their 'count'.

    The categories are ranked by ascending count. With `third = n // 3`, the
    count found at rank `third` is the upper bound of the "low" tier and the
    count at rank `2 * third` is the upper bound of the "medium" tier; anything
    larger is "high". Bounds are inclusive, so categories with equal counts
    always share a tier.

    Args:
        categories_dict: mapping of category name to a stats dict that has a
            'count' entry.

    Returns:
        {"low": {...}, "medium": {...}, "high": {...}}, each mapping category
        name to its original stats dict, in ascending-count order.
    """
    ranked = sorted(categories_dict.items(), key=lambda pair: pair[1]['count'])
    tiers = {"low": {}, "medium": {}, "high": {}}

    # Nothing to rank: return the three empty tiers.
    if not ranked:
        return tiers

    third = len(ranked) // 3
    low_cutoff = ranked[third][1]['count']
    medium_cutoff = ranked[2 * third][1]['count']

    for name, stats in ranked:
        size = stats['count']
        if size <= low_cutoff:
            tier = "low"
        elif size <= medium_cutoff:
            tier = "medium"
        else:
            tier = "high"
        tiers[tier][name] = stats

    return tiers

# Bucket the policy areas by how many training bills each one has.
groups = group_categories(train_word_statistics)

# List the policy areas that landed in each tier.
for group, members in groups.items():
  print(f">>> Group: {group}")
  for category in members:
    print(category)

Total categories: 32
Low threshold: 10
Medium threshold: 20
>>> Group: low
Social Sciences and History
Sports and Recreation
Arts, Culture, Religion
Animals
Families
Water Resources Development
Civil Rights and Liberties, Minority Issues
Congress
Law
Economics and Public Finance
Emergency Management
>>> Group: medium
Social Welfare
Foreign Trade and International Finance
Native Americans
Housing and Community Development
Agriculture and Food
Environmental Protection
Science, Technology, Communications
Energy
Labor and Employment
Immigration
>>> Group: high
Finance and Financial Sector
Commerce
Public Lands and Natural Resources
Education
Transportation and Public Works
Crime and Law Enforcement
International Affairs
Government Operations and Politics
Taxation
Armed Forces and National Security
Health


In [None]:
for group, members in groups.items():
  # Print the summed count per group
  print(f"Group: {group}")
  group_sum = sum(stats['count'] for stats in members.values())
  print(f"Summed count: {group_sum}")


Group: low
Summed count: 863
Group: medium
Summed count: 2679
Group: high
Summed count: 7735


In [24]:
# Load the fine-tuned summarisation checkpoint from the Hub.
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL, revision=REVISION)
# Load the tokenizer from the TOKENIZER repository declared in the config cell
# (it was previously defined but never used).
# NOTE(review): model_max_length=512 while the checkpoint name mentions 1024 --
# confirm the intended truncation length.
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER, revision=REVISION, model_max_length=512)

# Pick the device explicitly instead of building a GPU pipeline and catching
# the failure: 0 is the first CUDA device, -1 means CPU.
pipeline_device = 0 if torch.cuda.is_available() else -1
if pipeline_device < 0:
  print("Using CPU")

# Summarisation pipeline used by the task evaluator: beam search with 4 beams,
# summaries capped at 128 tokens.
pipe = pipeline("summarization", model=model, tokenizer=tokenizer, device=pipeline_device, max_length=128, num_beams=4)

Using CPU


## Evaluate

In [None]:
# `evaluate` task evaluator for summarisation; its `compute` method is called
# once per group in the evaluation loop below.
task_evaluator = evaluator("summarization")

In [None]:
# NLI (MNLI) classifier and its tokenizer, used to label each generated summary
# against its reference summary.
entailment_tokenizer = AutoTokenizer.from_pretrained("khalidalt/DeBERTa-v3-large-mnli")
entailment_model = AutoModelForSequenceClassification.from_pretrained("khalidalt/DeBERTa-v3-large-mnli")

# Device shared by the helper functions below: first GPU when available,
# otherwise CPU.
if torch.cuda.is_available():
  DEVICE = torch.device("cuda:0")
else:
  DEVICE = torch.device("cpu")

def calculate_entailment(
    premise,
    hypothesis,
    label_names = ["entailment", "neutral", "contradiction"]
):
  """Classify a (premise, hypothesis) pair with the NLI model.

  Args:
    premise: text treated as the premise (the reference summary at the call
      site).
    hypothesis: text checked against the premise (the generated summary at
      the call site).
    label_names: label for each output index of the classifier, in index
      order. The list is only read, never modified.
      NOTE(review): the default order is assumed to match the checkpoint's
      id2label mapping -- confirm.

  Returns:
    The entry of `label_names` with the highest predicted probability.
  """
  encoded = entailment_tokenizer(premise, hypothesis, truncation=True, return_tensors="pt")
  encoded = encoded.to(DEVICE)
  # No-op after the first call; kept so the function works on its own.
  entailment_model.to(DEVICE)

  # Inference only: skip autograd bookkeeping to save memory and time.
  with torch.no_grad():
    output = entailment_model(
        input_ids=encoded["input_ids"],
        attention_mask=encoded["attention_mask"],
    )

  # Single pair in the batch, so take row 0 of the logits.
  prediction = torch.softmax(output["logits"][0], -1)

  return label_names[prediction.argmax(0).tolist()]

def encode_decode(df):
    """Generate a summary for one dataset example.

    Args:
        df: a single example (mapping of column name to value), despite the
            parameter name; only the INPUT_COLUMN entry is read.

    Returns:
        A dict with the single key NEW_COLUMN holding the decoded summary, so
        that `Dataset.map` adds it to the example as a new column.
    """
    # Inputs longer than 512 tokens are truncated.
    inputs = tokenizer(df[INPUT_COLUMN], return_tensors = "pt", max_length=512, truncation=True)
    # Follow the model's actual device: the model is never moved to DEVICE
    # explicitly, it only ends up wherever the pipeline placed it.
    inputs = inputs.to(model.device)
    summary_ids = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_length=128,
        num_beams = 4,
    )
    # One input sequence, so only the first generated sequence is decoded.
    result = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return {NEW_COLUMN: result}

tokenizer_config.json:   0%|          | 0.00/310 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.03k [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/18.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/1.74G [00:00<?, ?B/s]

In [None]:
# results:         group name -> aggregate metrics for that group.
# dataset_results: group name -> test subset with the generated summaries added.
results = {}
dataset_results = {}

# Despite the name, `category` iterates over the group labels
# ("low" / "medium" / "high"), not over individual policy areas.
for category in groups:
  # Build the membership set once per group instead of rebuilding a list of
  # keys for every row inside the filter.
  group_policy_areas = set(groups[category])
  temp_dataset = dataset['test'].filter(lambda x: x['policy_areas'] in group_policy_areas)

  print(f"Evaluating {len(temp_dataset)} {category} bills...")
  # ROUGE and BLEURT computed by the evaluator from summaries it generates
  # through `pipe`.
  eval_results = task_evaluator.compute(
      model_or_pipeline=pipe,
      data=temp_dataset,
      metric=evaluate.combine([rouge, bleu]),
      input_column=INPUT_COLUMN,
      label_column=LABEL_COLUMN,
  )
  # Generate the summaries a second time so they can be stored on the dataset
  # for the readability and entailment cells.
  # NOTE(review): this path tokenizes separately from the pipeline, so the
  # stored summaries may not be identical to the ones scored above -- confirm.
  temp_dataset = temp_dataset.map(encode_decode)

  dataset_results[category] = temp_dataset

  # "scores" holds the per-example BLEURT scores; the *_bleu key names are kept
  # because they become the column names of the results CSV.
  bleurt_scores = np.asarray(eval_results["scores"])
  category_dict = {
      'mean_bleu': np.mean(bleurt_scores),
      'median_bleu': np.median(bleurt_scores),
      'stdev_bleu': np.std(bleurt_scores),
      'rouge1': eval_results['rouge1'],
      'rouge2': eval_results['rouge2'],
      'rougeL': eval_results['rougeL'],
      'rougeLsum': eval_results['rougeLsum'],
      'no_examples': len(temp_dataset),
  }

  results[category] = category_dict

Filter:   0%|          | 0/377 [00:00<?, ? examples/s]

Evaluating 37 low bills...


Your max_length is set to 128, but your input_length is only 105. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=52)


Map:   0%|          | 0/37 [00:00<?, ? examples/s]

Filter:   0%|          | 0/377 [00:00<?, ? examples/s]

Evaluating 93 medium bills...


Map:   0%|          | 0/93 [00:00<?, ? examples/s]

Filter:   0%|          | 0/377 [00:00<?, ? examples/s]

Evaluating 247 high bills...


Your max_length is set to 128, but your input_length is only 121. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=60)
Your max_length is set to 128, but your input_length is only 2. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=1)


Map:   0%|          | 0/247 [00:00<?, ? examples/s]

In [None]:
# Group name -> mean Flesch reading-ease score of the generated summaries.
readability = {}

for category, data in dataset_results.items():
  print(f'Category: {category}')
  # One reading-ease score per generated summary.
  readability_metrics = pd.Series(data[NEW_COLUMN]).map(textstat.flesch_reading_ease)
  mean_score = np.mean(readability_metrics)
  readability[category] = mean_score

  print(f'Mean: {mean_score}')
  print(f'Median: {np.median(readability_metrics)}')
  print(f'Std: {np.std(readability_metrics)}')

Category: low
Mean: 23.16459459459459
Median: 25.12
Std: 29.310812099236138
Category: medium
Mean: 22.490645161290324
Median: 22.08
Std: 19.724177173589897
Category: high
Mean: 24.638137651821857
Median: 27.49
Std: 23.04296967824217


In [None]:
# Group name -> share of generated summaries given each NLI label.
entailment = {}
neutral = {}
contradiction = {}

for category, data in dataset_results.items():
  print(f'Category: {category}')
  # Label every (reference summary, generated summary) pair, then turn the
  # label counts into proportions.
  res = pd.DataFrame(data).\
    apply(lambda x: calculate_entailment(x[LABEL_COLUMN], x[NEW_COLUMN]), axis = 1).\
    value_counts(normalize = True)

  # value_counts omits labels that never occur, so fall back to 0.0 instead of
  # raising KeyError when a group has, for example, no contradictions.
  entailment[category] = res.get('entailment', 0.0)
  neutral[category] = res.get('neutral', 0.0)
  contradiction[category] = res.get('contradiction', 0.0)

  print(res)

Category: low
neutral          0.594595
entailment       0.351351
contradiction    0.054054
Name: proportion, dtype: float64
Category: medium
neutral          0.634409
entailment       0.354839
contradiction    0.010753
Name: proportion, dtype: float64
Category: high
neutral          0.615385
entailment       0.356275
contradiction    0.028340
Name: proportion, dtype: float64


In [None]:
# One row per group, smallest group (fewest evaluated examples) first.
df = pd.DataFrame.from_dict(results, orient='index').sort_values(by='no_examples', ascending=True)

# Attach the per-group readability and NLI proportions, matched on the group
# name held in the index.
extra_columns = {
    'readability': readability,
    'entailment': entailment,
    'neutral': neutral,
    'contradiction': contradiction,
}
for column_name, per_group_values in extra_columns.items():
  df[column_name] = df.index.map(per_group_values)

df.to_csv(RESULTS_NAME)