# Readability and Entailment Evaluation

## Setup

In [None]:
# Mount Google Drive so files under /content/drive (e.g. the token file read
# in the configuration cell) are reachable from this Colab runtime.
from google.colab import drive

drive.mount(mountpoint='/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install -q evaluate transformers datasets rouge_score jiwer
!pip install -q git+https://github.com/google-research/bleurt.git
!pip install -q nltk textstat peft

  Preparing metadata (setup.py) ... [?25l[?25hdone


In [None]:
import os
from datasets import load_dataset
from evaluate import evaluator
import evaluate
from transformers import AutoModelForSeq2SeqLM, pipeline, AutoTokenizer, AutoModelForSequenceClassification
from peft import AutoPeftModelForSeq2SeqLM
import nltk
nltk.download('punkt')
import textstat
import torch
import pprint
import numpy as np
import pandas as pd

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# Read the Hugging Face access token from Drive instead of hardcoding it.
# The file content is stripped because a trailing newline (which most editors
# append) would otherwise become part of the token.
with open("/content/drive/MyDrive/colab-notebooks/w266/hf.txt", "r") as f:
    HF_TOKEN = f.read().strip()

# Exported through the environment for libraries that read HF_TOKEN from it.
os.environ["HF_TOKEN"] = HF_TOKEN

# Dataset and summarization checkpoint under evaluation.
DATASET = "jordanfan/processed_us_congress_117_bills_v3"
MODEL = "google/pegasus-xsum"
PEFT = False       # True -> MODEL is loaded through AutoPeftModelForSeq2SeqLM
REVISION = "main"  # Hub revision passed to from_pretrained

# Dataset column names: source text, reference summary, and the column that
# will hold the generated summary.
INPUT_COLUMN = "cur_text"
LABEL_COLUMN = "cleaned_summary"
NEW_COLUMN = "generated_summary"

# Use the GPU when one is available, otherwise fall back to CPU.
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

## Generation/Entailment Functions

In [None]:
# Tokenizer and classifier are loaded from the same checkpoint name so the
# two cannot drift apart.
ENTAILMENT_CHECKPOINT = "khalidalt/DeBERTa-v3-large-mnli"

entailment_tokenizer = AutoTokenizer.from_pretrained(ENTAILMENT_CHECKPOINT)
entailment_model = AutoModelForSequenceClassification.from_pretrained(ENTAILMENT_CHECKPOINT)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
def calculate_entailment(
    premise,
    hypothesis,
    label_names = ["entailment", "neutral", "contradiction"]
):
  """Classify the relation between a premise and a hypothesis.

  The pair is tokenized with ``entailment_tokenizer`` (with truncation), run
  through ``entailment_model`` on ``DEVICE``, and the name at the index of
  the highest-scoring logit is returned.

  Args:
    premise: Text treated as the premise.
    hypothesis: Text checked against the premise.
    label_names: Names indexed by the model's predicted class id. The default
      order is assumed to match the checkpoint's output order -- TODO confirm
      against ``entailment_model.config.id2label``.

  Returns:
    The element of ``label_names`` selected by the model's prediction.
  """
  encoded = entailment_tokenizer(premise, hypothesis, truncation=True, return_tensors="pt")
  encoded = encoded.to(DEVICE)
  # Moving a model that already lives on DEVICE is a no-op, so this is safe to
  # repeat on every call.
  entailment_model.to(DEVICE)

  # Inference only: skip autograd bookkeeping to save memory and time.
  with torch.no_grad():
    # Forward the whole encoding (attention mask and any segment ids), not
    # just the input ids.
    output = entailment_model(**encoded)

  # Softmax is monotonic, so the argmax over raw logits picks the same class.
  predicted_index = output["logits"][0].argmax(-1).item()

  return label_names[predicted_index]

def encode_decode(df):
    """Generate a summary for one example and return it under NEW_COLUMN.

    Args:
        df: Mapping that holds the source text under ``INPUT_COLUMN``.
            Presumably a single dataset example rather than a DataFrame --
            verify against the caller.

    Returns:
        A dict ``{NEW_COLUMN: summary}`` where ``summary`` is the decoded
        first sequence produced by ``model.generate``.
    """
    # Inputs longer than 512 tokens are truncated.
    inputs = tokenizer(df[INPUT_COLUMN], return_tensors = "pt", max_length = 512, truncation=True)
    inputs = inputs.to(DEVICE)
    # Pass the full encoding so the attention mask travels with the input ids
    # instead of being inferred by generate().
    summary_ids = model.generate(**inputs, max_length = 128, num_beams = 4)
    result = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return {NEW_COLUMN: result}

## Data, Model, and Mapping

In [None]:
# Load only the test split of the configured dataset.
dataset_test = load_dataset(DATASET, split="test")

# Keep the identifier, policy area, source text and reference summary;
# every other column is dropped.
cols_to_keep = ["index", "policy_areas", INPUT_COLUMN, LABEL_COLUMN]
cols_to_remove = [
    name
    for name in dataset_test.column_names
    if name not in cols_to_keep
]
dataset_test = dataset_test.remove_columns(cols_to_remove)

In [None]:
# Load the summarization model. Both branches pin REVISION so the model and
# the tokenizer below always come from the same Hub revision.
if PEFT:
  model = AutoPeftModelForSeq2SeqLM.from_pretrained(MODEL, revision=REVISION)
else:
  model = AutoModelForSeq2SeqLM.from_pretrained(MODEL, revision=REVISION)

model.to(DEVICE)
# model_max_length=512 matches the truncation length used in encode_decode.
tokenizer = AutoTokenizer.from_pretrained(MODEL, revision=REVISION, model_max_length=512)

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-xsum and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Run encode_decode over every example; the dict it returns adds NEW_COLUMN.
dataset_test = dataset_test.map(function=encode_decode)
dataset_test

Map:   0%|          | 0/377 [00:00<?, ? examples/s]

Dataset({
    features: ['index', 'policy_areas', 'cur_text', 'cleaned_summary', 'generated_summary'],
    num_rows: 377
})

## Calculate Readability and Entailment

In [None]:
# Flesch reading-ease score for each generated summary.
readability_metrics = (
    pd.Series(dataset_test[NEW_COLUMN])
    .map(textstat.flesch_reading_ease)
)

# Summary statistics over the per-summary scores.
print(f'Mean: {np.mean(readability_metrics)}')
print(f'Median: {np.median(readability_metrics)}')
print(f'Std: {np.std(readability_metrics)}')

Mean: 47.38140583554377
Median: 44.41
Std: 30.047931883749833


In [None]:
# Label each (reference summary, generated summary) pair, using the reference
# as premise and the generated text as hypothesis, then report the share of
# each label.
results = (
    pd.DataFrame(dataset_test)
    .apply(
        lambda row: calculate_entailment(row[LABEL_COLUMN], row[NEW_COLUMN]),
        axis=1,
    )
    .value_counts(normalize=True)
)

print(results)

neutral          0.859416
entailment       0.122016
contradiction    0.018568
Name: proportion, dtype: float64
