In [None]:
# Installation Library

!pip install transformers datasets sentencepiece rouge 

In [None]:
import torch
from transformers import pipeline
from datasets import load_dataset
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from transformers import PegasusForConditionalGeneration, PegasusTokenizer

In [None]:
from rouge import Rouge
import pandas as pd

In [None]:
# Download (or reuse the cached copy of) the BillSum dataset from the Hugging Face hub.
# The result is a DatasetDict; displaying it in the next cell shows the
# 'train', 'test' and 'ca_test' splits, each with 'text', 'summary' and 'title'.
from datasets import load_dataset
dataset = load_dataset("billsum")

Downloading:   0%|          | 0.00/1.53k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/832 [00:00<?, ?B/s]

Using custom data configuration default


Downloading and preparing dataset billsum/default (download: 64.14 MiB, generated: 259.80 MiB, post-processed: Unknown size, total: 323.94 MiB) to /root/.cache/huggingface/datasets/billsum/default/3.0.0/d1e95173aed3acb71327864be74ead49b578522e4c7206048b2f2e5351b57959...


Downloading:   0%|          | 0.00/67.3M [00:00<?, ?B/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

Dataset billsum downloaded and prepared to /root/.cache/huggingface/datasets/billsum/default/3.0.0/d1e95173aed3acb71327864be74ead49b578522e4c7206048b2f2e5351b57959. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
# Display the DatasetDict to see the available splits, their features and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'summary', 'title'],
        num_rows: 18949
    })
    test: Dataset({
        features: ['text', 'summary', 'title'],
        num_rows: 3269
    })
    ca_test: Dataset({
        features: ['text', 'summary', 'title'],
        num_rows: 1237
    })
})

In [None]:
# Keep handles on the two splits used below.
# Note that `test` points at the 'ca_test' split, not at the split named 'test'.
train, test = dataset['train'], dataset['ca_test']

In [None]:
# Inspect the first record of the evaluation split (a dict that includes the 'summary' field).
test[0]

{'summary': 'Existing property tax law establishes a veterans’ organization exemption under which property is exempt from taxation if, among other things, that property is used exclusively for charitable purposes and is owned by a veterans’ organization.\nThis bill would provide that the veterans’ organization exemption shall not be denied to a property on the basis that the property is used for fraternal, lodge, or social club purposes, and would make specific findings and declarations in that regard. The bill would also provide that the exemption shall not apply to any portion of a property that consists of a bar where alcoholic beverages are served.\nSection 2229 of the Revenue and Taxation Code requires the Legislature to reimburse local agencies annually for certain property tax revenues lost as a result of any exemption or classification of property for purposes of ad valorem property taxation.\nThis bill would provide that, notwithstanding Section 2229 of the Revenue and Taxatio

In [None]:
# Load the tokenizer and the seq2seq model from the same Pegasus BillSum checkpoint.
checkpoint = "google/pegasus-billsum"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

In [None]:
# Build a summarization pipeline around the model and tokenizer loaded above.
# `model` was created with AutoModelForSeq2SeqLM (a PyTorch class), so the
# framework must not be forced to TensorFlow; leaving `framework` unset lets
# the pipeline infer it from the model object.
summarizer = pipeline("summarization", model=model, tokenizer=tokenizer)

In [None]:
# Summarise the first training bill, asking for 200 to 500 generated tokens.
# truncation=True clips inputs that are longer than the model's maximum input
# length instead of feeding the over-long sequence to the model.
system_summary = summarizer(train[0]['text'], min_length=200, max_length=500, truncation=True)

In [None]:
# Reference summary that the generated text is scored against.
standard_summary = train[0]['summary']
# The pipeline returns a list of dicts; keep only the generated text.
# The guard makes the cell safe to re-run: once `system_summary` is already a
# string, indexing it again with [0]['summary_text'] would raise.
if not isinstance(system_summary, str):
  system_summary = system_summary[0]['summary_text']

In [None]:
# Show the generated summary text for the first training example.
system_summary

'Shields a business entity from civil liability relating to any injury or death occurring at a facility of that entity in connection with a use of such facility by a nonprofit organization if: (1) the use occurs outside the scope of business of the business entity; (2) such injury or death occurs during a period that such facility is used by such organization; and (3) the business entity authorized the use of such facility by the organization. <n>Makes this Act inapplicable to an injury or death that results from an act or omission of a business entity that constitutes gross negligence or intentional misconduct, including misconduct that: (1) constitutes a crime of violence or act of international terrorism for which the defendant has been convicted in any court; or (2) involves a sexual offense for which the defendant has been convicted in any court or misconduct for which the defendant has been found to have violated a Federal or State civil rights law. Declares that this Act shall n

In [None]:
# ROUGE for the single example: generated summary first, reference second.
rouge = Rouge()
score = rouge.get_scores(system_summary, standard_summary)
# Label the rows by the keys the scorer returns ('r', 'p', 'f') rather than by
# position, so the table stays correct whatever order those keys arrive in.
stat_names = {'r': 'recall', 'p': 'precision', 'f': 'f-measure'}
pd.DataFrame(score[0]).rename(index=stat_names).loc[['recall', 'precision', 'f-measure']]

Unnamed: 0,rouge-1,rouge-2,rouge-l
recall,0.756303,0.663594,0.714286
precision,0.967742,0.9,0.913978
f-measure,0.849057,0.763926,0.801887


### **Inference on the Test Set**

In [None]:
# Inputs for the full evaluation run: bill texts and their reference summaries
# from the 'ca_test' split, plus an empty list that collects model output.
data = dataset['ca_test']
CasesText, GoldSummary = data['text'], data['summary']

SystemSummary = list()

In [None]:
# Candidate slices for running inference in three chunks instead of in one pass.
# The slice sizes (500, 300 and 437 of the 1237 ca_test rows) match the shapes
# of the three CSV files read back further down.
# NOTE(review): presumably these were swapped in for CasesText by hand — confirm.
# CasesText[:500]
# CasesText[500:800]
# CasesText[800:]

In [None]:
# Load the Pegasus BillSum checkpoint and move the model to the GPU when one is available.
model_name = "google/pegasus-billsum"

device = "cpu"
if torch.cuda.is_available():
  device = "cuda"

pegasus_tokenizer = PegasusTokenizer.from_pretrained(model_name)
pegasus_model = PegasusForConditionalGeneration.from_pretrained(model_name)
pegasus_model = pegasus_model.to(device)

In [None]:
# Summarise every bill in CasesText with the Pegasus model, one document per
# generate() call, collecting the decoded output in SystemSummary.
# The list is re-created here so that re-running this cell does not append a
# second copy of every summary.
SystemSummary = []

for i, case in enumerate(CasesText):

  # The tokenizer takes a batch, so the single document is wrapped in a list.
  batch = pegasus_tokenizer([case], truncation=True, padding="longest", return_tensors="pt").to(device)
  summary = pegasus_model.generate(**batch)
  # Skip special tokens so that markers such as padding / end-of-sequence do
  # not end up in the saved summaries and distort the ROUGE scores.
  summary_final = pegasus_tokenizer.batch_decode(summary, skip_special_tokens=True)
  # batch_decode returns a list; a later cell reads element [0] of each entry,
  # so the list itself is stored.
  SystemSummary.append(summary_final)

  # Progress indicator: index of the document just processed.
  print(i)

In [None]:
# Flat list that will hold one summary string per document.
SystemSummaryFinal = list()

In [None]:
# Each SystemSummary entry is a one-element list; keep just the string.
# Rebuilding the list (instead of appending to an existing one) keeps the cell
# idempotent: running it twice no longer duplicates every summary.
SystemSummaryFinal = [decoded[0] for decoded in SystemSummary]

In [None]:
# Pair every reference summary with the generated one, row by row.
paired_rows = list(zip(GoldSummary, SystemSummaryFinal))
Summaries = pd.DataFrame(paired_rows, columns=['GoldSummary', 'SystemSummary'])

In [None]:
# Persist the paired summaries; the header row is written by default and the
# row index is left out.
Summaries.to_csv("PegsusSummaries.csv", index=False)

### **Pegasus Model Score on CA_Test**

In [None]:
# Read back the three CSV chunks of saved summaries.
# NOTE(review): these file names differ from the single "PegsusSummaries.csv"
# written earlier, so the files must already exist under /content — confirm how
# they are produced.
chunk_paths = (
    '/content/PegsusSummaries1.csv',
    '/content/PegsusSummaries2.csv',
    '/content/PegsusSummaries3.csv',
)
file1, file2, file3 = map(pd.read_csv, chunk_paths)

In [None]:
# Sanity check: (rows, columns) of each chunk.
tuple(frame.shape for frame in (file1, file2, file3))

((500, 2), (300, 2), (437, 2))

In [None]:
# Stack the three chunks into one frame.
# ignore_index=True renumbers the rows 0..n-1; without it each chunk keeps its
# own 0-based index and the combined frame has duplicate row labels.
PegsusSummaries = pd.concat([file1, file2, file3], ignore_index=True)

In [None]:
# Spot-check five random rows; the fixed seed makes the same rows appear on every run.
PegsusSummaries.sample(5, random_state=0)

Unnamed: 0,GoldSummary,SystemSummary
222,Exiting law requires insurers issuing group or...,"This bill amends the Insurance Code, with resp..."
354,"Under existing law, the California FAIR (fair ...",(This measure has not been amended since it wa...
365,Existing sales and use tax laws impose a tax o...,"Declares that, between January 1, 2017, and Ja..."
209,(1) The California Environmental Quality Act (...,This bill amends the Public Resources Code to ...
181,"Existing law, the Safe Drinking Water State Re...",This bill requires the State of California to ...


In [None]:
# Column of generated summaries and column of reference summaries, as Series.
# These names were used earlier for single strings and are rebound here.
system_summary, standard_summary = (
    PegsusSummaries['SystemSummary'],
    PegsusSummaries['GoldSummary'],
)

In [None]:
# Corpus-level ROUGE: avg=True returns one averaged score dict for all pairs.
rouge = Rouge()
score = rouge.get_scores(hyps=system_summary, refs=standard_summary, avg=True)

In [None]:
# Turn the averaged ROUGE dict into a table of percentages.
# Rows are labelled by the keys the scorer returns ('r', 'p', 'f') rather than
# by position, so the labels stay correct whatever order those keys arrive in.
stat_names = {'r': 'recall', 'p': 'precision', 'f': 'f-measure'}
PegsusRouge = pd.DataFrame(score).rename(index=stat_names).loc[['recall', 'precision', 'f-measure']] * 100

In [None]:
# Display the averaged ROUGE table (values are percentages after the x100 scaling).
PegsusRouge

Unnamed: 0,rouge-1,rouge-2,rouge-l
recall,30.521843,14.370507,26.845739
precision,45.086425,24.3158,39.965022
f-measure,34.2502,16.632646,30.220123
