In [2]:
from datasets import load_dataset

# Load only the "ca_test" split of the BillSum dataset; it is re-split into
# train/test portions in the next cell.
billsum = load_dataset("billsum", split="ca_test")

In [3]:
# Hold out 20% of the loaded examples as a test set. The fixed seed makes the
# shuffle deterministic, so the same examples land in each split on every run.
# NOTE: this rebinds `billsum` to the split result; re-run the loading cell
# before re-running this one.
billsum = billsum.train_test_split(test_size=0.2, seed=42)

In [4]:
# Inspect the first example of the training split.
billsum["train"][0]

{'text': 'The people of the State of California do enact as follows:\n\n\nSECTION 1.\nSection 23663 of the Revenue and Taxation Code is amended to read:\n23663.\n(a) (1) Notwithstanding any other law\nto the contrary\n, for each taxable year beginning on or after July 1, 2008, any credit allowed to a taxpayer under this chapter that is an eligible credit may be assigned by that taxpayer to any eligible assignee.\n(2) A credit assigned under paragraph (1) may\nonly\nbe applied by the eligible assignee\nonly\nagainst the\n“tax” (as\n“tax,” as\ndefined in Section\n23036)\n23036,\nof the eligible assignee in a taxable year beginning on or after January 1, 2010.\n(3) Except as specifically provided in this section, following an assignment of any eligible credit under this section, the eligible assignee shall be treated as if it originally earned the assigned credit.\n(b) For purposes of this section, the following definitions shall apply:\n(1) “Affiliated corporation” means a corporation th

In [23]:
# Inspect the first example of the held-out test split.
billsum["test"][0]

{'text': 'The people of the State of California do enact as follows:\n\n\nSECTION 1.\nSection 222.5 of the Code of Civil Procedure is amended to read:\n222.5.\n(a) To select a fair and impartial jury in civil jury trials, the court shall examine the prospective jurors. Upon completion of the court’s initial examination, counsel for each party shall have the right to examine, by oral and direct questioning, any of the prospective jurors so that counsel may intelligently exercise both peremptory challenges and challenges for cause. During any examination conducted by counsel for the parties, the court should permit liberal and probing examination calculated to discover bias or prejudice with regard to the circumstances of the particular case. The fact that a topic has been included in the court’s examination should not preclude additional nonrepetitive or nonduplicative questioning in the same area by counsel.\n(b) To help facilitate the jury selection process, the court in civil trials 

In [6]:
from transformers import AutoTokenizer

# Checkpoint identifier reused below for the tokenizer, the data collator
# and the model.
checkpoint = "google-t5/t5-small"
# Tokenizer matching the checkpoint; used for encoding inputs/targets and for
# decoding predictions in compute_metrics.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [9]:
# Task prefix prepended to every input document in preprocess_function.
prefix = "summarize: "


def preprocess_function(examples):
    """Tokenize a batch of documents and their reference summaries.

    Args:
        examples: Batch with a "text" column (source documents) and a
            "summary" column (reference summaries).

    Returns:
        The tokenized inputs, extended with a "labels" entry that holds the
        token ids of the tokenized summaries.
    """
    # Prepend the task prefix to every document in the batch.
    prompts = [prefix + document for document in examples["text"]]

    # Inputs are truncated to at most 1024 tokens.
    encoded_inputs = tokenizer(prompts, truncation=True, max_length=1024)
    print(encoded_inputs[0])  # debug output: first encoding of the batch

    # Targets (summaries) are truncated to at most 128 tokens.
    encoded_targets = tokenizer(
        text_target=examples["summary"],
        truncation=True,
        max_length=128,
    )
    print(encoded_targets[0])  # debug output: first target encoding of the batch

    encoded_inputs["labels"] = encoded_targets["input_ids"]
    return encoded_inputs

In [10]:
# Run preprocess_function over the dataset; batched=True hands the function
# whole batches of examples instead of one example at a time.
tokenized_billsum = billsum.map(preprocess_function, batched=True)

Map:   0%|          | 0/989 [00:00<?, ? examples/s]

Encoding(num_tokens=1024, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])
Encoding(num_tokens=72, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])


Map: 100%|██████████| 989/989 [00:01<00:00, 956.99 examples/s]
Map: 100%|██████████| 248/248 [00:00<00:00, 951.60 examples/s]

Encoding(num_tokens=1024, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])
Encoding(num_tokens=128, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])





In [11]:
from transformers import DataCollatorForSeq2Seq

# Collator that batches the tokenized examples for seq2seq training.
# NOTE(review): `model` receives the checkpoint name (a string) here, not a
# loaded model object - confirm this is intended; the model itself is only
# instantiated in a later cell.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint)

In [20]:
import evaluate

# Load the ROUGE metric; compute_metrics calls rouge.compute(...) on decoded text.
rouge = evaluate.load("rouge")

In [21]:
import numpy as np


def compute_metrics(eval_pred):
    """Compute ROUGE scores and the mean generated length for an evaluation run.

    Args:
        eval_pred: Pair ``(predictions, labels)`` of token-id arrays. If
            ``predictions`` is a tuple, its first element is taken as the
            token ids.

    Returns:
        dict mapping each ROUGE metric name, plus ``"gen_len"`` (mean number
        of non-padding tokens per prediction), to a value rounded to 4 decimals.
    """
    predictions, labels = eval_pred

    # Some models hand back a tuple of outputs; the token ids come first
    # (same handling as the official transformers summarization example).
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    pad_id = tokenizer.pad_token_id

    # -100 is a padding/ignore marker, not a real token id, so it cannot be
    # decoded. Replace it with the pad token in both arrays before decoding;
    # this also keeps such positions out of the gen_len count below.
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    # Length of each prediction, counted as its number of non-padding tokens.
    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

In [25]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer

# Load the pretrained sequence-to-sequence model from the same checkpoint
# that the tokenizer was loaded from.
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)