In [1]:
from datasets import load_dataset
billsum = load_dataset("billsum", split="ca_test")

In [2]:
billsum = billsum.train_test_split(test_size=0.2)

In [21]:
billsum

DatasetDict({
    train: Dataset({
        features: ['text', 'summary', 'title'],
        num_rows: 989
    })
    test: Dataset({
        features: ['text', 'summary', 'title'],
        num_rows: 248
    })
})

In [3]:
from transformers import AutoTokenizer



In [4]:
checkpoint = "google-t5/t5-small"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [5]:
prefix = "summarize: "

def preprocess_function(examples):
    inputs = [prefix + doc for doc in examples["text"]]
    model_inputs = tokenizer(inputs, max_length=1024, truncation=True)

    labels = tokenizer(text_target=examples["summary"], max_length=128, truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [6]:
tokenized_billsum = billsum.map(preprocess_function, batched=True)

Map:   0%|          | 0/989 [00:00<?, ? examples/s]

Map:   0%|          | 0/248 [00:00<?, ? examples/s]

In [7]:
from transformers import DataCollatorForSeq2Seq

In [8]:
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint)

In [10]:
import evaluate
rouge = evaluate.load("rouge")

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

In [11]:
import numpy as np

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

In [12]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer

model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [26]:
training_args = Seq2SeqTrainingArguments(
    output_dir="output_models",
    evaluation_strategy="epoch",
    save_strategy="epoch", # Let's see if this makes a difference
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=4,
    predict_with_generate=True,
    fp16=True,
    push_to_hub=False,
)

In [27]:
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_billsum["train"],
    eval_dataset=tokenized_billsum["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [28]:
trainer.train()

Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,No log,2.48484,0.1479,0.0564,0.1235,0.1234,19.0
2,No log,2.44457,0.1783,0.0776,0.1494,0.1494,19.0
3,No log,2.425741,0.1901,0.088,0.1594,0.1592,19.0
4,No log,2.418828,0.1897,0.0883,0.1587,0.1586,19.0




TrainOutput(global_step=248, training_loss=2.60229738297001, metrics={'train_runtime': 23926.2496, 'train_samples_per_second': 0.165, 'train_steps_per_second': 0.01, 'total_flos': 1070824333246464.0, 'train_loss': 2.60229738297001, 'epoch': 4.0})

In [1]:
# Now that we've finished training, let's do some inference using some data!
text = "summarize: The Inflation Reduction Act lowers prescription drug costs, health care costs, and energy costs. It's the most aggressive action on tackling the climate crisis in American history, which will lift up American workers and create good-paying, union jobs across the country. It'll lower the deficit and ask the ultra-wealthy and corporations to pay their fair share. And no one making under $400,000 per year will pay a penny more in taxes."

In [3]:
from transformers import pipeline
summarizer = pipeline("summarization", model="output_models/checkpoint-248")
summarizer(text)

Your max_length is set to 200, but your input_length is only 103. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=51)


[{'summary_text': "the Inflation Reduction Act lowers prescription drug costs, health care costs, and energy costs. It's the most aggressive action on tackling the climate crisis in American history, which will lift up American workers and create good-paying, union jobs across the country."}]

In [4]:
text_2 = "Kombucha (also tea mushroom, tea fungus, or Manchurian mushroom when referring to the culture; Latin name Medusomyces gisevii)[1] is a fermented, lightly effervescent, sweetened black tea drink. Sometimes the beverage is called kombucha tea to distinguish it from the culture of bacteria and yeast.[2] Juice, spices, fruit or other flavorings are often added."

In [5]:
summarizer(text_2)

Your max_length is set to 200, but your input_length is only 103. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=51)


[{'summary_text': 'Kombucha tea is a fermented, lightly effervescent, sweetened black tea drink . Juice, spices, fruit or other flavorings are often added to the beverage .'}]

In [8]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("output_models/checkpoint-248")
inputs = tokenizer(text, return_tensors="pt").input_ids

In [10]:
from transformers import AutoModelForSeq2SeqLM
model = AutoModelForSeq2SeqLM.from_pretrained("output_models/checkpoint-248")
outputs = model.generate(inputs, max_new_tokens=100, do_sample=False)

In [11]:
outputs

tensor([[    0,    37,    86,    89,  6105,   419,  8291,  1983,  1364,     7,
          7744,  2672,  1358,     6,   533,   124,  1358,     6,    11,   827,
          1358,     5,    94,    31,     7,     8,   167,  8299,  1041,    30,
             3, 26074,     8,  3298,  5362,    16,   797,   892,     5,    94,
            31,   195,   987,     8,  6173,    18,  1123,   138,   189,    63,
            11, 11711,    12,   726,    70,  2725,   698,     5,     1]])

In [12]:
tokenizer.decode(outputs[0], skip_special_tokens=True)

"The Inflation Reduction Act lowers prescription drug costs, health care costs, and energy costs. It's the most aggressive action on tackling the climate crisis in American history. It'll ask the ultra-wealthy and corporations to pay their fair share."