In [1]:
from huggingface_hub import notebook_login, login

In [2]:
notebook_login()

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /home/geshi/.cache/huggingface/token
Login successful


In [3]:
from datasets import load_dataset

billsum = load_dataset("billsum", split="ca_test")

In [4]:
billsum = billsum.train_test_split(test_size=0.2)

In [5]:
billsum["train"][0]

{'text': 'The people of the State of California do enact as follows:\n\n\nSECTION 1.\nSection 7522.02 of the Government Code is amended to read:\n7522.02.\n(a) (1) Notwithstanding any other law, except as provided in this article, on and after January 1, 2013, this article shall apply to all state and local public retirement systems and to their participating employers, including the Public Employees’ Retirement System, the State Teachers’ Retirement System, the Legislators’ Retirement System, the Judges’ Retirement System, the Judges’ Retirement System II, county and district retirement systems created pursuant to the County Employees Retirement Law of 1937 (Chapter 3 (commencing with Section 31450) of Part 3 of Division 4 of Title 3), independent public retirement systems, and to individual retirement plans offered by public employers. However, this article shall be subject to the Internal Revenue Code and Section 17 of Article XVI of the California Constitution. The administration o

In [6]:
from transformers import AutoTokenizer

In [7]:
checkpoint = "google-t5/t5-small"

In [8]:
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [9]:
prefix = "summarize: "

def preprocess_function(examples):
    inputs = [prefix + doc for doc in examples["text"]]
    model_inputs = tokenizer(inputs, max_length=1024, truncation=True)

    labels = tokenizer(text_target=examples["summary"], max_length=128, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [10]:
tokenized_billsum = billsum.map(preprocess_function, batched=True)

Map:   0%|          | 0/989 [00:00<?, ? examples/s]

Map:   0%|          | 0/248 [00:00<?, ? examples/s]

In [11]:
from transformers import DataCollatorForSeq2Seq

In [12]:
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint)

In [13]:
import evaluate
import numpy as np

In [14]:
rouge = evaluate.load("rouge")

In [15]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

# Train

In [16]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer

In [17]:
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

In [18]:
training_args = Seq2SeqTrainingArguments(
    output_dir="t5_billsum_model",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=4,
    predict_with_generate=True,
    fp16=True,
    push_to_hub=True,
)

In [19]:
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_billsum["train"],
    eval_dataset=tokenized_billsum["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [20]:
trainer.train()

[2024-08-30 18:33:12,829] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)


/usr/bin/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/geshi/.netrc




Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,No log,3.158419,0.1359,0.0442,0.1156,0.1155,19.0
2,No log,2.830211,0.1315,0.0407,0.111,0.1112,19.0
3,No log,2.716018,0.133,0.0426,0.1129,0.1127,19.0
4,No log,2.687833,0.1347,0.0445,0.1138,0.1138,19.0




TrainOutput(global_step=124, training_loss=3.29980714859501, metrics={'train_runtime': 1154.5231, 'train_samples_per_second': 3.427, 'train_steps_per_second': 0.107, 'total_flos': 1070824333246464.0, 'train_loss': 3.29980714859501, 'epoch': 4.0})

In [21]:
trainer.push_to_hub()

events.out.tfevents.1725067993.cajal.1852258.0:   0%|          | 0.00/8.31k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/geshijoker/t5_billsum_model/commit/43776de2538ea8854db02981b8c26cb58539876b', commit_message='End of training', commit_description='', oid='43776de2538ea8854db02981b8c26cb58539876b', pr_url=None, pr_revision=None, pr_num=None)

# Inference

In [22]:
text = "summarize: The Inflation Reduction Act lowers prescription drug costs, health care costs, and energy costs. It's the most aggressive action on tackling the climate crisis in American history, which will lift up American workers and create good-paying, union jobs across the country. It'll lower the deficit and ask the ultra-wealthy and corporations to pay their fair share. And no one making under $400,000 per year will pay a penny more in taxes."

In [23]:
from transformers import pipeline

In [24]:
summarizer = pipeline("summarization", model="geshijoker/t5_billsum_model")

config.json:   0%|          | 0.00/1.52k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/117 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/20.7k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [25]:
summarizer(text)

Your max_length is set to 200, but your input_length is only 103. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=51)


[{'summary_text': "the Inflation Reduction Act lowers prescription drug costs, health care costs, and energy costs . it's the most aggressive action on tackling the climate crisis in history . no one making under $400,000 per year will pay a penny more in taxes ."}]

In [26]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

In [27]:
tokenizer = AutoTokenizer.from_pretrained("geshijoker/t5_billsum_model")

In [28]:
inputs = tokenizer(text, return_tensors="pt").input_ids

In [29]:
model = AutoModelForSeq2SeqLM.from_pretrained("geshijoker/t5_billsum_model")

In [30]:
outputs = model.generate(inputs, max_new_tokens=100, do_sample=False)

In [31]:
tokenizer.decode(outputs[0], skip_special_tokens=True)

"the Inflation Reduction Act lowers prescription drug costs, health care costs, and energy costs. it's the most aggressive action on tackling the climate crisis in American history. it'll ask the ultra-wealthy and corporations to pay their fair share."