# Pegasus Training on Bill-117 Dataset

## Setup

In [None]:
# Mount Google Drive so files under /content/drive (token file, dataset clone)
# are reachable from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install -q datasets evaluate accelerate peft bitsandbytes

In [None]:
from transformers import DataCollatorForSeq2Seq, PegasusTokenizer, PegasusForConditionalGeneration
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer
from huggingface_hub import notebook_login
from datasets import load_dataset
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import evaluate
from peft import get_peft_model, LoraConfig, TaskType
import torch
import os

sns.set_theme()

In [None]:
# Read the Hugging Face access token from a file on Drive instead of
# hard-coding it in the notebook.
with open("/content/drive/MyDrive/colab-notebooks/w266/hf.txt", "r") as f:
    # Strip surrounding whitespace: a trailing newline left by an editor would
    # otherwise become part of the token and of the HF_TOKEN env variable.
    HF_TOKEN = f.read().strip()

# Fail now rather than after a long training run when the push is attempted.
if not HF_TOKEN:
    raise ValueError("Hugging Face token file is empty")

os.environ["HF_TOKEN"] = HF_TOKEN

In [None]:
# ====== ENVIRONMENT ======
# DEV switches the step intervals below to their short development values.
DEV = False
# NOTE(review): EXPLORE is not read by any cell visible here — confirm it is still needed.
EXPLORE = False
# PEFT selects the LoRA variant of the model name below.
PEFT = False

# Seed NumPy and PyTorch (CPU and every CUDA device) with one shared value.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)
torch.cuda.manual_seed_all(RANDOM_SEED)

# ====== DATA =======
# Dataset columns holding the source text and the reference summary.
INPUT_COLUMN = "cur_text"
LABEL_COLUMN = "cleaned_summary"
# Token limits for the source text and for the summary.
INPUT_MAX_LENGTH = 512
LABEL_MAX_LENGTH = 128

# ====== MODEL ======
CHECKPOINT = "google/pegasus-xsum"
PATH = '/content/drive/MyDrive/colab-notebooks/w266/'
MODEL_NAME = "pegasus-lora-legalease" if PEFT else "pegasus-legalease"
HUGGINGFACE_DIR = f"etav22/{MODEL_NAME}"
CUSTOM_NAME = "pegasus-baseline-128"

# ====== OPTIMIZER =======
OPTIMIZER = "adamw_torch"
LEARNING_RATE = 2e-5
WEIGHT_DECAY = 0.01

# ====== TRAINING ======
EVAL_STRATEGY = "steps"
BATCH_SIZE = 4
EPOCHS = 2
if DEV:
    SAVE_STEPS = EVAL_STEPS = LOGGING_STEPS = 100
else:
    SAVE_STEPS, EVAL_STEPS, LOGGING_STEPS = 1000, 250, 500
EARLY_STOPPING_PATIENCE = 3
EARLY_STOPPING_THRESHOLD = 0.005

# Echo the key choices so the run is identifiable from the cell output.
print(
    f"Using {CHECKPOINT} model",
    f"Column names: {INPUT_COLUMN}, {LABEL_COLUMN}",
    f"Model name: {MODEL_NAME}",
    f"Custom name: {CUSTOM_NAME}",
    sep="\n",
)

Using google/pegasus-xsum model
Column names: cur_text, cleaned_summary
Model name: pegasus-legalease
Custom name: pegasus-baseline-128


## Dataset setup

In [None]:
!mkdir -p /content/drive/MyDrive/colab-notebooks/w266/data_v3
!git clone https://huggingface.co/datasets/jordanfan/processed_us_congress_117_bills_v3 /content/drive/MyDrive/colab-notebooks/w266/data_v3

fatal: destination path '/content/drive/MyDrive/colab-notebooks/w266/data_v3' already exists and is not an empty directory.


In [None]:
# Load the dataset from the local clone on Drive, then display the training
# split (its columns and row count) as the cell output.
dataset = load_dataset(path="/content/drive/MyDrive/colab-notebooks/w266/data_v3/data")
dataset["train"]

Dataset({
    features: ['Unnamed: 0', 'index', 'id', 'policy_areas', 'cur_summary', 'cur_text', 'title', 'titles_official', 'titles_short', 'sponsor_name', 'sponsor_party', 'sponsor_state', 'cleaned_summary', 'extracted_text', 'extracted_text_375', 'extracted_text_750', 'extracted_text_1000', 'bertsum_extracted_250', 'bertsum_extracted_375', 'bertsum_extracted_375_1000', 'bertsum_extracted_250_1000', 'bertsum_extracted_375_750', 'bertsum_extracted_250_750', 'bertsum_extracted_375_500', 'bertsum_extracted_250_500', 'bertsum_extracted_375_375', 'bertsum_extracted_250_375'],
    num_rows: 11277
})

## Pegasus Model

In [None]:
# Load the pretrained Pegasus weights and the matching tokenizer from CHECKPOINT.
model = PegasusForConditionalGeneration.from_pretrained(CHECKPOINT)
tokenizer = PegasusTokenizer.from_pretrained(CHECKPOINT)

# Generation settings stored on the model config: beam search with 4 beams,
# output capped at the same length used for the tokenized summaries.
model.config.num_beams = 4
model.config.max_length = LABEL_MAX_LENGTH

# Display the resulting configuration as the cell output.
model.config

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-xsum and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


PegasusConfig {
  "_name_or_path": "google/pegasus-xsum",
  "activation_dropout": 0.1,
  "activation_function": "relu",
  "add_bias_logits": false,
  "add_final_layer_norm": true,
  "architectures": [
    "PegasusForConditionalGeneration"
  ],
  "attention_dropout": 0.1,
  "bos_token_id": 0,
  "classif_dropout": 0.0,
  "classifier_dropout": 0.0,
  "d_model": 1024,
  "decoder_attention_heads": 16,
  "decoder_ffn_dim": 4096,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 16,
  "decoder_start_token_id": 0,
  "do_blenderbot_90_layernorm": false,
  "dropout": 0.1,
  "encoder_attention_heads": 16,
  "encoder_ffn_dim": 4096,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 16,
  "eos_token_id": 1,
  "extra_pos_embeddings": 0,
  "force_bos_token_to_be_generated": false,
  "forced_eos_token_id": 1,
  "gradient_checkpointing": false,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
  },
  "init_std": 0.02,
  "is_encoder_decoder": true,
  "label2id": {
    "LABEL_0": 0,


## Prepare Dataset

In [None]:
# Tokenize the entire dataset
def tokenize_function(examples):
    """Tokenize one batch of source texts and reference summaries.

    No padding is applied here. Padding at this stage would write pad-token
    ids into ``labels``, and those positions would then be scored by the loss.
    Leaving sequences unpadded lets ``DataCollatorForSeq2Seq`` pad each
    training batch dynamically and fill label padding with -100, which the
    loss ignores.

    Args:
        examples: Batch of dataset columns (called with ``batched=True``);
            reads ``INPUT_COLUMN`` and ``LABEL_COLUMN``.

    Returns:
        The tokenizer output for the inputs, truncated to ``INPUT_MAX_LENGTH``,
        with an added ``"labels"`` entry holding the summary token ids
        truncated to ``LABEL_MAX_LENGTH``.
    """
    # return_tensors is intentionally omitted: Dataset.map stores plain lists,
    # and tensors cannot be built from unpadded, variable-length sequences.
    model_inputs = tokenizer(
        examples[INPUT_COLUMN],
        max_length=INPUT_MAX_LENGTH,
        truncation=True,
    )
    labels = tokenizer(
        text_target=examples[LABEL_COLUMN],
        max_length=LABEL_MAX_LENGTH,
        truncation=True,
    )
    model_inputs["labels"] = labels["input_ids"]

    return model_inputs

tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Pass the model object (not the checkpoint name) so the collator can call the
# model's prepare_decoder_input_ids_from_labels when building each batch.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

## Train Model

### PEFT (LoRA) Configuration

In [None]:
def find_target_modules(model):
    """Collect the leaf names of all Linear-like submodules of ``model``.

    A submodule is selected when the string form of its type contains
    ``"Linear"``. This matches ``torch.nn.Linear`` and any other class whose
    qualified name includes that substring (for example a quantized
    ``Linear4bit`` layer).

    Args:
        model: Object exposing ``named_modules()`` that yields
            ``(dotted_name, module)`` pairs, e.g. a ``torch.nn.Module``.

    Returns:
        Alphabetically sorted list of the unique last components of the
        matching dotted names (``"a.b.q_proj"`` contributes ``"q_proj"``).
        Sorting keeps the result stable across runs, unlike the iteration
        order of a set of strings.
    """
    return sorted(
        {
            # Keep only the final path component, i.e. the layer's own name.
            name.split(".")[-1]
            for name, module in model.named_modules()
            if "Linear" in str(type(module))
        }
    )

# Collect the Linear-like layer names of the loaded model and print them;
# this list is what the LoRA config cell passes as target_modules.
modules = find_target_modules(model)
print(modules)

['out_proj', 'lm_head', 'fc1', 'fc2', 'v_proj', 'q_proj', 'k_proj']


In [None]:
# Wrap the base model with LoRA adapters only when the PEFT flag is set;
# otherwise the full model is fine-tuned as loaded.
if PEFT:
    peft_config = LoraConfig(
        r=16,
        lora_alpha=32,
        lora_dropout=0.1,
        # Layer names gathered by find_target_modules.
        target_modules=modules,
        task_type=TaskType.SEQ_2_SEQ_LM,
        # Training mode: the adapter weights are meant to be updated.
        inference_mode=False,
    )
    model = get_peft_model(model, peft_config)

    # Report how many parameters remain trainable after wrapping.
    model.print_trainable_parameters()

### Training Configurations

In [None]:
from transformers import EarlyStoppingCallback

# Early-stopping callback configured from the patience and threshold constants
# defined in the configuration cell.
early_stopping = EarlyStoppingCallback(
    early_stopping_threshold=EARLY_STOPPING_THRESHOLD,
    early_stopping_patience=EARLY_STOPPING_PATIENCE,
)

In [None]:
# Training arguments assembled from the configuration cell. OPTIMIZER,
# LOGGING_STEPS and RANDOM_SEED are passed explicitly so that editing those
# constants (or setting DEV=True) actually changes the run.
training_args = Seq2SeqTrainingArguments(
    output_dir=HUGGINGFACE_DIR,
    # Evaluate and checkpoint on the same strategy; load_best_model_at_end
    # needs a saved checkpoint for the evaluation it picks as best.
    evaluation_strategy=EVAL_STRATEGY,
    save_strategy=EVAL_STRATEGY,
    save_steps=SAVE_STEPS,
    eval_steps=EVAL_STEPS,
    logging_steps=LOGGING_STEPS,
    optim=OPTIMIZER,
    learning_rate=LEARNING_RATE,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    weight_decay=WEIGHT_DECAY,
    load_best_model_at_end=True,
    metric_for_best_model='eval_loss',
    num_train_epochs=EPOCHS,
    seed=RANDOM_SEED,
    fp16=True,
    report_to=["tensorboard"],
    push_to_hub=True,
    hub_token=HF_TOKEN
)

In [None]:
# Assemble the trainer from the model, tokenized splits, collator and
# early-stopping callback prepared in the cells above.
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    callbacks=[early_stopping],
)

# Run fine-tuning; the returned TrainOutput is displayed as the cell output.
trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss,Validation Loss
250,No log,4.995386
500,5.259200,4.317471
750,5.259200,1.307422
1000,2.381900,1.198699
1250,2.381900,1.16778
1500,1.311300,1.149074
1750,1.311300,1.136877
2000,1.215800,1.127331
2250,1.215800,1.116548
2500,1.211900,1.113659


Non-default generation parameters: {'max_length': 128, 'num_beams': 4, 'length_penalty': 0.6, 'forced_eos_token_id': 1}
Non-default generation parameters: {'max_length': 128, 'num_beams': 4, 'length_penalty': 0.6, 'forced_eos_token_id': 1}
Non-default generation parameters: {'max_length': 128, 'num_beams': 4, 'length_penalty': 0.6, 'forced_eos_token_id': 1}
There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].


TrainOutput(global_step=3750, training_loss=1.922697998046875, metrics={'train_runtime': 4901.8577, 'train_samples_per_second': 4.601, 'train_steps_per_second': 1.151, 'total_flos': 2.166664890993869e+16, 'train_loss': 1.922697998046875, 'epoch': 1.33})

## Upload Model

In [None]:
# Write the model card (README) for the fine-tuned model.
# NOTE(review): the training data is cloned from
# jordanfan/processed_us_congress_117_bills_v3, while the card names
# hheiden/us-congress-117-bills — confirm which dataset should be credited.
trainer.create_model_card(
    # Model-card metadata expects a language code, not the language's name.
    language='en',
    model_name=MODEL_NAME,
    finetuned_from=CHECKPOINT,
    tasks='summarization',
    tags='summarization',
    dataset='hheiden/us-congress-117-bills',
    dataset_args=f"Max token input: {INPUT_MAX_LENGTH} | {LABEL_MAX_LENGTH}"
)

In [None]:
# Tag the commit message with the run mode so dev and prod pushes are
# distinguishable in the Hub history.
commit_msg = (
    f"training completed[dev]: {CUSTOM_NAME}"
    if DEV
    else f"training completed[prod]: {CUSTOM_NAME}"
)

# Upload the trained model to the Hub; the CommitInfo is the cell output.
trainer.push_to_hub(commit_message=commit_msg)

Non-default generation parameters: {'max_length': 128, 'num_beams': 4, 'length_penalty': 0.6, 'forced_eos_token_id': 1}


events.out.tfevents.1712346352.9e6d5516626c.3600.0:   0%|          | 0.00/11.4k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/etav22/pegasus-legalease/commit/c9dc406239283b65e173ccd7c915dabe141d5ea9', commit_message='training completed[prod]: pegasus-baseline-128', commit_description='', oid='c9dc406239283b65e173ccd7c915dabe141d5ea9', pr_url=None, pr_revision=None, pr_num=None)