# Setup

In [None]:
# Mount Google Drive into the Colab VM at /content/drive (Colab-only import).
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install -q datasets evaluate accelerate peft bitsandbytes

In [None]:
import os
import pprint
from collections import Counter

import evaluate
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import torch
from datasets import load_dataset
from huggingface_hub import notebook_login
from peft import get_peft_model, LoraConfig, TaskType
from transformers import DataCollatorForSeq2Seq, AutoModelForSeq2SeqLM, pipeline, AutoTokenizer
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer
from transformers import T5Tokenizer, T5ForConditionalGeneration
# NOTE: the next line re-imports names already imported above; kept so no existing import is dropped.
from transformers import DataCollatorForSeq2Seq, AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
from transformers import EarlyStoppingCallback

sns.set_theme()

In [None]:
# Interactive Hugging Face Hub login; the training arguments below set push_to_hub=True.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# ====== ENVIRONMENT ======
DEV = False      # True -> use the short save/eval/logging intervals below
EXPLORE = False  # True -> run the dataset-exploration cells
PEFT = True      # NOTE(review): not read by any cell visible in this notebook
RANDOM_SEED = 42

# Seed NumPy and torch (CPU and every CUDA device) from the same value.
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)
torch.cuda.manual_seed_all(RANDOM_SEED)

# ====== DATA =======
INPUT_MAX_LENGTH = 512   # max_length used when tokenizing the bill text
LABEL_MAX_LENGTH = 128   # max_length used when tokenizing the summary
INPUT_COLUMN = "cur_text"
LABEL_COLUMN = "cleaned_summary"

# ====== MODEL ======
CHECKPOINT = "t5-base"
PATH = '/content/drive/MyDrive/colab-notebooks/w266/'
MODEL_NAME = "T5-PEFT-4"
HUGGINGFACE_DIR = f"jgibb/{MODEL_NAME}"
CUSTOM_NAME = MODEL_NAME

# ====== OPTIMIZER =======
OPTIMIZER = "adamw_torch"
LEARNING_RATE = 2e-4
WEIGHT_DECAY = 0.01

# ====== TRAINING ======
EVAL_STRATEGY = "steps"
BATCH_SIZE = 8
EPOCHS = 2
# Step intervals: uniform and short in DEV mode, sparser for a full run.
if DEV:
  SAVE_STEPS, EVAL_STEPS, LOGGING_STEPS = 100, 100, 100
else:
  SAVE_STEPS, EVAL_STEPS, LOGGING_STEPS = 1000, 250, 500
EARLY_STOPPING_PATIENCE = 3
EARLY_STOPPING_THRESHOLD = 0.005


## Dataset setup

In [None]:
# Load the bill-summarization dataset; later cells index its "train" and "val" splits.
dataset = load_dataset("jordanfan/processed_us_congress_117_bills_v3")

# Dataset Exploration

Before modeling, there are three things I want to do:

1. Find the distribution of bill categories
2. Get an understanding for how long the text is for each bill
3. Determine how long the summaries are for each bill

### 1. Distribution of Bill Categories

In [None]:
# Bar plot of bill counts per policy area, sorted in descending order,
# with one subplot per dataset split.
if EXPLORE:
  # Derive the splits from the dataset itself so every split it contains is
  # handled, and count whole columns rather than indexing row by row.
  policy_areas = {
      split: Counter(dataset[split]["policy_areas"])
      for split in dataset.keys()
  }

  # A single figure for all splits; squeeze=False keeps `axes` two-dimensional
  # even when the dataset has only one split.
  fig, axes = plt.subplots(
      len(policy_areas), 1,
      figsize=(12, 5 * len(policy_areas)),
      squeeze=False,
      constrained_layout=True,
  )

  for ax, (split, counts) in zip(axes[:, 0], policy_areas.items()):
    ax.set_title(f"Policy Areas in {split} split")
    # most_common() yields (label, count) pairs ordered by descending count.
    ranked = counts.most_common()
    if not ranked:
      continue  # empty split: leave the titled axes blank
    labels = [str(label) for label, _ in ranked]  # str() also covers missing (None) areas
    values = [count for _, count in ranked]
    sns.barplot(ax=ax, x=labels, y=values, hue=labels, palette="viridis")
    ax.set_ylabel("Number of bills")
    # Each split is ranked independently, so every subplot keeps its own
    # (rotated) category labels.
    ax.tick_params(axis="x", labelrotation=90)

  plt.show()

### 2 & 3. Lengths of Bill Text and Summaries

In [None]:
# Whitespace word-count statistics for bill texts and summaries, per dataset split.
if EXPLORE:
  # NOTE(review): summaries are measured on "cur_summary", while training uses
  # LABEL_COLUMN ("cleaned_summary") — confirm which column should be reported.
  length_columns = {"text": "cur_text", "summary": "cur_summary"}

  bill_lengths = {}
  bill_stats = {}

  # Iterate over the splits the dataset actually has instead of assuming train/test.
  for split in dataset.keys():
    bill_lengths[split] = {}
    bill_stats[split] = {}

    for category, column in length_columns.items():
      values = list(dataset[split][column])
      # Skip missing (non-string) entries explicitly rather than swallowing
      # every exception, and report how many were skipped.
      lengths = [len(value.split()) for value in values if isinstance(value, str)]
      skipped = len(values) - len(lengths)
      if skipped:
        print(f"[{split}/{category}] skipped {skipped} non-text rows")

      bill_lengths[split][category] = lengths
      if not lengths:
        # No usable rows: leave the stats empty instead of dividing by zero.
        continue

      bill_stats[split][category] = {
          "mean": sum(lengths) / len(lengths),
          "max": max(lengths),
          "min": min(lengths),
          "std": np.std(lengths),
          "median": np.median(lengths),
      }

  for split, stats in bill_stats.items():
    print(f"{split}: {stats}")

# T5 Model

In [None]:
# Load the pretrained tokenizer and seq2seq model weights for CHECKPOINT.
tokenizer = T5Tokenizer.from_pretrained(CHECKPOINT)
model = T5ForConditionalGeneration.from_pretrained(CHECKPOINT)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


## Prepare Dataset

In [None]:
def preprocess_data_baseline(df):
  """Tokenize a batch of bills and their summaries for seq2seq training.

  Written for `dataset.map(..., batched=True)`.

  Args:
    df: Batch mapping column names to lists of values. Must contain the
      INPUT_COLUMN (bill text) and LABEL_COLUMN (summary) columns.

  Returns:
    The tokenizer output for the prefixed bill texts, with an added
    `labels` entry holding the token ids of the summaries.
  """
  # Prepend the summarization task prefix to every document.
  prefixed_docs = ["summarize: " + doc for doc in df[INPUT_COLUMN]]

  # Truncate only. Padding is left to DataCollatorForSeq2Seq, which pads each
  # training batch to its own longest sequence and fills label padding with
  # -100 so it is ignored by the loss. Padding here would bake pad-token ids
  # into `labels` (counted by the loss) and pad every row to the longest
  # sequence of the whole map batch.
  model_inputs = tokenizer(
      prefixed_docs,
      max_length=INPUT_MAX_LENGTH,
      truncation=True,
  )

  targets = tokenizer(
      text_target=list(df[LABEL_COLUMN]),
      max_length=LABEL_MAX_LENGTH,
      truncation=True,
  )

  model_inputs["labels"] = targets["input_ids"]
  return model_inputs

# Tokenize every split in batches.
tokenized_datasets = dataset.map(preprocess_data_baseline, batched=True)
# The collator expects the model object itself (not the checkpoint name): when
# given a model it uses it to derive decoder_input_ids from the labels.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

Map:   0%|          | 0/11277 [00:00<?, ? examples/s]

Map:   0%|          | 0/3388 [00:00<?, ? examples/s]

Map:   0%|          | 0/377 [00:00<?, ? examples/s]

## Train Model

### PEFT Configuration

In [None]:
def find_target_modules(model):
    """Collect the leaf names of every linear-type layer in `model`.

    A submodule counts as linear when its class name contains "Linear", so
    plain `Linear` layers and variants such as `Linear4bit` are both matched.
    Only the last component of the dotted module path is kept (for example
    "encoder.block.0.layer.0.SelfAttention.q" contributes "q").

    Note: output heads such as `lm_head` are linear layers too and are
    returned as well; filter the result if they should not be adapted.

    Args:
        model: Any object exposing `named_modules()` (e.g. a `torch.nn.Module`).

    Returns:
        A sorted list of unique leaf names, so the result is identical
        across runs (iterating a set of strings has no stable order).
    """
    unique_layers = set()

    for name, module in model.named_modules():
        # The root module is reported with an empty name; it has no leaf
        # name that could be targeted, so skip it.
        if not name:
            continue
        # Match on the class name only, not on the full module path.
        if "Linear" in type(module).__name__:
            unique_layers.add(name.rsplit(".", 1)[-1])

    return sorted(unique_layers)

# Names of the linear layers found in the base model; passed to LoraConfig as target_modules.
modules = find_target_modules(model)
print(modules)

['k', 'wo', 'v', 'wi', 'o', 'lm_head', 'q']


In [None]:

# LoRA adapter settings for the seq2seq model; adapters are attached to every
# layer name listed in `modules`.
peft_config = LoraConfig(
    r=16,               # adapter rank
    lora_alpha=32,      # adapter scaling factor
    lora_dropout=0.1,
    target_modules=modules,
    inference_mode=False,
    task_type=TaskType.SEQ_2_SEQ_LM,
)

# Rebinds `model` to the PEFT-wrapped model, then reports trainable vs. total parameters.
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

trainable params: 7,014,400 || all params: 229,917,952 || trainable%: 3.0508274534386945


### Training Configurations

In [None]:
# Early stopping on the metric chosen in the training arguments (eval_loss):
# configured with the patience and improvement threshold from the config cell.
# EarlyStoppingCallback is imported in the notebook's top import cell.
early_stopping = EarlyStoppingCallback(
    early_stopping_patience=EARLY_STOPPING_PATIENCE,
    early_stopping_threshold=EARLY_STOPPING_THRESHOLD,
)

In [None]:
# Training configuration. Every tunable value comes from the config cell, including
# LOGGING_STEPS, OPTIMIZER and RANDOM_SEED, which were defined there but never passed.
training_args = Seq2SeqTrainingArguments(
    output_dir=HUGGINGFACE_DIR,
    # Evaluation and checkpointing share one strategy, as required by
    # load_best_model_at_end.
    # NOTE(review): newer transformers releases rename `evaluation_strategy` to
    # `eval_strategy` — confirm against the installed version before changing.
    evaluation_strategy=EVAL_STRATEGY,
    save_strategy=EVAL_STRATEGY,
    save_steps=SAVE_STEPS,
    eval_steps=EVAL_STEPS,
    logging_steps=LOGGING_STEPS,
    optim=OPTIMIZER,
    learning_rate=LEARNING_RATE,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    weight_decay=WEIGHT_DECAY,
    load_best_model_at_end=True,
    metric_for_best_model='eval_loss',
    num_train_epochs=EPOCHS,
    seed=RANDOM_SEED,
    # predict_with_generate=True,
    fp16=True,
    report_to=["tensorboard"],
    push_to_hub=True,
)

In [None]:
# Assemble the trainer: PEFT-wrapped model, tokenized train/val splits,
# the seq2seq collator and the early-stopping callback.
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["val"],
    callbacks=[early_stopping],
    # compute_metrics=compute_metrics,
)

# Left as the cell's last expression so the returned training summary is displayed.
trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss,Validation Loss
250,No log,1.303691
500,1.671300,1.252544
750,1.671300,1.221709
1000,1.312900,1.206804
1250,1.312900,1.197818
1500,1.292000,1.185618
1750,1.292000,1.179381
2000,1.253000,1.17686
2250,1.253000,1.172513
2500,1.252900,1.169267




TrainOutput(global_step=2820, training_loss=1.342320413792387, metrics={'train_runtime': 3112.3725, 'train_samples_per_second': 7.247, 'train_steps_per_second': 0.906, 'total_flos': 1.422043519647744e+16, 'train_loss': 1.342320413792387, 'epoch': 2.0})

In [None]:
# Write the model card (README) metadata for the trained adapter.
trainer.create_model_card(
    # Hub card metadata expects an ISO 639 language code rather than a language name.
    language='en',
    model_name=MODEL_NAME,
    finetuned_from=CHECKPOINT,
    tasks='summarization',
    tags='summarization',
    dataset='jordanfan/processed_us_congress_117_bills_v3',
    dataset_args=f"Max token input: {INPUT_MAX_LENGTH} | {LABEL_MAX_LENGTH}"
)

In [None]:
# Display the configured run name as a sanity check before pushing.
CUSTOM_NAME

'T5-base-PEFT-1'

In [None]:

  commit_msg = f"training completed[dev]: {MODEL_NAME}"


trainer.push_to_hub(commit_message=commit_msg)



events.out.tfevents.1712276926.66d042a15750.4009.1:   0%|          | 0.00/9.94k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/jgibb/T5-PEFT-4/commit/e22e889642ecf05c1a37ad91aa31a4349f5d4f57', commit_message='training completed[dev]: T5-PEFT-4', commit_description='', oid='e22e889642ecf05c1a37ad91aa31a4349f5d4f57', pr_url=None, pr_revision=None, pr_num=None)

AttributeError: 'T5Config' object has no attribute 'to_json'