In [None]:
# Mount Google Drive; later cells read files from under /content/drive.
from google.colab import drive

drive.mount(mountpoint="/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install pydot --quiet
!pip install tensorflow==2.15.0 --quiet
!pip install transformers --quiet
!pip install sentencepiece --quiet
!pip install datasets --quiet
!pip install accelerate --quiet
!pip install transformers[torch] --quiet
!pip install evaluate --quiet
!pip install rouge_score --quiet
!pip install jiwer --quiet
!pip install git+https://github.com/google-research/bleurt.git --quiet


  Preparing metadata (setup.py) ... [?25l[?25hdone


In [None]:
import transformers
from transformers import DataCollatorForSeq2Seq, AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
from datasets import load_dataset, load_from_disk
import numpy as np
import pandas as pd
import accelerate
import torch
from torch.utils.data import Dataset
from evaluate import load
# Global pandas display setting for column width.
# NOTE(review): presumably meant to show long text columns untruncated — confirm
# that a value of 0 has that effect on the installed pandas version.
pd.set_option('display.max_colwidth', 0)

In [None]:
# Notebook-wide configuration. Everything a reader might tune lives in this cell.

# ====== ENVIRONMENT ======
# DEV switches the save/eval/logging intervals below to 100 steps.
DEV = False
# NOTE(review): EXPLORE is not referenced in the cells visible here.
EXPLORE = False
RANDOM_SEED = 42

# ====== DATA =======
# Selects the input column: "extracted_text" when 500, otherwise
# f"extracted_text_{EXTRACTED_TEXT_LEN}" (see preprocess_data_extracted).
EXTRACTED_TEXT_LEN = 1000
# Tokenizer max_length (with truncation) for the input text and the summary.
INPUT_MAX_LENGTH = 1024
LABEL_MAX_LENGTH = 128

# ====== MODEL ======
# Checkpoint name used for both the tokenizer and the seq2seq model.
CHECKPOINT = "facebook/bart-large"
# NOTE(review): PATH and PEFT are not referenced in the cells visible here; PATH
# also differs from the directory the credentials file is read from.
PATH = '/content/drive/MyDrive/colab-notebooks/W266/'
PEFT = False

# ====== OPTIMIZER =======
# NOTE(review): OPTIMIZER is not passed to the training arguments in the cells
# visible here — confirm whether it is meant to be used.
OPTIMIZER = "adamw_bnb_8bit"
LEARNING_RATE = 2e-5
WEIGHT_DECAY = 0.01

# ====== TRAINING ======
# EVAL_STRATEGY is used for both the evaluation and the save strategy.
EVAL_STRATEGY = "steps"
# Used as both the per-device train and eval batch size.
BATCH_SIZE = 6
EPOCHS = 2
SAVE_STEPS = 100 if DEV else 1000
EVAL_STEPS = 100 if DEV else 250
LOGGING_STEPS = 100 if DEV else 500
# Metric objects are loaded once here and reused by compute_metrics.
METRIC_WER = load("wer")
METRIC_ROUGE = load("rouge")
METRIC_BLEURT = load('bleurt', module_type = 'metric', checkpoint = "bleurt-base-512")



In [None]:
# Read the Hugging Face access token from Drive instead of hardcoding it.
# The content is stripped so that a trailing newline or stray whitespace in the
# file does not end up inside the token that is sent to the Hub.
with open("/content/drive/MyDrive/Colab Notebooks/W266/huggingface_credentials.txt", "r", encoding="utf-8") as f:
    HF_TOKEN = f.read().strip()


In [None]:
# Load the processed bills dataset from the Hugging Face Hub; later cells index
# the result by split name ("train", "val", "test").
df = load_dataset(path="jordanfan/processed_us_congress_117_bills_v2")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
# Tokenizer matching the model checkpoint; shared by preprocessing and metrics.
tokenizer = AutoTokenizer.from_pretrained(CHECKPOINT)


def model_init():
  """Return a freshly loaded seq2seq model for the Trainer.

  The torch RNG is seeded with the notebook-wide RANDOM_SEED (instead of a
  second hard-coded value) before the checkpoint is loaded, so the seed used
  here cannot drift from the one declared in the configuration cell.
  """
  torch.manual_seed(RANDOM_SEED)
  return AutoModelForSeq2SeqLM.from_pretrained(CHECKPOINT)


In [None]:
def preprocess_data_extracted(df):
  """Tokenize a batch of examples into model inputs and labels.

  Args:
    df: a batch (column name -> list of values) as passed by `map(batched=True)`.
      Must contain "cleaned_summary" and the extracted-text column selected by
      EXTRACTED_TEXT_LEN.

  Returns:
    The tokenizer output for the input text with an added "labels" entry holding
    the tokenized summaries. Padded label positions are set to -100 so they are
    ignored by the loss instead of being trained on as pad tokens.
  """
  # The 500-character extraction lives in the unsuffixed column.
  if EXTRACTED_TEXT_LEN == 500:
    text_column = "extracted_text"
  else:
    text_column = f"extracted_text_{EXTRACTED_TEXT_LEN}"
  documents = list(df[text_column])

  # Tokenize the source text.
  model_inputs = tokenizer(documents, max_length=INPUT_MAX_LENGTH, padding=True, truncation=True)

  # Tokenize the summaries.
  summaries = list(df["cleaned_summary"])
  targets = tokenizer(text_target=summaries, max_length=LABEL_MAX_LENGTH, padding=True, truncation=True)

  # Mask padding in the labels. The attention mask is used (rather than comparing
  # against the pad id) so that only true padding positions are replaced.
  model_inputs['labels'] = [
      [token_id if keep else -100 for token_id, keep in zip(ids, mask)]
      for ids, mask in zip(targets['input_ids'], targets['attention_mask'])
  ]
  return model_inputs


In [None]:
# Tokenize every split; the preprocessing function receives batches of examples.
tokenized_data_extracted = df.map(
    function=preprocess_data_extracted,
    batched=True,
)

Map:   0%|          | 0/3388 [00:00<?, ? examples/s]

In [None]:
# Pads inputs and labels per training batch. The `model` argument expects a model
# instance (not a checkpoint name string); a string has no
# `prepare_decoder_input_ids_from_labels` and is silently ignored, so it is
# omitted here. Decoder inputs are then derived from the labels by the model.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer)

In [None]:
def compute_metrics(pred):
    """Compute ROUGE, WER and BLEURT for one evaluation pass.

    Args:
        pred: evaluation output whose `predictions[0]` holds predicted token ids
            (as returned by preprocess_logits_for_metrics) and whose `label_ids`
            holds reference token ids, with -100 on ignored positions.

    Returns:
        dict of metric name -> value rounded to 4 decimals: the aggregated ROUGE
        scores, "WER", and "bleurt" (mean BLEURT score over all examples).
    """
    pad_id = tokenizer.pad_token_id

    # -100 cannot be decoded; swap it for the pad token, which is dropped by
    # skip_special_tokens. np.where builds new arrays, so `pred` is not mutated.
    label_ids = np.where(pred.label_ids != -100, pred.label_ids, pad_id)
    pred_ids = pred.predictions[0]
    pred_ids = np.where(pred_ids != -100, pred_ids, pad_id)

    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    results = dict(
        METRIC_ROUGE.compute(predictions=pred_str, references=label_str, use_stemmer=True, use_aggregator=True)
    )
    results["WER"] = METRIC_WER.compute(predictions=pred_str, references=label_str)

    # BLEURT returns one score per example; report the mean over the whole set
    # rather than the score of the first example only.
    bleurt_scores = METRIC_BLEURT.compute(predictions=pred_str, references=label_str)["scores"]
    results["bleurt"] = float(np.mean(bleurt_scores))

    return {k: round(v, 4) for k, v in results.items()}

def preprocess_logits_for_metrics(logits, labels):
    """Reduce raw model outputs to predicted token ids before they are accumulated.

    Keeping only the argmax ids avoids storing full vocabulary-sized logits for
    every evaluation batch.

    Args:
        logits: either the logits tensor itself, or a tuple of model outputs
            whose first element is the logits tensor.
        labels: reference label ids, passed through unchanged.

    Returns:
        (pred_ids, labels), where pred_ids is the argmax over the last dimension.
    """
    # Only unwrap when the outputs really are a tuple; indexing a bare tensor
    # with [0] would silently drop everything but the first batch element.
    if isinstance(logits, (tuple, list)):
        logits = logits[0]
    pred_ids = torch.argmax(logits, dim=-1)
    return pred_ids, labels

In [None]:
# Training configuration. Values come from the configuration cell so that the DEV
# switch and the seed declared there actually take effect.
# NOTE(review): OPTIMIZER from the configuration cell is not passed here (no
# `optim=`), so the optimizer is whatever the training arguments default to.
# NOTE(review): `predict_with_generate` is not enabled; evaluation metrics are
# computed from the argmax of the logits (see preprocess_logits_for_metrics),
# not from generated summaries.
args = Seq2SeqTrainingArguments(
    output_dir=f"jordanfan/bart_extractive_{INPUT_MAX_LENGTH}_{EXTRACTED_TEXT_LEN}",
    evaluation_strategy=EVAL_STRATEGY,
    save_strategy=EVAL_STRATEGY,
    save_steps=SAVE_STEPS,
    eval_steps=EVAL_STEPS,
    logging_steps=LOGGING_STEPS,
    learning_rate=LEARNING_RATE,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    weight_decay=WEIGHT_DECAY,
    load_best_model_at_end=True,
    num_train_epochs=EPOCHS,
    seed=RANDOM_SEED,
    fp16=True,
    report_to=["tensorboard"],
    push_to_hub=True,
    hub_token=HF_TOKEN,
)

# model_init (rather than a model instance) lets the Trainer build the model itself.
trainer = Seq2SeqTrainer(
    model_init=model_init,
    args=args,
    train_dataset=tokenized_data_extracted["train"],
    eval_dataset=tokenized_data_extracted["val"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    preprocess_logits_for_metrics=preprocess_logits_for_metrics,
)

trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Wer,Bleurt
250,No log,1.136153,0.6713,0.4064,0.6113,0.6111,0.4774,-0.1118
500,2.045400,1.033698,0.6869,0.4301,0.6289,0.6288,0.4555,-0.1734
750,2.045400,1.000227,0.7017,0.4465,0.6435,0.6434,0.4467,-0.357
1000,1.098700,0.974688,0.7008,0.4469,0.6423,0.6422,0.442,-0.0679
1250,1.098700,0.958931,0.7092,0.456,0.6521,0.652,0.4363,0.2669
1500,1.041800,0.95512,0.704,0.4538,0.6486,0.6485,0.4343,-0.1447
1750,1.041800,0.931632,0.7096,0.4605,0.6546,0.6544,0.4285,-0.0465
2000,1.003100,0.915011,0.7129,0.4653,0.6584,0.6583,0.4255,-0.1069
2250,1.003100,0.909404,0.7119,0.4658,0.6577,0.6576,0.4234,-0.4062
2500,0.905200,0.910144,0.721,0.4736,0.6665,0.6664,0.4206,0.2201


Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}
Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}
Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}
There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].


TrainOutput(global_step=3760, training_loss=1.1074390979523354, metrics={'train_runtime': 6329.0644, 'train_samples_per_second': 3.564, 'train_steps_per_second': 0.594, 'total_flos': 4.887687719603405e+16, 'train_loss': 1.1074390979523354, 'epoch': 2.0})

In [None]:
# Publish the trained model together with its model card.
# `Trainer.push_to_hub` regenerates the model card itself from the keyword
# arguments it receives, so the card metadata is passed to it directly; a card
# written by a separate `create_model_card` call beforehand would be overwritten.
# The "[dev]" marker is only added to the commit message for DEV runs.
commit_msg = f"training completed{'[dev]' if DEV else ''}: {INPUT_MAX_LENGTH} {LABEL_MAX_LENGTH}"
trainer.push_to_hub(
    commit_message=commit_msg,
    language="en",  # model card metadata expects an ISO language code
    model_name=f"bart-extractive-{INPUT_MAX_LENGTH}-{EXTRACTED_TEXT_LEN}",
    tasks="summarization",
    tags="summarization",
    dataset="jordanfan/processed_us_congress_117_bills_v2",
    dataset_args=f"Max token input: {INPUT_MAX_LENGTH} | {LABEL_MAX_LENGTH}",
)

Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


events.out.tfevents.1710869501.79eef9fcf5d9.12590.0:   0%|          | 0.00/16.2k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/jordanfan/bart_extractive_1024_1000/commit/4f35d3348c2660e860f79b0bdfea37bb5b21383a', commit_message='training completed[dev]: 1024 128', commit_description='', oid='4f35d3348c2660e860f79b0bdfea37bb5b21383a', pr_url=None, pr_revision=None, pr_num=None)

In [None]:
# Run the trained model on the held-out test split and persist the metrics under
# a split name that encodes the input length and extraction length.
predict_results = trainer.predict(test_dataset=tokenized_data_extracted["test"])
trainer.save_metrics(
    split=f"test_extractive_{INPUT_MAX_LENGTH}_{EXTRACTED_TEXT_LEN}",
    metrics=predict_results.metrics,
)