In [1]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoModelForSeq2SeqLM, Trainer, DataCollatorForLanguageModeling, TrainingArguments
import torch
from bert_score import score
import torch.nn as nn
from peft import LoraConfig, get_peft_model
from datasets import Dataset
import random
from transformers import DataCollatorForSeq2Seq

2025-05-15 23:40:47.851316: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-05-15 23:40:47.870486: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1747352447.892959 1020579 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1747352447.900043 1020579 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1747352447.918769 1020579 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 

In [2]:
# setup the server
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "4" 
device = "cuda"

In [3]:
# load the summarized data
df = pd.read_csv("summarize_text.csv")
# split the data into train test
train_df, test_df = train_test_split(df, test_size=0.1, random_state=1234)


### Base Model

In [4]:
checkpoint = "t5-small"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint).to(device)

In [5]:
style = "short"
max_length_map = {"tiny": 40, "short": 170, "long": 400}
summary_col = f"summary_{style}"

In [6]:
# base generator
base_gen = {
    "temperature": 0.1,
    "top_p": 0.9,
    "do_sample": True,
    "pad_token_id": tokenizer.eos_token_id,
    "eos_token_id": tokenizer.eos_token_id
}

def query_lm(model, tokenizer, text, style, gen_config, max_length_map):
    cfg = gen_config.copy()
    cfg["max_new_tokens"] = max_length_map[style]
    prompt = f"summarize:{style} >> {text}"
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, padding=True).to(device)
    with torch.no_grad():
        out = model.generate(**inputs, **cfg)
    return tokenizer.decode(out[0], skip_special_tokens=True)

In [7]:
texts = test_df["combined_cleaned_whisper_text"].tolist()
base_outputs = [
    query_lm(model, tokenizer, txt, style, base_gen, max_length_map)
    for txt in texts
]

## Fine Tuning

In [8]:
for param in model.parameters():
    # requires_grad will stop any gradients from being computed for that parameter and will stay frozen during backwards passes
    param.requires_grad = False 
    
    # changing 1d parameters such as biases to high precision may help with stability
    if param.ndim == 1:
        param.data = param.data.to(torch.float32)

# here helps the memory
model.gradient_checkpointing_enable() 

# ensures that the final output logits of the model are full precision
class CastOutputToFloat(nn.Sequential):
    def forward(self, x): return super().forward(x).to(torch.float32)
model.lm_head = CastOutputToFloat(model.lm_head)

In [9]:
# config the lora based on the model, set the task to seq2seq because it's summarization task
config = LoraConfig(
    r = 8,
    # lora_alpha=16,
    target_modules=["q","k","v","o"],
    lora_dropout=0.05,
    bias="none",
    task_type="SEQ_2_SEQ_LM"
)
# set this to false for training
config.inference_mode = False

In [10]:
# get_peft_model is from the peft huggingface
model = get_peft_model(model, config)

ft_text = [f"Summarize the following text: {text}" for text in train_df['combined_cleaned_whisper_text'].tolist()]
ft_summary = [summary for summary in train_df[summary_col].tolist()]

ts_dict = {"text": train_df['combined_cleaned_whisper_text'].tolist(), "summary": train_df[summary_col].tolist()}
train_dataset = Dataset.from_dict(ts_dict)

In [11]:
def prepare_and_tokenize(example):
    cleaned_text = example["text"]
    target_summary = example["summary"]
    
    # create an input text for the prompt
    input_text = f"summarize:{style} >> {cleaned_text}"
    
    # tokenize the text input
    model_inputs = tokenizer(input_text, truncation=True, padding="max_length", max_length=1024)
    
    # tokenize the target_summary to update the model
    labels = tokenizer(target_summary, truncation=True, padding="max_length", max_length=256)

    # set labels and replace padding token id with -100. if there is no label, then set it to -100 to avoid the cross entropy loss, so that the model won't be confused
    model_inputs["labels"] = labels["input_ids"]
    model_inputs["labels"] = [label if label != tokenizer.pad_token_id else -100 for label in model_inputs["labels"]]

    # pop the num_items_in_batch because it always cause error when running the train trainer
    model_inputs.pop("num_items_in_batch", None)

    return model_inputs

In [12]:
# mapped the train dataset
mapped_train_dataset = train_dataset.map(prepare_and_tokenize, batched=False, remove_columns=['text', 'summary'])

Map:   0%|          | 0/87 [00:00<?, ? examples/s]

In [13]:
# set seed for reproducibility
SEED = 0
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

In [14]:
# create datacollator for seq2seq because the purpose of the model is to do summarization
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, label_pad_token_id=-100)

from transformers import Trainer

class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        # Strip out num_items_in_batch if it ends up in inputs too
        inputs = {k: v for k, v in inputs.items() if k != 'num_items_in_batch'}
        return super().compute_loss(model, inputs, return_outputs)

trainer = CustomTrainer(
    model=model.to(device),
    train_dataset=mapped_train_dataset,
    args=TrainingArguments(
        per_device_train_batch_size=4,
        gradient_accumulation_steps=1,
        warmup_steps=10, 
        num_train_epochs=40,
        learning_rate=1e-3,
        fp16=True,
        logging_steps=5,
        output_dir=f'lora_model_{summary_col}',
    ),
    data_collator=data_collator
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
No label_names provided for model class `PeftModelForSeq2SeqLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [15]:
trainer.train()

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Step,Training Loss
5,5.4014
10,5.3164
15,4.2974
20,3.5047
25,3.0723
30,3.3457
35,3.1171
40,3.0114
45,2.8464
50,2.9119


TrainOutput(global_step=880, training_loss=2.1878400260751896, metrics={'train_runtime': 191.4611, 'train_samples_per_second': 18.176, 'train_steps_per_second': 4.596, 'total_flos': 954590035968000.0, 'train_loss': 2.1878400260751896, 'epoch': 40.0})

In [16]:
# check whether the weights has changed
model.config.use_cache = True
model.eval()

PeftModelForSeq2SeqLM(
  (base_model): LoraModel(
    (model): T5ForConditionalGeneration(
      (shared): Embedding(32128, 512)
      (encoder): T5Stack(
        (embed_tokens): Embedding(32128, 512)
        (block): ModuleList(
          (0): T5Block(
            (layer): ModuleList(
              (0): T5LayerSelfAttention(
                (SelfAttention): T5Attention(
                  (q): lora.Linear(
                    (base_layer): Linear(in_features=512, out_features=512, bias=False)
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.05, inplace=False)
                    )
                    (lora_A): ModuleDict(
                      (default): Linear(in_features=512, out_features=8, bias=False)
                    )
                    (lora_B): ModuleDict(
                      (default): Linear(in_features=8, out_features=512, bias=False)
                    )
                    (lora_embedding_A): ParameterDict()
              

In [17]:
lora_outputs=[query_lm(model, tokenizer, txt, style, base_gen, max_length_map) for txt in texts]

In [18]:
# use the rogue metrics to check whether the LoRA has improved the summary or not
import evaluate
rouge = evaluate.load("rouge")

# compute the rogue score for the output generated from small model and fine-tuned with the summary generated by a larger model. In this data, I am using the Llama larger model to generate the summary
def compute_rouge(output, reference):
    scores = rouge.compute(predictions=[output], references=[reference])
    return scores

In [19]:
# set the test_summary, which is taking the summary from the test data generated by the larger model
test_summary = test_df[summary_col]

base_score = []
lora_score = []

# use for loop to test the output from the base model and lora, compared with the true summary
for i in range(len(test_summary)):
    true_summary = test_summary.iloc[i]
    
    base_rogue = compute_rouge(base_outputs[i], true_summary)
    lora_rogue = compute_rouge(lora_outputs[i], true_summary)
    
    base_score.append(base_rogue)
    lora_score.append(lora_rogue)

In [20]:
# get the average for all test data
def average_rouge(scores_list):
    keys = scores_list[0].keys()  # Extract the metric names
    avg_scores = {k: sum(d[k] for d in scores_list) / len(scores_list) for k in keys}
    return avg_scores

avg_base_rouge = average_rouge(base_score)
avg_lora_rouge = average_rouge(lora_score)

# print the average score for each model
print("Avg ROUGE Score:")
print("Base Model:", avg_base_rouge)
print("LoRA Model:", avg_lora_rouge)

Avg ROUGE Score:
Base Model: {'rouge1': 0.33783717467336977, 'rouge2': 0.09105381229843086, 'rougeL': 0.19645021103054205, 'rougeLsum': 0.19645021103054205}
LoRA Model: {'rouge1': 0.41586075011908996, 'rouge2': 0.13463764233993725, 'rougeL': 0.27137701372945117, 'rougeLsum': 0.27137701372945117}


In [21]:
lora_outputs

['The room was fragrant with all the riches of greek thought and song since the days when Tolomea Philadelphia walked there with Euclid and theocritus Kalamakis and Lycafron, and the only movables in it were a sofa bed, table and armchair, all of delicate and graceful forms. We had no idea that the room was rich enough for human eyes for the sake of one treasure which it possessed and beside which nothing was worth a moment to clients. The room was fragrant with curling greek lips, ripe curling greek thought and song, and ripe, and ripe, and ripe, and a patia to be welcomed into the celestial ranks of the heroic to rise to the',
 'Cold Thomas Thomas opened a large restaurant in oferrall street above Fillmore, and for two years did a thriving business. Thomas opened a restaurant called the Del Monte in Powell Street near market, but it was too early for success and closed after a short career here. Thomas opened a restaurant called the Del Monte in Powell Street, near market, and opened