## Libraries

In [1]:
%%capture
%pip install -U bitsandbytes
%pip install -U transformers
%pip install -U accelerate
%pip install -U transformers accelerate
%pip install -U peft
%pip install -U trl
%pip install GPUtil
%pip install evaluate
%pip install rouge_score

In [None]:
import os
import json
import numpy as np
import pandas as pd
from datetime import datetime
from tqdm import tqdm
import matplotlib.pyplot as plt
import shutil
from IPython.display import FileLink, display

from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
import bitsandbytes as bnb
from evaluate import load as load_metric
from datasets import DatasetDict, Dataset, load_dataset
from peft import LoraConfig, PeftConfig, get_peft_model, TaskType
from trl import setup_chat_format, SFTTrainer
import transformers
from transformers import (AutoTokenizer, 
                          AutoModelForCausalLM,
                          AutoModelForSequenceClassification,
                          BitsAndBytesConfig,
                          Trainer,
                          TrainingArguments, 
                          pipeline, 
                          logging,
                          DataCollatorWithPadding)

from huggingface_hub import login as hf_login
from kaggle_secrets import UserSecretsClient
import wandb

## Config

In [27]:
# Load the config file
with open('/kaggle/input/config/config.json', 'r') as f:
    config = json.load(f)

file_path = config["data_loc"]

data_loc = "/kaggle/input/"
model_path = "meta-llama/Llama-3.2-3B-Instruct"
output_dir="llama-3.2-fine-tuned-model"
EXPERIMENTS_FILE = "experiment_results.csv"

# Tokens
user_secrets = UserSecretsClient()
access_token = user_secrets.get_secret("hf_read_token_access")
write_access_token = user_secrets.get_secret("hf_write_token_access")
wb_access_token = user_secrets.get_secret("wanda_token")
hf_login(write_access_token)


wandb.login(key=wb_access_token)
run = wandb.init(
    project='Fine-tune Llama-3.2-3B-Instruct for QTL Title Generation', 
    job_type="training", 
    anonymous="allow"
)

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mgabriel-ferreira[0m ([33mgabriel-ferreira-iowa-state-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


In [4]:
# Define reusable template
prompt_template = """
You are an expert at writing concise and informative research paper titles based on abstracts.

Given the abstract below, generate only a clear, accurate, and concise title that best reflects the core idea of the abstract. I only need the title from you with no more than 45 words.

Abstract: {abstract}

Title: {title}
""".strip()

# Generation for training (with known title)
def generate_prompt(data_point):
    return prompt_template.format(abstract=data_point["Abstract"], title=data_point["Title"])

# Generation for testing (no title provided)
def generate_test_prompt(data_point):
    return prompt_template.format(abstract=data_point["Abstract"], title="")

# Define text preprocessing
def preprocess_function(example):
    tokens = tokenizer(example['text'], truncation=True, padding='max_length', max_length=256)
    return tokens
    
def predict(test_data, model, tokenizer, max_new_tokens=45, temperature=0.1, batch_size=8):
    # Convert to Hugging Face Dataset if it's a DataFrame
    if isinstance(test_data, pd.DataFrame):
        test_data = Dataset.from_pandas(test_data)

    prompts = test_data

    pipe = pipeline(
        task="text-generation",
        model=model,
        tokenizer=tokenizer,
        max_new_tokens=max_new_tokens,
        temperature=temperature,
        device_map="auto"
    )

    y_pred = []
    for i in tqdm(range(0, len(prompts), batch_size), desc="Generating Titles"):
        batch = prompts[i:i + batch_size]
        results = pipe(batch)
        for r in results:
            text = r[0]["generated_text"] if isinstance(r, list) else r["generated_text"]
            title = text.split("Title: ")[-1].split("\n")[0].strip()
            y_pred.append(title)


    return y_pred

def evaluate(y_true, y_pred):
    from evaluate import load as load_metric

    bleu = load_metric("bleu")
    rouge = load_metric("rouge")

    bleu_eval = bleu.compute(predictions=y_pred, references=y_true)
    rouge_eval = rouge.compute(predictions=y_pred, references=y_true)

    required_metrics = {
        "BLEU": round(bleu_eval["bleu"], 4),
        "ROUGE-2": round(rouge_eval["rouge2"], 4),
        "ROUGE-L": round(rouge_eval["rougeL"], 4)
    }

    return required_metrics

def format_experiment_metrics(experiment_metadata, metrics):
    return pd.DataFrame([experiment_metadata | metrics])

def save_experiment(experiment_metadata, metrics, path=EXPERIMENTS_FILE, mode="append"):
    new_record = format_experiment_metrics(experiment_metadata, metrics)

    if mode == "visualize":
        pass
    elif mode == "overwrite" or not os.path.exists(path):
        new_record.to_csv(path, index=False)
        print(f"Experiment saved to {path}")
    elif mode == "append":
        existing = pd.read_csv(path)
        new_record = pd.concat([existing, new_record], ignore_index=True)
        new_record.to_csv(path, index=False)
        print(f"Experiment saved to {path}")
    else:
        raise ValueError("Mode must be 'append' or 'overwrite'")    

    return new_record
    
def find_all_linear_names(model):
    cls = bnb.nn.Linear4bit
    lora_module_names = set()
    for name, module in model.named_modules():
        if isinstance(module, cls):
            names = name.split('.')
            lora_module_names.add(names[0] if len(names) == 1 else names[-1])
    if 'lm_head' in lora_module_names:  # needed for 16 bit
        lora_module_names.remove('lm_head')
    return list(lora_module_names)

## Preparing Dataset

In [37]:
# Data Location Path
data_loc = "/kaggle/input/579nlp-project2"

# Test Set
file_name = "test_unlabeled.tsv"
final_path = os.path.join(data_loc, file_name) 
df_test = pd.read_csv(final_path, sep='\t').drop(columns=['PMID'])
print(f"The test set has {df_test.shape[0]} observations and {df_test.shape[1]} columns.\n")

# Train Set
file_name = "QTL_text.json"
final_path = os.path.join(data_loc, file_name) 
df_train = pd.read_json(final_path)
df_train = df_train.drop(columns=['Journal', 'PMID'])
# df_train = df_train[0:100]
print(f"The train set has {df_train.shape[0]} observations and {df_train.shape[1]} columns.\n")

# Prepare Input Text
df_train['input_text'] = df_train.apply(lambda x: generate_prompt(x), axis=1).tolist()
df_test['input_text'] = df_test.apply(lambda x: generate_test_prompt(x), axis=1).tolist()

# Split train and validation
X_train, X_val = train_test_split(df_train, test_size=.2, random_state=42)

# Set test
X_test = pd.DataFrame(df_test['input_text'], columns=["input_text"])
y_test = df_test['Title']

The test set has 1097 observations and 3 columns.

The train set has 11278 observations and 3 columns.



In [38]:
# Training Data
train_data = {"text": X_train['input_text']}
train_dataset = Dataset.from_dict(train_data)

# Validation Data
val_data = {"text": X_val['input_text']}
val_dataset = Dataset.from_dict(val_data)

# Test Data
test_data = {"text": X_test['input_text'], "titles": y_test.astype(str).tolist()}
test_dataset = Dataset.from_dict(test_data)

# Dataset Dictionary 
dataset_dict = DatasetDict({
    "train": train_dataset,
    "validation": val_dataset,
    "test": test_dataset
})

dataset_dict

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 9022
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 2256
    })
    test: Dataset({
        features: ['text', 'titles'],
        num_rows: 1097
    })
})

In [7]:
print(dataset_dict['train']['text'][0])

You are an expert at writing concise and informative research paper titles based on abstracts.

Given the abstract below, generate only a clear, accurate, and concise title that best reflects the core idea of the abstract. I only need the title from you with no more than 45 words.

Abstract: Reprogramming of adipocyte function in obesity is implicated in metabolic disorders like type 2 diabetes. Here, we used the pig, an animal model sharing many physiological and pathophysiological similarities with humans, to perform in-depth epigenomic and transcriptomic characterization of pure adipocyte fractions. Using a combined DNA methylation capture sequencing and Reduced Representation bisulfite sequencing (RRBS) strategy in 11 lean and 12 obese pigs, we identified in 3529 differentially methylated regions (DMRs) located at close proximity to-, or within genes in the adipocytes. By sequencing of the transcriptome from the same fraction of isolated adipocytes, we identified 276 differentially

In [8]:
print(dataset_dict['test']['text'][0])

You are an expert at writing concise and informative research paper titles based on abstracts.

Given the abstract below, generate only a clear, accurate, and concise title that best reflects the core idea of the abstract. I only need the title from you with no more than 45 words.

Abstract: Porcine circovirus type 3 (PCV3) is regularly reported in association with various clinical presentations, including porcine dermatitis and nephropathy syndrome (PDNS)-like lesions, respiratory signs, congenital tremor, and reproductive disorders. To investigate the epidemiology of PCV3 in a boar stud, we analysed fresh boar semen and matching sera from 181 boars from a German stud  supplying semen for artificial insemination (AI) to approximately 740 breeder farms for PCV3 DNA. PCV3 DNA was detected in 1.7% semen samples and 24.3% sera. Spearman rho correlation demonstrated a significant positive correlation between  boar age and quantitative DNA (by PCR quantification cycles [Cq] values) in serum

## Load Model

In [50]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=False,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype="float16",
)

model = AutoModelForCausalLM.from_pretrained(
    model_path,
    device_map="auto",
    torch_dtype="float16",
    quantization_config=bnb_config, 
)

model.config.use_cache = False
model.config.pretraining_tp = 1

tokenizer = AutoTokenizer.from_pretrained(model_path)

tokenizer.pad_token_id = tokenizer.eos_token_id

pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    torch_dtype=torch.float16,
    device_map="auto",
    token=access_token,
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use cuda:0


In [39]:
# Preprocess all datasets
tokenized_data = dataset_dict.map(preprocess_function, batched=True)

Map:   0%|          | 0/9022 [00:00<?, ? examples/s]

Map:   0%|          | 0/2256 [00:00<?, ? examples/s]

Map:   0%|          | 0/1097 [00:00<?, ? examples/s]

## Model Demonstration - Generation

In [11]:
# Sample a prompt
n=4
prompt = X_test['input_text'][n]

# print(prompt)
outputs = pipe(prompt, max_new_tokens=45, do_sample=True, pad_token_id=tokenizer.eos_token_id)
print(outputs[0]["generated_text"])

You are an expert at writing concise and informative research paper titles based on abstracts.

Given the abstract below, generate only a clear, accurate, and concise title that best reflects the core idea of the abstract. I only need the title from you with no more than 45 words.

Abstract: BACKGROUND: Acute or chronic irreversible respiratory failure may occur in patients undergoing pneumonectomy. Aim of this study was to determine transcriptome expression changes after experimental pneumonectomy in swine model. Experimental left pneumonectomy was performed in five pigs under general anaesthesia. Both the resected and the remaining lung, after 60 post-operative completely uneventful days, underwent genome-wide bulk RNA-Sequencing (RNA-Seq). RESULTS: Histological analysis showed dilation of air spaces and rupture of interalveolar septa. In addition, mild inflammation, no fibrosis, radial stretch  of the bronchus, strong enlargement of airspaces and thinning of the blood supply were ob

In [12]:
pred = [outputs[0]["generated_text"].split("Title: ")[-1].split(" \n")[0]]
pred

[' Transcriptome expression changes after experimental pneumonectomy in a swine model: A study on the pulmonary response to lung resection.']

In [13]:
reference =[ y_test[n]]
reference

['Genome-wide expression of the residual lung reacting to experimental Pneumonectomy.']

In [14]:
# Evaluate
metrics = evaluate(reference, pred)
metrics

Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

{'BLEU': 0.0, 'ROUGE-2': 0.0714, 'ROUGE-L': 0.2}

## Evaluate Baseline Model

In [None]:
# Start time
start_time = datetime.now()

# Run predictions
y_pred = predict(tokenized_data['test']['text'], model, tokenizer)
y_reference = tokenized_data['test']['titles']

# End time
end_time = datetime.now()

# Calculate duration
inference_duration = round((end_time - start_time).total_seconds() / 60, 2)

# Evaluate
metrics = evaluate(y_reference, y_pred)

Device set to use cuda:0
Generating Titles:   7%|▋         | 10/138 [02:48<35:53, 16.83s/it]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
Generating Titles:  64%|██████▍   | 88/138 [24:21<13:41, 16.44s/it]

In [None]:
# Experiment configuration
fine_tuned = "No"
comments = "Baseline Model"

experiment_metadata = {
    "Start Time": start_time.strftime("%Y-%m-%d %H:%M:%S"),
    "End Time": end_time.strftime("%Y-%m-%d %H:%M:%S"),
    "Inference Duration (min)": inference_duration,
    "Model": model_path,
    "Tokenizer": tokenizer.name_or_path,
    "Fine-Tuned": fine_tuned,
    "Test Size": len(y_reference),
    "Prompt Template": prompt_template,
    "Comments": comments
}

# Save the experiment
save_experiment(experiment_metadata, metrics, mode="overwrite") # mode={overwrite, append, visualize}

## Model Fine-Tunning

In [22]:
modules = find_all_linear_names(model)
modules

['down_proj', 'v_proj', 'q_proj', 'o_proj', 'gate_proj', 'up_proj', 'k_proj']

In [51]:
training_arguments = TrainingArguments(
    output_dir=output_dir,                    # directory to save and repository id
    num_train_epochs=1,                       # number of training epochs
    per_device_train_batch_size=1,            # batch size per device during training
    gradient_accumulation_steps=8,            # number of steps before performing a backward/update pass
    gradient_checkpointing=True,              # use gradient checkpointing to save memory
    optim="paged_adamw_32bit",
    logging_steps=1,                         
    learning_rate=2e-4,                       # learning rate, based on QLoRA paper
    weight_decay=0.001,
    fp16=True,
    bf16=False,
    max_grad_norm=0.3,                        # max gradient norm based on QLoRA paper
    max_steps=-1,
    warmup_ratio=0.03,                        # warmup ratio based on QLoRA paper
    group_by_length=False,
    lr_scheduler_type="cosine",               # use cosine learning rate scheduler
    report_to="wandb",                  # report metrics to w&b
    eval_strategy="steps",              # save checkpoint every epoch
    eval_steps = 0.2
)

peft_config = LoraConfig(
    lora_alpha=16,
    lora_dropout=0,
    r=64,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=modules,
)

trainer = SFTTrainer(
    model=model,
    args=training_arguments,
    train_dataset=tokenized_data['train'],
    eval_dataset=tokenized_data['validation'],
    peft_config=peft_config
)

Truncating train dataset:   0%|          | 0/9022 [00:00<?, ? examples/s]

Truncating eval dataset:   0%|          | 0/2256 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [52]:
trainer.train()

Step,Training Loss,Validation Loss
226,1.6387,1.734248
452,1.575,1.716754
678,1.6368,1.704379
904,1.5364,1.697459


TrainOutput(global_step=1127, training_loss=1.7208292630254005, metrics={'train_runtime': 14455.8057, 'train_samples_per_second': 0.624, 'train_steps_per_second': 0.078, 'total_flos': 4.038248661359002e+16, 'train_loss': 1.7208292630254005})

## Evaluate Fine-Tuned Model 

In [53]:
# Start time
start_time = datetime.now()

# Run predictions
y_pred = predict(tokenized_data['test']['text'], model, tokenizer)
y_reference = tokenized_data['test']['titles']

# End time
end_time = datetime.now()

# Calculate duration
inference_duration = round((end_time - start_time).total_seconds() / 60, 2)

# Evaluate
metrics = evaluate(y_reference, y_pred)

Device set to use cuda:0
Generating Titles:   0%|          | 0/138 [00:00<?, ?it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
DynamicCache + torch.export is tested on torch 2.6.0+ and may not work on earlier versions.
Generating Titles: 100%|██████████| 138/138 [50:14<00:00, 21.84s/it]


In [54]:
# Experiment configuration
fine_tuned = "Yes"
comments = "Fine-tuned Model"

experiment_metadata = {
    "Start Time": start_time.strftime("%Y-%m-%d %H:%M:%S"),
    "End Time": end_time.strftime("%Y-%m-%d %H:%M:%S"),
    "Inference Duration (min)": inference_duration,
    "Model": model_path,
    "Tokenizer": tokenizer.name_or_path,
    "Fine-Tuned": fine_tuned,
    "Test Size": len(y_reference),
    "Prompt Template": prompt_template,
    "Comments": comments
}

# Save the experiment
save_experiment(experiment_metadata, metrics, mode="append") # mode={overwrite, append, visualize}

Experiment saved to experiment_results.csv


Unnamed: 0,Start Time,End Time,Inference Duration (min),Model,Tokenizer,Fine-Tuned,Test Size,Prompt Template,Comments,BLEU,ROUGE-2,ROUGE-L
0,2025-04-17 23:28:31,2025-04-18 00:06:13,37.7,meta-llama/Llama-3.2-3B-Instruct,meta-llama/Llama-3.2-3B-Instruct,No,1097,You are an expert at writing concise and infor...,Baseline Model,0.0859,0.2321,0.3804
1,2025-04-18 00:32:57,2025-04-18 00:33:04,0.11,meta-llama/Llama-3.2-3B-Instruct,meta-llama/Llama-3.2-3B-Instruct,Yes,2,You are an expert at writing concise and infor...,Fine-tuned Model,0.0,0.3845,0.6009
2,2025-04-18 04:43:31,2025-04-18 05:33:45,50.24,meta-llama/Llama-3.2-3B-Instruct,meta-llama/Llama-3.2-3B-Instruct,Yes,1097,You are an expert at writing concise and infor...,Fine-tuned Model,0.1322,0.2575,0.4135


## Close Work

### Finish Weight & Bias

In [55]:
wandb.finish(exit_code=0)
model.config.use_cache = True

0,1
eval/loss,█▇▆▅▅▁▁▁▁
eval/mean_token_accuracy,▁▂▃▃▃████
eval/num_tokens,▁▁▁▁▁▃▄▆█
eval/runtime,▁▁▁▁▁████
eval/samples_per_second,███▇█▁▁▁▁
eval/steps_per_second,█████▁▁▁▁
train/epoch,▄▁▁▁▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇▇██
train/global_step,▁▁▁▁▂▃▃▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇████
train/grad_norm,█▄▂▃▂▂▁▂▂▂▂▃▄▂▂▁▂▂▂▁▁▂▂▁▂▂▃▃▂▃▂▄▄▂▂▂▂▃▃▂
train/learning_rate,▃▅▃█████▇▇▇▇▇▆▆▆▆▆▆▅▅▄▄▄▃▃▃▃▂▂▂▂▂▂▂▁▁▁▁▁

0,1
eval/loss,1.69746
eval/mean_token_accuracy,0.63423
eval/num_tokens,1851392.0
eval/runtime,671.1725
eval/samples_per_second,3.361
eval/steps_per_second,0.42
total_flos,4.038248661359002e+16
train/epoch,0.99933
train/global_step,1127.0
train/grad_norm,0.15165


### Save Model Locally

In [None]:
# Save trained model and tokenizer
trainer.save_model(output_dir)
tokenizer.save_pretrained(output_dir)

('llama-3.2-fine-tuned-model/tokenizer_config.json',
 'llama-3.2-fine-tuned-model/special_tokens_map.json',
 'llama-3.2-fine-tuned-model/tokenizer.json')

In [None]:
# checkpoint = 'checkpoint-1125'

# # Compress the checkpoint folder into a zip file.
# shutil.make_archive(f'/kaggle/working/{output_dir}/{checkpoint}', 'zip', f'/kaggle/working/{output_dir}/{checkpoint}')

# FileLink(f"{output_dir}/{checkpoint}.zip")

### Push the model and tokenizer to the Hugging Face Hub

In [62]:
model.push_to_hub(output_dir, use_temp_dir=False)
tokenizer.push_to_hub(output_dir, use_temp_dir=False)

model.safetensors:   0%|          | 0.00/3.55G [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.18k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Gabriel-Ferreira/llama-3.2-fine-tuned-model/commit/dfda0c6b3665eadf973a6013498754131970e6ba', commit_message='Upload tokenizer', commit_description='', oid='dfda0c6b3665eadf973a6013498754131970e6ba', pr_url=None, repo_url=RepoUrl('https://huggingface.co/Gabriel-Ferreira/llama-3.2-fine-tuned-model', endpoint='https://huggingface.co', repo_type='model', repo_id='Gabriel-Ferreira/llama-3.2-fine-tuned-model'), pr_revision=None, pr_num=None)