In [1]:
import time
import os
from unsloth import FastLanguageModel
import torch
import json
from transformers import AdamW
from transformers import TrainingArguments, Trainer
from torch.utils.data import Dataset, DataLoader
from transformers import DataCollatorForSeq2Seq
import argparse
from datasets import Dataset, DatasetDict
from datasets import load_dataset

import torch
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


  from .autonotebook import tqdm as notebook_tqdm


🦥 Unsloth Zoo will now patch everything to make training faster!


In [2]:
def getDatalist(jsonPath):
    """Read a JSON Lines file and return its records as a list.

    Each non-blank line of the file is parsed as one JSON document.

    Args:
        jsonPath: Path of the JSON Lines file to read.

    Returns:
        list: The parsed records, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    data_list = []
    # The handle is deliberately not called `input` (that would shadow the builtin), and the
    # `with` block closes it, so no explicit close() is needed.
    with open(jsonPath, 'r', encoding="utf-8") as json_file:
        for line in json_file:
            record = line.strip()
            if not record:
                # Blank lines (e.g. a trailing newline at end of file) carry no record.
                continue
            data_list.append(json.loads(record))
    return data_list

def getDataWithPrompt(datalist):
    """Convert raw records into instruction/input/output training rows.

    For every record the stripped question is joined with the first retrieved document under a
    ``### Retrieved Document:`` header to form the ``input`` field; the stripped summary becomes
    the ``output`` field. The ``instruction`` field is always the empty string.

    Args:
        datalist: Iterable of dicts with the keys ``"question"`` (str), ``"summary"`` (str) and
            ``"retrieval"`` (a non-empty sequence of dicts that each have a ``"doc"`` key).

    Returns:
        list[dict]: One ``{"instruction", "input", "output"}`` dict per record, in input order.

    Raises:
        KeyError: If a record lacks one of the required keys.
        IndexError: If a record's ``"retrieval"`` sequence is empty.
    """
    res = []
    for record in datalist:
        question = record["question"]
        summary = record["summary"]
        # Only the top-ranked retrieved document is placed in the prompt.
        retrieval_doc = record["retrieval"][0]["doc"]
        input_text = f"{question.strip()}\n### Retrieved Document:\n{retrieval_doc}"
        res.append({"instruction": "", "input": input_text, "output": summary.strip()})
    return res

def preprocess_function(examples):
    """Tokenize source and target texts into fixed-length feature lists.

    Uses the notebook-level ``tokenizer``. Source texts are truncated/padded to 1024 tokens and
    target texts to 50 tokens.

    Args:
        examples: Mapping with the keys ``"input_text"`` and ``"target"``.

    Returns:
        dict: ``input_ids`` and ``attention_mask`` from the tokenized sources, and ``labels`` set
        to the ``input_ids`` of the tokenized targets.
    """
    # NOTE(review): no visible cell calls this function, and the rows built by getDataWithPrompt
    # use "input"/"output" rather than "input_text"/"target" — confirm whether it is still needed.
    # NOTE(review): padded label positions keep the tokenizer's pad ids as-is — confirm that is
    # what the consuming trainer expects.
    source_texts = examples["input_text"]
    target_texts = examples["target"]

    shared_options = {"truncation": True, "padding": "max_length"}
    encoded_sources = tokenizer(source_texts, max_length=1024, **shared_options)
    encoded_targets = tokenizer(target_texts, max_length=50, **shared_options)

    features = {name: encoded_sources[name] for name in ("input_ids", "attention_mask")}
    features["labels"] = encoded_targets["input_ids"]
    return features


def formatting_prompts_func(examples):
    """Render a batch of rows into full training texts.

    Each (instruction, input, output) triple is substituted into the notebook-level
    ``alpaca_prompt`` template and the notebook-level ``EOS_TOKEN`` is appended.

    Args:
        examples: Batched mapping with parallel lists under the keys ``"instruction"``,
            ``"input"`` and ``"output"``.

    Returns:
        dict: ``{"text": [...]}`` with one rendered string per row, in batch order.
    """
    rows = zip(examples["instruction"], examples["input"], examples["output"])
    rendered = [
        alpaca_prompt.format(instruction, context, answer) + EOS_TOKEN
        for instruction, context, answer in rows
    ]
    return {"text": rendered}

In [3]:
# Model setup: load the base model, attach the adapter configuration below, and define the
# prompt template used for both training and inference.

# Sequence-length limit; passed to the model loader here and to SFTTrainer in the SFT cell.
max_seq_length = 2048 
# NOTE(review): None presumably lets the library pick the dtype — confirm against the Unsloth docs.
dtype = None 
# Request 4-bit loading of the base weights.
load_in_4bit = True 

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Qwen2.5-7B",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
   
)
# formatting_prompts_func appends this token to every training text.
EOS_TOKEN = tokenizer.eos_token 
# Rebind `model` to the PEFT-wrapped model; r / lora_alpha / lora_dropout are the LoRA settings
# and target_modules lists the attention and MLP projection layers they are applied to.
model = FastLanguageModel.get_peft_model(
    model,
    r = 8, 
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0, 
    bias = "none",
    use_gradient_checkpointing = "unsloth",
    random_state = 3407,
    use_rslora = False,  
    loftq_config = None, 
)



# Prompt template with three positional slots, filled in order with (instruction, input, output).
# getDataWithPrompt leaves the instruction empty and puts the question plus the retrieved document
# in the input slot; the inference cells pass "" for the output slot so the model writes the summary.
alpaca_prompt = """You are a helpful assistant. Your task is to summarize the given question based on the provided question and possibly helpful retrieved documents. The retrieved documents may or may not be useful for summarization.
{}
### Question:
{}
### Summary:
{}"""

==((====))==  Unsloth 2025.3.9: Fast Qwen2 patching. Transformers: 4.49.0.
   \\   /|    NVIDIA RTX A6000. Num GPUs = 2. Max memory: 47.544 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.4.0+cu121. CUDA: 8.6. CUDA Toolkit: 12.1. Triton: 3.0.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.27.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Loading checkpoint shards: 100%|██████████████████████████████| 2/2 [00:01<00:00,  1.01it/s]
Unsloth 2025.3.9 patched 28 layers with 28 QKV layers, 28 O layers and 28 MLP layers.


In [114]:
# Build the training data: read the reranked JSON Lines files, turn each record into an
# instruction/input/output row, and render the training split into full prompt texts.
# NOTE(review): these are absolute, machine-specific paths — consider a configurable data directory.
train_data_ = getDatalist("/data/luo/hqs/yahoo_lg/train_rerank_seed4396.json")
# The "validation" split below is built from the test_rerank file.
val_data_ = getDatalist("/data/luo/hqs/yahoo_lg/test_rerank_seed4396.json")
train_data = getDataWithPrompt(train_data_)
val_data = getDataWithPrompt(val_data_)
dataset__ = DatasetDict({
    "train": Dataset.from_list(train_data),
    "validation": Dataset.from_list(val_data)
})
# Only the train split is rendered into the "text" column; `dataset` is what the SFT cell trains on.
dataset = dataset__["train"].map(formatting_prompts_func, batched = True,)
# Quick sanity check: one rendered row, then the dataset summary.
print(dataset[0])
print(dataset)

Map: 100%|██████████████████████████████████████████████████████████| 1000/1000 [00:00<00:00, 78801.79 examples/s]

{'instruction': '', 'input': "anyone know any cures or treatments for a crick in your neck???? ive tried everything - heating pads, ice packs (which i wasn't supposed to use), i tried the thing that stimulates my nerves, but the battery didn't work, those things you heat up in the microwave, Advil, and a prescribed medicine thing - i'm in so much pain!!!!!!! and i'm looking for serious answers!!\n### Retrieved Document:\nWhat are the causes and remedies for the Neck pain?   answer: Causes: stress is #1 in my book, next to injury. I suffer from neck pain from sitting at the computer for long periods of time. Cramps that come from sleeping in a bad position at night can pinch your nerves and cause discomfort.\n\nRemedies: I bought a neck messaging chair for about $100. It does great for short-term relief, but for the long-term, it sucks. Heating pads help to calm the nerves in the neck, as well as massage. Sometimes, depending on the source of your pain, Tylenol-type products will help."




In [23]:
# Reference run on the public Alpaca dataset. The formatted result is bound to its own name so
# that executing the notebook top to bottom does not replace `dataset` (the retrieval-augmented
# training set built in the previous cell) right before the SFT cell consumes it.
dataset_ = load_dataset("yahma/alpaca-cleaned", split = "train")
alpaca_dataset = dataset_.map(formatting_prompts_func, batched = True,)

# SFT

In [None]:
# Pick the mixed-precision mode once: bf16 when the GPU supports it, fp16 otherwise.
use_bf16 = is_bfloat16_supported()

# Optimisation hyper-parameters for the supervised fine-tuning run.
training_args = TrainingArguments(
    per_device_train_batch_size = 4,
    gradient_accumulation_steps = 4,
    warmup_steps = 5,
    num_train_epochs = 10,
    learning_rate = 2e-4,
    fp16 = not use_bf16,
    bf16 = use_bf16,
    logging_steps = 1,
    optim = "adamw_8bit",
    weight_decay = 0.01,
    lr_scheduler_type = "linear",
    seed = 1,
    output_dir = "outputs",
    report_to = "none", # Use this for WandB etc
)

# Train on the rendered "text" column of `dataset`, without sequence packing.
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False,
    args = training_args,
)

# Run training and report the wall-clock duration.
start_time = time.time()
trainer_stats = trainer.train()
end_time = time.time()
elapsed_time = end_time - start_time
print(f"Training completed in {elapsed_time:.2f} seconds.")

Tokenizing to ["text"] (num_proc=2): 100%|████████████████████████████| 1000/1000 [00:01<00:00, 561.05 examples/s]
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 1,000 | Num Epochs = 10 | Total steps = 310
O^O/ \_/ \    Batch size per device = 8 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (8 x 4 x 1) = 32
 "-____-"     Trainable parameters = 20,185,088/5,042,935,296 (0.40% trained)


Step,Training Loss
1,2.3883
2,2.4549
3,2.4336
4,2.4505
5,2.2986
6,2.33
7,2.2247
8,2.1493
9,2.0373
10,2.095


# Inference

In [86]:
# Put the model into inference mode before generating.
FastLanguageModel.for_inference(model)

# Build the prompt for one training example, leaving the summary slot empty so that the model
# has to produce it.
sample = dataset[3]
input_text = alpaca_prompt.format(sample["instruction"], sample["input"], "")
inputs = tokenizer([input_text], return_tensors = "pt").to("cuda")

# Generate at most 50 new tokens and print the decoded sequence (prompt included).
outputs = model.generate(**inputs, max_new_tokens = 50, use_cache = True)
print(tokenizer.batch_decode(outputs))

['You are a helpful assistant. Your task is to summarize the given question based on the provided question and possibly helpful retrieved documents. The retrieved documents may or may not be useful for summarization.\n\n### Question:\nHow do I get rid of pain in my back after soccer? I am 25 and is it normal to have back pain every time after I play soccer? Help me, am I doing anything wrong and what can I do to make the pain go away?\n### Retrieved Document:\nRight Lower Back Pain HELP!!? I have lower back pain problems that just started about a week ago. If i lay on my back and lift my left leg it doesnt hurt, but when i lift my right leg my right lower back hurts. The worst part is that i play soccer and kick with my right foot so i have to lift it and it REALLY hurts. I need to know some exercises and why it hurts and what i should do PLEASE!!  answer: Acute back pain is treated with muscle relaxantsor nonsteroidal anti-inflammatory drugs (NSAIDs), such as ibuprofen or aspirin. App

In [97]:
import re
FastLanguageModel.for_inference(model)
def getOutputs(model, tokenizer, dataset, savepath):
    """Generate one summary per example and write them to ``savepath``, one per line.

    For each example the notebook-level ``alpaca_prompt`` is rendered with an empty summary slot,
    the model generates up to 80 new tokens, and the text between the ``### Summary:`` header and
    the first ``<|endoftext|>`` marker (or the end of the decoded text) is taken as the summary.

    Exactly one entry is produced per example so that line *k* of the output file always belongs
    to example *k*; this keeps the file aligned with a line-per-example reference file when
    scoring. An example whose decoded text contains no summary section yields an empty line.

    Args:
        model: Model exposing ``generate(**inputs, max_new_tokens=..., use_cache=...)``.
        tokenizer: Tokenizer that is callable on a prompt string and exposes ``batch_decode``.
        dataset: Indexable, sized collection of rows with ``"instruction"`` and ``"input"`` keys.
        savepath: Path of the UTF-8 text file to write.

    Returns:
        list[str]: The extracted summaries, in dataset order (``len(res) == len(dataset)``).
    """
    # DOTALL lets the capture span line breaks; without it a multi-line summary never matches.
    pattern = re.compile(r"### Summary:\n(.*?)(<\|endoftext\|>|$)", re.DOTALL)
    res = []
    for i in range(len(dataset)):
        example = dataset[i]
        input_text = alpaca_prompt.format(
            example["instruction"],
            example["input"],
            "",
        )
        inputs = tokenizer(input_text, return_tensors = "pt").to("cuda")

        outputs = model.generate(**inputs, max_new_tokens = 80, use_cache = True)
        text = tokenizer.batch_decode(outputs)[0]

        match = pattern.search(text)
        summary = match.group(1) if match else ""
        # Fold the summary onto a single line so the one-line-per-example file layout holds.
        pieces = (piece.strip() for piece in summary.splitlines())
        res.append(" ".join(piece for piece in pieces if piece))

        if "endoftext" not in text:
            # Generation hit the token limit before the end marker; show it for inspection.
            print(i)
            print(text)
    with open(savepath, "w", encoding="utf-8") as file:
        for string in res:
            file.write(string + "\n")
    return res

In [124]:
# Generate summaries for the evaluation set and write them to the file that the ROUGE cell
# reads through its `save_path` variable ("save_path"), so scoring uses this run's output.
# NOTE(review): `test_dataset` is not defined in any visible cell — confirm where it is built.
results = getOutputs(model, tokenizer, test_dataset, "save_path")

334
You are a helpful assistant. Your task is to summarize the given question based on the provided question and possibly helpful retrieved documents. The retrieved documents may or may not be useful for summarization.

### Question:
I'm concerned much? what's my risk of contracting the HIV/AIDS virus if I've never had vaginal or anal sex, never performed oral sex, never shared needles or had tattoos, never had blood transfusions, never had needlestick lacerations, never shared sex toys, but received unprotected oral sex a few times, engaged in deep French kissing, was fingered, and I had no underwear on, and I sat on a man's penis(which was covered by his underwear), and his underwear was wet from his semen, and also got some semen on my finger, wiped it off on the guy's shirt,(I didn't look when I wiped it off on his shirt), and then, a few seconds later, with that same finger, touched my labia minora, and lastly, I touched his penis(which seemed moist), then touched a one-day-old op

In [117]:
# Score the generated summaries against the reference summaries and print the ROUGE table.
# NOTE(review): `rouge155` is not imported in any visible cell — confirm where it comes from.
# NOTE(review): `save_path` must name the same file that getOutputs wrote its summaries to.
targets_path = ".../CHQ-Summ/test.target"
save_path = "save_path"
print(rouge155.calculate_rouge155_md(targets_path, save_path))


| ROUGE-1 | ROUGE-2 | ROUGE-L |
|---------|---------|---------|
|   42.95  |   22.82  |   40.03  |

