In [None]:
# Mount Google Drive so the dataset, adapter and output paths below
# (all rooted at /content/drive) are reachable from this runtime.
from google.colab import drive
drive.mount('/content/drive')

# Notebook shell escape (not plain Python): install the packages behind
# evaluate.load("rouge"), which is called further down.
!pip install evaluate rouge_score -q

import os
import torch
import numpy as np
import json
import evaluate
import matplotlib.pyplot as plt
import re

from datasets import load_dataset
from transformers import (
    AutoConfig,
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
)
from peft import (
    PeftModel,
    LoraConfig,
    get_peft_model,
)

# Select the non-interactive matplotlib backend (avoids errors when this is
# not run inside a notebook environment).
plt.switch_backend('Agg')
# ROUGE metric object, reused by every evaluation step in this file.
rouge = evaluate.load("rouge")

# --------------------------------------------------------------------------------
# 1. Paths and environment settings (unchanged)
# --------------------------------------------------------------------------------
BASE_PATH = "/content/drive/MyDrive/textanl"
TRAIN_DATA_FILE = os.path.join(BASE_PATH, "dataset/train_instruction_corpus.jsonl")
VALID_DATA_FILE = os.path.join(BASE_PATH, "dataset/valid_instruction_corpus.jsonl")
OUTPUT_DIR = os.path.join(BASE_PATH, "qa")
MODEL_ID = "Qwen/Qwen2.5-1.5B-Instruct"
# Adapter directory loaded with PeftModel.from_pretrained and trained further.
LORA_PATH = os.path.join(BASE_PATH, "dapt_params/qwen2.5_1.5b_dapt_adapter_bf16")

# Token budget applied when tokenizing and when building the collator.
MAX_LENGTH = 512
PROMPT_SEPARATOR_WITH_NEWLINE = "### Assistant:\n" # separator that marks where label masking ends

# --------------------------------------------------------------------------------
# 2. Load the base model plus the previously trained (DAPT) LoRA adapter
# --------------------------------------------------------------------------------

# The tokenizer comes from the base model id, not from the adapter directory.
# NOTE(review): fix_mistral_regex is named after Mistral tokenizers while
# MODEL_ID is a Qwen checkpoint - confirm the flag is intended here.
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_ID,
    trust_remote_code=True,
    fix_mistral_regex=True,
)
# Fall back to EOS as the padding token when the tokenizer defines none.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# 1) Load the stock Qwen2.5 base model.
base_model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    device_map="auto",
    torch_dtype=torch.float16,   # to use bf16 instead, change this together with TrainingArguments
    trust_remote_code=True,
)

# 2) Load the existing LoRA adapter and keep it trainable, so training
#    continues on the same adapter weights.
model = PeftModel.from_pretrained(
    base_model,
    LORA_PATH,
    is_trainable=True,
)

# The generation KV cache is switched off for training.
model.config.use_cache = False
model.print_trainable_parameters()
# NOTE(review): gradient checkpointing is enabled manually here, while the
# TrainingArguments below pass gradient_checkpointing=False - confirm that
# the manual call is the intended source of truth.
model.gradient_checkpointing_enable()
model.enable_input_require_grads()

# --------------------------------------------------------------------------------
# 4. Load the dataset and tokenize it (stricter masking and cleaning)
# --------------------------------------------------------------------------------
# Both JSONL files are read into one dataset dict with "train" and
# "validation" splits; the mapping code below reads the "text" field.
raw_datasets = load_dataset(
    "json",
    data_files={"train": TRAIN_DATA_FILE, "validation": VALID_DATA_FILE}
)

def clean_input_text(text):
    """Collapse every whitespace run in *text* into a single space.

    Newlines, tabs and repeated spaces are treated alike, and leading and
    trailing whitespace is removed. Any value that is not a ``str`` yields an
    empty string.
    """
    if isinstance(text, str):
        # split() without arguments cuts on whitespace runs and drops the
        # empty edges, so joining with one space normalizes the whole text.
        return " ".join(text.split())
    return ""

def tokenize_and_mask_function(examples):
    """Tokenize one example and build labels that supervise only the answer.

    The "text" field is whitespace-normalized with ``clean_input_text`` and
    tokenized (truncated to ``MAX_LENGTH``, unpadded). Everything up to and
    including the cleaned "### Assistant:" header is masked with -100 in the
    labels. When the header is missing, or the prompt alone fills the whole
    tokenized sequence, every label is -100.

    Args:
        examples: A single (non-batched) record with a "text" entry.

    Returns:
        The tokenizer output with an added "labels" list.
    """
    cleaned_text = clean_input_text(examples["text"])

    # NOTE(review): no end-of-sequence token is appended here; confirm that
    # the model is not expected to learn where an answer stops.
    encoded = tokenizer(
        cleaned_text, truncation=True, max_length=MAX_LENGTH, padding=False,
    )
    token_ids = encoded["input_ids"]
    total_len = len(token_ids)

    # Cleaning turns the separator's trailing newline into nothing, so the
    # header searched for in the cleaned text is "### Assistant:".
    header = clean_input_text(PROMPT_SEPARATOR_WITH_NEWLINE)
    prompt_text, found_header, _answer = cleaned_text.partition(header)

    # Default: mask the whole sequence; narrowed only when a usable prompt
    # boundary is found.
    masked_len = total_len
    if found_header:
        # Re-tokenize prompt + header alone to learn how many leading tokens
        # belong to the prompt.
        prompt_ids = tokenizer(
            prompt_text + header,
            add_special_tokens=False,
            truncation=True,
            max_length=MAX_LENGTH,
            padding=False,
        )["input_ids"]
        if len(prompt_ids) < total_len:
            masked_len = len(prompt_ids)

    encoded["labels"] = [-100] * masked_len + list(token_ids[masked_len:])
    return encoded

print("[-] Tokenizing dataset and applying label masking (Input Cleaned)...")
tokenized_datasets = raw_datasets.map(
    tokenize_and_mask_function, batched=False, remove_columns=["text"],
)

# Drop examples whose labels are entirely -100 (header not found, or the
# prompt filled the whole truncated sequence). They carry no training signal,
# and a batch made only of such rows has no supervised token, which can turn
# the averaged loss into NaN.
tokenized_datasets = tokenized_datasets.filter(
    lambda example: any(label != -100 for label in example["labels"]),
)

train_dataset = tokenized_datasets["train"]
eval_dataset = tokenized_datasets["validation"]


# --------------------------------------------------------------------------------
# 5. Training configuration (TrainingArguments)
# --------------------------------------------------------------------------------

# ROUGE helper: extracts the bare answer from a decoded sequence.
def filter_generated_text(text, separator):
    """Return the answer portion of *text*, or "" when no header is present.

    *separator* is whitespace-normalized first (so "### Assistant:\\n" becomes
    "### Assistant:"). Everything after its first occurrence in *text* is
    kept, and any remaining "### Human:" / "### Assistant:" markers are
    removed from that remainder.
    """
    header = " ".join(separator.split())
    _prompt, found_header, answer = text.partition(header)
    if not found_header:
        return ""

    answer = answer.strip()
    for marker in ("### Human:", "### Assistant:"):
        answer = answer.replace(marker, "").strip()
    return answer

# compute_metrics: ROUGE-L restricted to the supervised (answer) tokens.
def compute_metrics(eval_pred):
    """Compute ROUGE-L between teacher-forced predictions and reference answers.

    The labels already carry -100 on the prompt, the header and the padding,
    so the label mask itself selects the answer. The previous implementation
    searched the decoded labels for the "### Assistant:" header, which is
    never there after masking, and therefore scored against empty references.

    Args:
        eval_pred: ``(predictions, labels)``. ``predictions`` is either a
            3-D array of logits (arg-maxed over the last axis here) or a 2-D
            array of token ids; both are assumed to be laid out as
            (batch, sequence) like ``labels`` - TODO confirm against the
            Trainer configuration in use.

    Returns:
        ``{"rougeL": <score>}``.
    """
    predictions, labels = eval_pred
    predictions = np.asarray(predictions)
    labels = np.asarray(labels)

    if predictions.ndim == 3:
        predictions = np.argmax(predictions, axis=-1)

    # A causal LM's output at position t is its guess for token t + 1, so the
    # predictions are compared with the labels shifted left by one.
    predictions = predictions[:, :-1]
    labels = labels[:, 1:]

    # Keep only supervised positions. Everything else becomes the pad token,
    # which also removes any -100 filler present in the prediction array
    # (negative ids cannot be decoded).
    answer_mask = labels != -100
    pad_id = tokenizer.pad_token_id
    pred_ids = np.where(answer_mask, predictions, pad_id)
    label_ids = np.where(answer_mask, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    rouge_results = rouge.compute(
        predictions=[text.strip() for text in decoded_preds],
        references=[text.strip() for text in decoded_labels],
    )
    return {"rougeL": rouge_results["rougeL"]}



# Trainer configuration: checkpoints and logs go under OUTPUT_DIR; logging,
# evaluation and saving all happen every 1000 steps, and the checkpoint with
# the lowest eval_loss is restored at the end.
training_args = TrainingArguments(
    output_dir=os.path.join(OUTPUT_DIR, "qwen_r32_ift_checkpoints"),

    # 8 sequences per device x 4 accumulation steps = 32 sequences per
    # optimizer step on each device.
    per_device_train_batch_size=8,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=4,

    learning_rate=5e-5,
    num_train_epochs=3,

    logging_strategy="steps",
    logging_steps=1000,             # the training loss is logged at this interval

    # Kept in step with the float16 dtype used when loading the base model.
    fp16=True,
    bf16=False,

    optim="adamw_torch",
    lr_scheduler_type="cosine",
    warmup_ratio=0.06,

    do_eval=True,
    eval_strategy="steps",          # validation loss is computed on a step schedule
    eval_steps=1000,                # eval_loss is logged every 1000 steps

    # Save cadence equals the eval cadence, so each evaluated step has a
    # checkpoint that load_best_model_at_end can restore.
    save_strategy="steps",
    save_steps=1000,

    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",  # model selection is based on eval_loss
    greater_is_better=False,

    logging_dir=os.path.join(OUTPUT_DIR, "qwen_r32_ift_checkpoints", "logs"),
    report_to="none",
    # NOTE(review): the model had gradient_checkpointing_enable() called on it
    # directly when it was loaded, so this False does not describe the actual
    # setup - confirm which one is intended.
    gradient_checkpointing=False,
)

from dataclasses import dataclass
from typing import Dict, List, Any
import torch
from transformers import PreTrainedTokenizerBase

@dataclass
class DataCollatorForCausalLMWithMaskedLabels:
    """Batch pre-tokenized causal-LM examples that already carry masked labels.

    The model inputs are padded by ``tokenizer.pad`` to the longest sequence
    in the batch; the labels are padded separately with -100 so that padding
    positions stay out of the loss.
    """

    # Tokenizer whose ``pad`` method builds the padded input tensors.
    tokenizer: PreTrainedTokenizerBase
    # Forwarded to ``tokenizer.pad`` as ``max_length``.
    max_length: int = 1024

    def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, torch.Tensor]:
        """Collate *features* into one batch of tensors.

        Args:
            features: Examples that each contain a "labels" sequence next to
                the fields understood by ``tokenizer.pad``.

        Returns:
            The padded batch with a ``torch.long`` "labels" tensor whose
            width matches "input_ids".
        """
        # Set the label rows aside; they are padded by hand further down.
        label_rows = [example["labels"] for example in features]

        # tokenizer.pad only receives the remaining fields. Each example is
        # copied first so the caller's dictionaries stay untouched.
        model_inputs = []
        for example in features:
            stripped = dict(example)
            del stripped["labels"]
            model_inputs.append(stripped)

        batch = self.tokenizer.pad(
            model_inputs,
            padding="longest",
            max_length=self.max_length,
            return_tensors="pt",
        )

        # Start from a tensor filled with -100 and copy each label row into
        # its leading columns.
        width = batch["input_ids"].shape[1]
        label_tensor = torch.full(
            (len(label_rows), width),
            -100,
            dtype=torch.long,
        )
        for row_index, row in enumerate(label_rows):
            clipped = row[:width]
            label_tensor[row_index, :len(clipped)] = torch.tensor(clipped, dtype=torch.long)

        batch["labels"] = label_tensor
        return batch

# Collator that pads inputs with the tokenizer and labels with -100.
data_collator = DataCollatorForCausalLMWithMaskedLabels(
    tokenizer=tokenizer,
    max_length=MAX_LENGTH,
)

from transformers import Trainer

# compute_metrics is left as None, so evaluation during training reports the
# loss only; ROUGE-L is not part of the Trainer's evaluation logs.
trainer = Trainer(
    model=model,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    args=training_args,
    data_collator=data_collator,
    compute_metrics=None,
)

# --------------------------------------------------------------------------------
# 6. Run training and save the result
# --------------------------------------------------------------------------------
# NOTE(review): the banner mentions "Logging ROUGE-L", but the Trainer above
# is built with compute_metrics=None, so only losses are logged.
print("---ÌååÏù∏ÌäúÎãù ÏãúÏûë--- (r=32, Input Cleaned, Logging ROUGE-L)")
trainer.train()

# Write the model held by the trainer and the tokenizer into one directory.
save_path = os.path.join(OUTPUT_DIR, "qa_params")
trainer.model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)
print(f"[-] IFT Model saved to: {save_path}")

# --------------------------------------------------------------------------------
# 7. Save the training log (raw data for later visualization)
# --------------------------------------------------------------------------------
print("\n[--- Saving Training and Evaluation Logs Ï†ÄÏû•Ï§ë---]")

# log_history is taken from the trainer state and dumped as JSON unchanged.
log_history = trainer.state.log_history
log_output_path = os.path.join(OUTPUT_DIR, "qwen_r32_ift_metrics_log.json")

with open(log_output_path, 'w', encoding='utf-8') as f:
    json.dump(log_history, f, ensure_ascii=False, indent=4)

print(f"‚úÖ ÌïôÏäµ/ÌèâÍ∞Ä Î°úÍ∑∏Í∞Ä ÏÑ±Í≥µÏ†ÅÏúºÎ°ú Ï†ÄÏû•ÏôÑÎ£å: {log_output_path}")

# NOTE(review): the hint below names 'eval_rougeL', which cannot appear in the
# log while compute_metrics stays None.
print("\nÎ°úÍ∑∏ ÌôúÏö© Í∞ÄÏù¥Îìú]")
print(f"Ï†ÄÏû•Îêú {os.path.basename(log_output_path)} ÌååÏùºÏùÑ Ïù¥Ïö©ÌïòÏó¨ 'loss', 'eval_loss', 'eval_rougeL' Í∞íÏùÑ Ï∂îÏ∂úÌïòÏó¨ ÏãúÍ∞ÅÌôî Í∞ÄÎä•.")

Mounted at /content/drive
  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m84.1/84.1 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading builder script: 0.00B [00:00, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/660 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors:   0%|          | 0.00/3.09G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/242 [00:00<?, ?B/s]

trainable params: 73,859,072 || all params: 1,617,573,376 || trainable%: 4.5660


Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

[-] Tokenizing dataset and applying label masking (Input Cleaned)...


Map:   0%|          | 0/41097 [00:00<?, ? examples/s]

Map:   0%|          | 0/5138 [00:00<?, ? examples/s]

The model is already on multiple devices. Skipping the move to device specified in `args`.
You're using a Qwen2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


---ÌååÏù∏ÌäúÎãù ÏãúÏûë--- (r=32, Input Cleaned, Logging ROUGE-L)


Step,Training Loss,Validation Loss
1000,1.0204,0.994381
2000,0.8949,0.979438
3000,0.854,0.978041


[-] IFT Model saved to: /content/drive/MyDrive/textanl/qa/qa_params

[--- Saving Training and Evaluation Logs Ï†ÄÏû•Ï§ë---]
‚úÖ ÌïôÏäµ/ÌèâÍ∞Ä Î°úÍ∑∏Í∞Ä ÏÑ±Í≥µÏ†ÅÏúºÎ°ú Ï†ÄÏû•ÏôÑÎ£å: /content/drive/MyDrive/textanl/qa/qwen_r32_ift_metrics_log.json

Î°úÍ∑∏ ÌôúÏö© Í∞ÄÏù¥Îìú]
Ï†ÄÏû•Îêú qwen_r32_ift_metrics_log.json ÌååÏùºÏùÑ Ïù¥Ïö©ÌïòÏó¨ 'loss', 'eval_loss', 'eval_rougeL' Í∞íÏùÑ Ï∂îÏ∂úÌïòÏó¨ ÏãúÍ∞ÅÌôî Í∞ÄÎä•.


In [None]:
# Separate cell that mounts Google Drive again; it repeats the mount done at
# the top of the notebook.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
from torch.utils.data import DataLoader
import numpy as np

print("\n[--- ÏµúÏ¢Ö Í≤ÄÏ¶ù: validation ÏÖã ROUGE-L Ìïú Î≤à Í≥ÑÏÇ∞ ---]")

# Final check: one pass over the validation split that scores the model's
# teacher-forced next-token predictions (the gold prefix is always fed in;
# nothing is generated freely) against the reference answer with ROUGE-L.
model.eval()
eval_loader = DataLoader(
    eval_dataset,
    batch_size=1,           # one example per batch, so no padding is added
    shuffle=False,
    collate_fn=data_collator,
)

all_preds_text = []
all_labels_text = []

for batch in eval_loader:
    batch = {k: v.to(model.device) for k, v in batch.items()}

    with torch.no_grad():
        outputs = model(
            input_ids=batch["input_ids"],
            attention_mask=batch["attention_mask"],
        )
        logits = outputs.logits          # (1, L, V)

    # (1, L) -> (L,)
    pred_ids = torch.argmax(logits, dim=-1).cpu().numpy()[0]
    label_ids = batch["labels"].cpu().numpy()[0]

    # The output at position t is the model's guess for token t + 1. Drop the
    # last prediction and the first label so both arrays describe the same
    # tokens; without this shift every prediction is compared with the token
    # one step before the one it was predicting.
    pred_ids = pred_ids[:-1]
    label_ids = label_ids[1:]

    # Only answer tokens are scored (prompt and header are already -100).
    mask = label_ids != -100
    if not np.any(mask):
        continue  # nothing supervised in this example

    pred_ans_ids = pred_ids[mask]
    label_ans_ids = label_ids[mask]

    pred_text = tokenizer.decode(pred_ans_ids, skip_special_tokens=True)
    label_text = tokenizer.decode(label_ans_ids, skip_special_tokens=True)

    # The label mask has already isolated the answer, so no header-based
    # text filtering is applied here.
    all_preds_text.append(pred_text.strip())
    all_labels_text.append(label_text.strip())

# ROUGE over all collected prediction/reference pairs.
final_metrics = rouge.compute(
    predictions=all_preds_text,
    references=all_labels_text,
)

print(f"‚úÖ ÏµúÏ¢Ö ROUGE-L: {final_metrics['rougeL']:.4f}")



[--- ÏµúÏ¢Ö Í≤ÄÏ¶ù: validation ÏÖã ROUGE-L Ìïú Î≤à Í≥ÑÏÇ∞ ---]
‚úÖ ÏµúÏ¢Ö ROUGE-L: 0.3893
