In [2]:
# =========================================================
# 🧠 Fine-tuning Small Language Model with Teacher LLM
# by Pongsatron / Template for AIAT Internship Project
# =========================================================

# ✅ STEP 1: Install dependencies
!pip install -q transformers peft datasets accelerate bitsandbytes openai

# ✅ STEP 2: Import libraries
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model
from openai import OpenAI
import pandas as pd
import os, json, time, random
from google.colab import userdata

OPENROUTER_API_KEY = userdata.get('OPENROUTER_API_KEY')

In [3]:
# 🔑 ตั้งค่า API key ของ OpenRouter (หรือ OpenAI)
client = OpenAI(
  base_url="https://openrouter.ai/api/v1",
  api_key=OPENROUTER_API_KEY,
)


In [4]:
# ====== CONFIG ======
MAIN_PATH = "/content/drive/MyDrive/demo_finetuning/"
CSV_PATH = MAIN_PATH + "teacher_prompts_50.csv"          # ไฟล์ prompt ชุด 200 แถว
DDI_CSV  = MAIN_PATH + "db_drug_interactions.csv"         # ไฟล์ drug-drug interactions (ถ้ามี)

CSV_JSON = MAIN_PATH + "teacher_prompts_50.json"          # ไฟล์ prompt ชุด 200 แถว
DDI_JSON = MAIN_PATH + "db_drug_interactions.json"

MODEL_NAME = "openai/gpt-4o-mini"       # teacher model
OUT_CSV   = MAIN_PATH + "teacher_dataset.csv"             # dataset ออกเป็น CSV
OUT_JSONL = MAIN_PATH + "teacher_dataset.jsonl"           # dataset ออกเป็น JSONL

# สร้างเพิ่มจากฐานข้อมูลดิบ (เปิด/ปิดได้)
INCLUDE_RAW_DB_TASKS = True

# ขีดจำกัด / backoff
SLEEP_BETWEEN_CALLS = 1.0
MAX_RETRIES = 3
BACKOFF_BASE = 1.8

from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
# ====== TEST CONNECTION (เหมือนโค้ดเดิม) ======
test = client.responses.create(model=MODEL_NAME, input="ping")
print("✅ OpenRouter connected:", test.output_text[:60], "...")

✅ OpenRouter connected: Pong! How can I assist you today? ...


In [6]:

# ====== HELPERS ======
def robust_input_text(instruction: str, input_payload: str | None) -> str:
    """
    รวม instruction + input ให้เป็นข้อความเดียว เผื่อโมเดลที่รับเป็นข้อความเดี่ยว
    """
    input_payload = (input_payload or "").strip()
    if input_payload:
        return f"{instruction.strip()}\n\nINPUT:\n{input_payload}"
    return instruction.strip()

def llm_call_with_retry(model: str, prompt: str):
    """
    เรียก LLM พร้อม retry + backoff แบบง่าย
    """
    last_err = None
    for attempt in range(1, MAX_RETRIES + 1):
        try:
            resp = client.responses.create(model=model, input=prompt)
            return resp.output_text
        except Exception as e:
            last_err = e
            wait = (BACKOFF_BASE ** (attempt - 1)) + random.uniform(0, 0.3)
            print(f"⚠️  Error (attempt {attempt}/{MAX_RETRIES}): {e} → retry in {wait:.1f}s")
            time.sleep(wait)
    # ถ้าไม่สำเร็จเลย ให้โยน error สุดท้ายออกไป
    raise last_err

def append_record(dataset: list, instruction: str, input_payload: str, output_text: str, meta: dict):
    """
    เพิ่มแถวลง dataset (เก็บ instruction/input/output + metadata)
    """
    row = {
        "instruction": instruction,
        "input": input_payload or "",
        "output": output_text,
    }
    # แนบ metadata ไว้ให้
    for k, v in (meta or {}).items():
        row[k] = v
    dataset.append(row)

  # ====== (ทางเลือก) สร้างงานเพิ่มจากฐาน Medicine + DDI ======
def safe_get(row, key):
    try:
        v = row.get(key, "")
        if pd.isna(v):
            return ""
        return str(v)
    except Exception:
        return ""


In [7]:

# ====== โหลด prompt จากไฟล์ teacher_prompts_200.csv ======
if not os.path.exists(CSV_PATH):
    raise FileNotFoundError(f"ไม่พบไฟล์ {CSV_PATH} ในโฟลเดอร์ปัจจุบัน")

prompts_df = pd.read_csv(CSV_PATH)
required_cols = {"instruction", "input"}
missing = required_cols - set(map(str.lower, prompts_df.columns))
# รองรับกรณีคอลัมน์ตัวพิมพ์ใหญ่/เล็กต่างกัน
colmap = {c.lower(): c for c in prompts_df.columns}
instruction_col = colmap.get("instruction")
input_col = colmap.get("input")

if instruction_col is None or input_col is None:
    raise ValueError(f"ไฟล์ {CSV_PATH} ต้องมีคอลัมน์ instruction และ input")

# คอลัมน์เมตา (ถ้ามี)
task_type_col  = colmap.get("task_type")
language_col   = colmap.get("language")
difficulty_col = colmap.get("difficulty")
expected_col   = colmap.get("expected_output_format")

dataset = []
ddi_dataset = []

print(f"▶️  Generating from {CSV_PATH} ({len(prompts_df)} rows) ...")
if not os.path.exists(CSV_JSON):
  for i, row in prompts_df.iterrows():
      try:
          instruction = str(row[instruction_col] or "").strip()
          input_payload = "" if pd.isna(row[input_col]) else str(row[input_col]).strip()

          # รวมข้อความสำหรับส่งเข้าโมเดล (ตัวเดียวกับโค้ดเดิมใช้ param 'input')
          prompt_text = robust_input_text(instruction, input_payload)

          # เรียก teacher
          output_text = llm_call_with_retry(MODEL_NAME, prompt_text)

          # แนบเมตา
          meta = {}
          if task_type_col:  meta["task_type"] = row.get(task_type_col, "")
          if language_col:   meta["language"] = row.get(language_col, "")
          if difficulty_col: meta["difficulty"] = row.get(difficulty_col, "")
          if expected_col:   meta["expected_output_format"] = row.get(expected_col, "")

          append_record(dataset, instruction, input_payload, output_text, meta)

          if (i + 1) % 10 == 0:
              print(f"  ✓ Done {i+1}/{len(prompts_df)}")
          time.sleep(SLEEP_BETWEEN_CALLS)
      except Exception as e:
          print(f"❌ Error at row {i+1}: {e}")
          continue
  export_json = json.dumps(dataset, ensure_ascii=False)
  with open(CSV_JSON, "w", encoding="utf-8") as f:
      f.write(export_json)
else:
  with open(CSV_JSON, "r", encoding="utf-8") as f:
      dataset = json.load(f)


if INCLUDE_RAW_DB_TASKS:
    # โหลดไฟล์ถ้ามี
    ddi_df = pd.read_csv(DDI_CSV) if os.path.exists(DDI_CSV) else pd.DataFrame()


    # ------------ งานจาก db_drug_interactions ------------
    if not ddi_df.empty:
        def fcol(df, names, default=None):
            low = {c.lower(): c for c in df.columns}
            for n in names:
                for c in low:
                    if n.lower() == c or n.lower() in c:
                        return low[c]
            return default or df.columns[0]

        a_col = fcol(ddi_df, ["drug_a","drug1","object","perpetrator","index_drug"])
        b_col = fcol(ddi_df, ["drug_b","drug2","precipitant","victim","coadministered"])
        sev_col = fcol(ddi_df, ["severity","risk_level","level"])
        mech_col = fcol(ddi_df, ["mechanism","moa","mechanism_of_interaction"])
        mgmt_col = fcol(ddi_df, ["management","action","recommendation","clinical_management"])
        desc_col = fcol(ddi_df, ["description","details","interaction_description","notes"])

        print(f"▶️  Generating from {DDI_CSV} ({len(ddi_df)} rows) ...")

        if not os.path.exists(DDI_JSON):
          for i, row in ddi_df.iterrows():
              try:
                  # ตัวอย่างงาน: DDI explanation (EN)
                  instruction = f"Explain the drug–drug interaction between {safe_get(row, a_col)} and {safe_get(row, b_col)}. Include severity, mechanism, clinical consequences, and recommended management."
                  input_payload = json.dumps({
                      "severity": safe_get(row, sev_col),
                      "mechanism": safe_get(row, mech_col),
                      "description": safe_get(row, desc_col),
                      "management": safe_get(row, mgmt_col),
                  }, ensure_ascii=False)

                  prompt_text = robust_input_text(instruction, input_payload)
                  output_text = llm_call_with_retry(MODEL_NAME, prompt_text)

                  meta = {
                      "task_type": "ddi_explain.en(raw)",
                      "language": "en",
                      "source": "db_drug_interactions.csv"
                  }
                  append_record(ddi_dataset, instruction, input_payload, output_text, meta)
                  if (i + 1) % 50 == 0:
                      print(f"  ✓ DDI {i+1}/{len(ddi_df)}")
                  time.sleep(SLEEP_BETWEEN_CALLS)
              except Exception as e:
                  print(f"❌ DDI row {i+1}: {e}")
                  continue
          export_json = json.dumps(ddi_dataset, ensure_ascii=False)
          with open(DDI_JSON, "w", encoding="utf-8") as f:
              f.write(export_json)
        else:
          with open(DDI_JSON, "r", encoding="utf-8") as f:
              ddi_dataset = json.load(f)

dataset = [*dataset, *ddi_dataset]


▶️  Generating from /content/drive/MyDrive/demo_finetuning/teacher_prompts_50.csv (49 rows) ...
▶️  Generating from /content/drive/MyDrive/demo_finetuning/db_drug_interactions.csv (49 rows) ...


In [8]:

# ====== บันทึกผลลัพธ์ ======
df = pd.DataFrame(dataset)
df.to_csv(OUT_CSV, index=False)
print(f"✅ Dataset generated with {len(df)} samples and saved as {OUT_CSV}")

with open(OUT_JSONL, "w", encoding="utf-8") as f:
    for _, r in df.iterrows():
        f.write(json.dumps({
            "instruction": r["instruction"],
            "input": r.get("input", ""),
            "output": r["output"],
            # เขียนเมตาเพิ่มได้ตามต้องการ
            "meta": {
                "task_type": r.get("task_type",""),
                "language": r.get("language",""),
                "difficulty": r.get("difficulty",""),
                "expected_output_format": r.get("expected_output_format",""),
                "source": r.get("source","teacher_prompts_50.csv")
            }
        }, ensure_ascii=False) + "\n")
print(f"✅ Also saved JSONL at {OUT_JSONL}")

✅ Dataset generated with 98 samples and saved as /content/drive/MyDrive/demo_finetuning/teacher_dataset.csv
✅ Also saved JSONL at /content/drive/MyDrive/demo_finetuning/teacher_dataset.jsonl


In [9]:
# -*- coding: utf-8 -*-
import os, time, functools, math, random, json
import pandas as pd
import torch
from datasets import Dataset
from transformers import (
    AutoTokenizer, AutoModelForCausalLM,
    TrainingArguments, Trainer,
    DataCollatorForLanguageModeling, BitsAndBytesConfig, EarlyStoppingCallback,
    set_seed
)
# Import เพิ่มสำหรับโหลดโมเดล LoRA
from peft import LoraConfig, get_peft_model, PeftModel

# =========================
# 🔧 CONFIG
# =========================
SEED = 42
VAL_SIZE = 0.1                  # 10% validation
PATIENCE = 2                    # early stopping patience
EVAL_STRATEGY = "epoch"         # "epoch" or "steps"
MODEL_NAME = "Qwen/Qwen3-1.7B"
MAX_LENGTH = 2048
SAMPLE_N = 20 # จำนวน sample สำหรับ quick compare

# --- !! สำคัญ: แก้ MAIN_PATH ให้เป็น path ที่ถูกต้องของคุณ !! ---
# ตัวอย่าง: MAIN_PATH = "/content/drive/MyDrive/demo_finetuning/"
# ถ้า MAIN_PATH ไม่ได้ถูกกำหนดใน cell ก่อนหน้า, ให้กำหนดที่นี่
if 'MAIN_PATH' not in globals():
    MAIN_PATH = "/content/drive/MyDrive/demo_finetuning/" # <--- แก้ไข path ตรงนี้

OUTPUT_DIR = MAIN_PATH + "finetuned_qwen3_1p7b_lora_checkpoints" # เปลี่ยนชื่อเพื่อไม่ให้สับสนกับที่เซฟสุดท้าย
SAVE_DIR = MAIN_PATH + "qwen3_1p7b-instruct-lora" # ที่เซฟโมเดลสุดท้าย
DATA_CSV_PATH = MAIN_PATH + "teacher_dataset.csv" # Path ของไฟล์ข้อมูล
LOG_CSV = MAIN_PATH + "training_logs.csv" # Path สำหรับบันทึก training logs
COMPARE_CSV = MAIN_PATH + "winrate_20.csv" # Path สำหรับบันทึกผล quick compare
USE_CHAT_TEMPLATE = False # ถ้าโมเดลรองรับ chat template ของ HF ให้ตั้งเป็น True

os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
os.environ["TOKENIZERS_PARALLELISM"] = "false"
set_seed(SEED)

# =========================
# ⚙️ Load tokenizer/model
# =========================
bnb_config = BitsAndBytesConfig(
    load_in_8bit=True,
    llm_int8_threshold=6.0,
)

def with_retry(fn, retries=3, delay=8):
    import functools, time
    @functools.wraps(fn)
    def _wrap(*args, **kwargs):
        last = None
        for i in range(retries):
            try:
                return fn(*args, **kwargs)
            except Exception as e:
                last = e
                print(f"[retry {i+1}/{retries}] {e}")
                time.sleep(delay)
        raise last
    return _wrap

load_tokenizer = with_retry(lambda: AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True))
tokenizer = load_tokenizer()
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

load_base_model = with_retry(lambda: AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map="auto",
    trust_remote_code=True,
    quantization_config=bnb_config,
    torch_dtype=torch.bfloat16 if torch.cuda.is_available() else None,
    low_cpu_mem_usage=True
))
base_model = load_base_model()

# =================================================================
# ---  lógica nueva: comprueba si existe un modelo, si no, entrena ---
# =================================================================

# Check if a trained model adapter already exists
if os.path.exists(SAVE_DIR) and 'adapter_config.json' in os.listdir(SAVE_DIR):
    # --- If model exists, load it ---
    print(f"✅ Found existing fine-tuned model at '{SAVE_DIR}'. Loading it.")
    model = PeftModel.from_pretrained(base_model, SAVE_DIR)
    print("✅ Model loaded successfully from saved directory.")

else:
    # --- If model does not exist, start training ---
    print(f"❌ No fine-tuned model found at '{SAVE_DIR}'. Starting a new training session.")

    # Create LoRA model from base for training
    lora_config = LoraConfig(
        r=8,
        lora_alpha=16,
        lora_dropout=0.1,
        bias="none",
        task_type="CAUSAL_LM",
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
    )
    model = get_peft_model(base_model, lora_config)
    model.print_trainable_parameters()

    # Load dataset
    def load_rows_from_sources():
        df = pd.DataFrame()
        if os.path.exists(DATA_CSV_PATH):
            try:
                df = pd.read_csv(DATA_CSV_PATH)
                print(f"Loaded data from {DATA_CSV_PATH}")
            except Exception as e:
                print(f"Error loading {DATA_CSV_PATH}: {e}")
        else:
            print(f"Data file not found at {DATA_CSV_PATH}")

        # Ensure required columns exist and handle NaNs
        for col in ["instruction", "input", "output"]:
            if col not in df.columns:
                df[col] = ""
            df[col] = df[col].fillna("").astype(str)

        # Filter for samples with non-empty instruction and output
        df = df[(df["instruction"].str.strip() != "") & (df["output"].str.strip() != "")]
        df = df.drop_duplicates(subset=["instruction", "input"], keep="last").reset_index(drop=True)

        if len(df) == 0:
            raise ValueError("No trainable samples found: instruction + output are required.")
        return df

    df = load_rows_from_sources()
    print(f"✅ Loaded samples: {len(df)}")
    dataset = Dataset.from_pandas(df)

    # Prompt formatting
    def format_example(example):
        instruction = (example.get("instruction") or "").strip()
        inp = (example.get("input") or "").strip()
        output = (example.get("output") or "").strip()

        if inp:
            user_text = f"### Instruction:\n{instruction}\n\nINPUT:\n{inp}"
        else:
            user_text = f"### Instruction:\n{instruction}"

        text = f"{user_text}\n\n### Response:\n{output}{tokenizer.eos_token}"
        return tokenizer(text, truncation=True, max_length=MAX_LENGTH, padding=False)

    tokenized = dataset.map(format_example, remove_columns=dataset.column_names)

    # Train/Val split
    split = tokenized.train_test_split(test_size=VAL_SIZE, seed=SEED)
    train_dataset = split["train"]
    eval_dataset  = split["test"]
    print(f"Train: {len(train_dataset)} | Val: {len(eval_dataset)}")

    # Collator
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

    # TrainingArguments + EarlyStopping
    use_bf16 = torch.cuda.is_available() and torch.cuda.get_device_capability(0)[0] >= 8
    training_args = TrainingArguments(
        output_dir=OUTPUT_DIR,
        per_device_train_batch_size=2,
        gradient_accumulation_steps=2,
        learning_rate=2e-4,
        num_train_epochs=5,
        logging_steps=10,
        save_total_limit=1,
        save_strategy=EVAL_STRATEGY,
        evaluation_strategy=EVAL_STRATEGY, # Correct parameter name
        bf16=use_bf16,
        fp16=not use_bf16,
        report_to="none",
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",
        greater_is_better=False,
        seed=SEED
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=PATIENCE)]
    )

    # Train
    print("🏋️ Starting training...")
    trainer.train()

    # Save best model
    print(f"💾 Saving best model to {SAVE_DIR}...")
    model.save_pretrained(SAVE_DIR)
    tokenizer.save_pretrained(SAVE_DIR)
    print(f"✅ Fine-tuning complete! Model saved at {SAVE_DIR}")

# ===============================================================
# --- END OF NEW LOGIC ---
# At this point, the `model` variable is ready for inference,
# whether it was loaded from disk or freshly trained.
# ===============================================================
print("\nModel is ready for use.")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/726 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/3.44G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/622M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

✅ Found existing fine-tuned model at '/content/drive/MyDrive/demo_finetuning/qwen3_1p7b-instruct-lora'. Loading it.
✅ Model loaded successfully from saved directory.

Model is ready for use.


In [12]:
# -*- coding: utf-8 -*-
import os, time, functools, math, random, json
import pandas as pd
import torch
from datasets import Dataset
from transformers import (
    AutoTokenizer, AutoModelForCausalLM,
    TrainingArguments, Trainer,
    DataCollatorForLanguageModeling, BitsAndBytesConfig, EarlyStoppingCallback,
    set_seed
)
# Import เพิ่มสำหรับโหลดโมเดล LoRA
from peft import LoraConfig, get_peft_model, PeftModel

# =========================
# 📊 Extract train/eval losses → CSV
# =========================
log_hist = []
if 'trainer' in globals() and hasattr(trainer, 'state') and hasattr(trainer.state, 'log_history'):
    log_hist = trainer.state.log_history

# ดึงเฉพาะ record ที่มีค่า loss
records = []
for rec in log_hist:
    row = {}
    if "loss" in rec: row["train_loss"] = rec["loss"]
    if "eval_loss" in rec: row["eval_loss"] = rec["eval_loss"]
    if "epoch" in rec: row["epoch"] = rec["epoch"]
    if row:
        records.append(row)
df_log = pd.DataFrame(records)

if not df_log.empty:
    df_log.to_csv(LOG_CSV, index=False)
    last_train_loss = df_log["train_loss"].dropna().iloc[-1] if "train_loss" in df_log and df_log["train_loss"].notna().any() else None
    last_eval_loss  = df_log["eval_loss"].dropna().iloc[-1]  if "eval_loss"  in df_log and df_log["eval_loss"].notna().any()  else None
    print(f"📉 Final train loss: {last_train_loss}")
    print(f"🧪 Final eval  loss: {last_eval_loss}")
    print(f"📝 Logs saved: {LOG_CSV}")
else:
    last_train_loss = None
    last_eval_loss = None
    print("⚠️ No training logs found. Skipping loss extraction.")


# =========================
# 🧪 Quick before/after compare (win-rate@{SAMPLE_N})
# =========================
def rouge_l_f(ref, hyp):
    ref_tokens = ref.split()
    hyp_tokens = hyp.split()
    m, n = len(ref_tokens), len(hyp_tokens)
    dp = [[0]*(n+1) for _ in range(m+1)]
    for i in range(m):
        for j in range(n):
            if ref_tokens[i] == hyp_tokens[j]:
                dp[i+1][j+1] = dp[i][j] + 1
            else:
                dp[i+1][j+1] = max(dp[i][j+1], dp[i+1][j])
    lcs = dp[m][n]
    prec = lcs / (n or 1)
    rec  = lcs / (m or 1)
    if prec + rec == 0: return 0.0
    return 2 * prec * rec / (prec + rec)

# ชื่อไฟล์ CSV ที่จะใช้เก็บผล win-rate
RESULT_DIR = "/content/drive/MyDrive/demo_finetuning"
os.makedirs(RESULT_DIR, exist_ok=True)
WINRATE_CSV = os.path.join(RESULT_DIR, "winrate_20.csv")

# 🔎 ตรวจว่ามีไฟล์ winrate อยู่แล้วไหม
if os.path.exists(WINRATE_CSV):
    print(f"📂 พบไฟล์ผลลัพธ์ก่อนหน้า: {WINRATE_CSV}")
    df_cmp = pd.read_csv(WINRATE_CSV)
    win_rate = (df_cmp["winner"] == "FT").mean() if "winner" in df_cmp.columns else 0.0

    print("\n========== 📊 PREVIOUS EVALUATION SUMMARY ==========")
    print(f"Train loss (last): {last_train_loss}")
    print(f"Eval  loss (last): {last_eval_loss}")
    try:
        ppl = math.exp(float(last_eval_loss)) if last_eval_loss is not None and pd.notnull(last_eval_loss) else None
        print(f"Perplexity (eval): {ppl:.2f}" if ppl is not None else "Perplexity (eval): N/A")
    except Exception:
        print("Perplexity (eval): Calculation Error")

    print(f"Win-rate@{len(df_cmp)} (FT vs BASE): {win_rate*100:.1f}%")
    print("👀 ตัวอย่างบรรทัดแรก:")
    print(df_cmp.head().to_string(index=False))
    print("====================================================")

else:
    print("🔎 ไม่พบไฟล์ winrate_20.csv — กำลังรันการเปรียบเทียบใหม่...")

    # เตรียมตัวอย่าง 20 แถวจากชุด eval ดิบ
    if 'eval_dataset' in globals() and "__index_level_0__" in eval_dataset.column_names:
        eval_raw = df.iloc[eval_dataset["__index_level_0__"]]
    else:
        eval_raw = df.sample(min(SAMPLE_N, len(df)), random_state=SEED)

    eval_sample = eval_raw.sample(min(SAMPLE_N, len(eval_raw)), random_state=SEED).reset_index(drop=True)

    def build_user_text_from_df_row(r):
        instr = (r.get("instruction","") or "").strip()
        inp = (r.get("input","") or "").strip()
        return f"{instr}\n\nINPUT:\n{inp}" if inp else instr

    @torch.no_grad()
    def generate_batch(model_obj, rows_df, max_new_tokens=196):
        model_obj.eval()
        outs = []
        for _, r in rows_df.iterrows():
            user_text = build_user_text_from_df_row(r)
            if USE_CHAT_TEMPLATE:
                messages = [{"role": "user", "content": user_text}]
                prompt_text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
            else:
                prompt_text = f"### Instruction:\n{user_text}\n\n### Response:\n"
            inputs = tokenizer(prompt_text, return_tensors="pt").to(model_obj.device)
            gen = model_obj.generate(
                **inputs,
                max_new_tokens=max_new_tokens,
                do_sample=False,
                eos_token_id=tokenizer.eos_token_id
            )
            decoded = tokenizer.decode(gen[0], skip_special_tokens=True)
            prompt_end_index = decoded.find(prompt_text)
            outs.append(decoded[prompt_end_index + len(prompt_text):].strip() if prompt_end_index != -1 else decoded.strip())
        return outs

    print("🔎 Generating base vs finetuned outputs on 20 samples...")
    base_for_eval = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        device_map="auto",
        trust_remote_code=True,
        quantization_config=bnb_config,
        torch_dtype=torch.bfloat16 if torch.cuda.is_available() else None,
        low_cpu_mem_usage=True
    )

    if hasattr(base_for_eval, "gradient_checkpointing_enable"):
        base_for_eval.gradient_checkpointing_enable()

    base_preds = generate_batch(base_for_eval, eval_sample)
    ft_preds   = generate_batch(model,         eval_sample)

    # คำนวณ ROUGE-L และสรุปผล
    rows_cmp, wins = [], 0
    for i, r in eval_sample.iterrows():
        ref = (r.get("output","") or "")
        b = rouge_l_f(ref, base_preds[i])
        f = rouge_l_f(ref, ft_preds[i])
        winner = "FT" if f > b else ("BASE" if b > f else "TIE")
        if winner == "FT": wins += 1
        rows_cmp.append({
            "idx": i,
            "instruction": (r["instruction"][:120] + "…") if len(r["instruction"]) > 120 else r["instruction"],
            "base_score_rougeL": round(b, 4),
            "ft_score_rougeL": round(f, 4),
            "winner": winner
        })

    df_cmp = pd.DataFrame(rows_cmp)
    df_cmp.to_csv(WINRATE_CSV, index=False)
    win_rate = wins / len(df_cmp) if len(df_cmp) else 0.0

    print("========== 📊 EVALUATION SUMMARY ==========")
    print(f"Train loss (last): {last_train_loss}")
    print(f"Eval  loss (last): {last_eval_loss}")
    try:
        ppl = math.exp(float(last_eval_loss)) if last_eval_loss is not None and pd.notnull(last_eval_loss) else None
        print(f"Perplexity (eval): {ppl:.2f}" if ppl is not None else "Perplexity (eval): N/A")
    except Exception:
        print("Perplexity (eval): Calculation Error")

    print(f"Win-rate@{len(df_cmp)} (FT vs BASE): {win_rate*100:.1f}%")
    print(f"💾 บันทึกผลลง: {WINRATE_CSV}")
    print("==========================================")

⚠️ No training logs found. Skipping loss extraction.
📂 พบไฟล์ผลลัพธ์ก่อนหน้า: /content/drive/MyDrive/demo_finetuning/winrate_20.csv

Train loss (last): None
Eval  loss (last): None
Perplexity (eval): N/A
Win-rate@20 (FT vs BASE): 85.0%
👀 ตัวอย่างบรรทัดแรก:
 idx                                                                                                               instruction  base_score_rougeL  ft_score_rougeL winner
   0 Explain the drug–drug interaction between Docetaxel and Docetaxel. Include severity, mechanism, clinical consequences, a…             0.1333           0.2108     FT
   1 Normalize the following drug record for Augmentin 625 Duo Tablet into JSON with keys: drug, brand, class, mechanism, ind…             0.2210           0.7041     FT
   2                       Translate the mechanism of action of Avastin 400mg Injection into Thai and keep it to one sentence.             0.0632           0.0845     FT
   3 Create a concise clinical card for Aciloc 150 Tablet. Incl

In [14]:
!pip install rouge_score

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=b4fc8de4bbaf484a01c7251cd1fae75e5a8f5d71565a35be0eece132c7549e89
  Stored in directory: /root/.cache/pip/wheels/85/9d/af/01feefbe7d55ef5468796f0c68225b6788e85d9d0a281e7a70
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


In [15]:
!pip install bert-score

Collecting bert-score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Downloading bert_score-0.3.13-py3-none-any.whl (61 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bert-score
Successfully installed bert-score-0.3.13


In [18]:
# -*- coding: utf-8 -*-
import os, glob, math, re, datetime as dt
from collections import Counter
import pandas as pd
import numpy as np
import torch

# ==============================================================================
# สคริปต์นี้จะ:
# - โหลด df_val อัตโนมัติ (หรือสร้างตัวอย่าง 3 แถว)
# - ใช้ build_prompt สำรองถ้าไม่มี
# - ประเมินโมเดล (แบบไม่พึ่ง trainer) และคำนวณเมตริกต่าง ๆ
# - บันทึกผลเป็น CSV /content/drive/MyDrive/demo_finetuning/metrics_val_*.csv
# - ถ้ามีไฟล์เก่า: จะแสดงไฟล์ล่าสุดแทนการรันใหม่ (ตั้ง FORCE_RERUN=True เพื่อบังคับรัน)
# ==============================================================================

def _has(name):
    return (name in globals()) and (globals()[name] is not None)

# ====== 0) โหลด df_val อัตโนมัติ ======
def _pick_latest(paths):
    paths = [p for p in paths if os.path.exists(p)]
    if not paths: return None
    return max(paths, key=lambda p: os.path.getmtime(p))

if not _has("df_val"):
    CANDIDATES = []
    base_dirs = [".", "/content", "/content/drive/MyDrive", "/content/drive/MyDrive/demo_finetuning"]
    names = ["val.csv", "validation.csv", "df_val.csv", "valid.csv", "dev.csv",
             "val.parquet", "validation.parquet"]
    for d in base_dirs:
        for n in names:
            CANDIDATES.append(os.path.join(d, n))

    hit = _pick_latest([p for p in CANDIDATES if p.endswith(".csv")]) \
          or _pick_latest([p for p in CANDIDATES if p.endswith(".parquet")])

    if hit:
        try:
            if hit.endswith(".csv"):
                df_val = pd.read_csv(hit)
            else:
                df_val = pd.read_parquet(hit)
            print(f"✅ โหลด df_val จากไฟล์: {hit} | shape={df_val.shape}")
        except Exception as e:
            print(f"⚠️ โหลดไฟล์ {hit} ไม่สำเร็จ: {e}")

if not _has("df_val"):
    default_csv = "/content/drive/MyDrive/demo_finetuning/validation.csv"
    if os.path.exists(default_csv):
        try:
            df_val = pd.read_csv(default_csv)
            print(f"✅ โหลด df_val จากไฟล์: {default_csv} | shape={df_val.shape}")
        except Exception as e:
            print(f"⚠️ โหลด {default_csv} ไม่สำเร็จ: {e}")

if not _has("df_val"):
    print("⚠️ ไม่พบไฟล์ validation — จะสร้าง df_val ตัวอย่าง 3 แถว (โปรดเปลี่ยนเป็นข้อมูลจริงของคุณ)")
    df_val = pd.DataFrame([
        {"input":"Explain the mechanism of action of aspirin in one sentence.",
         "output":"Aspirin irreversibly inhibits COX-1 and COX-2 to reduce prostaglandin and thromboxane synthesis.",
         "drug":"aspirin"},
        {"input":"Describe the MOA of ibuprofen in one sentence.",
         "output":"Ibuprofen reversibly inhibits COX-1 and COX-2 to decrease prostaglandin production.",
         "drug":"ibuprofen"},
        {"input":"What is the mechanism of action of omeprazole? (one sentence)",
         "output":"Omeprazole irreversibly inhibits the gastric H+/K+-ATPase proton pump to suppress acid secretion.",
         "drug":"omeprazole"},
        {"input":"Give the MOA of loratadine in one sentence.",
         "output":"Loratadine antagonizes H1 histamine receptors to reduce allergic symptoms.",
         "drug":"loratadine"},
        {"input":"Explain how fluoxetine works (one sentence).",
         "output":"Fluoxetine inhibits SERT to increase synaptic serotonin levels.",
         "drug":"fluoxetine"},
        {"input":"Describe sertraline's mechanism of action in one sentence.",
         "output":"Sertraline selectively inhibits SERT to block serotonin reuptake.",
         "drug":"sertraline"},
        {"input":"What is haloperidol's mechanism of action? (one sentence)",
         "output":"Haloperidol antagonizes D2 dopamine receptors to reduce positive psychotic symptoms.",
         "drug":"haloperidol"},
        {"input":"Explain diazepam's mechanism of action in one sentence.",
         "output":"Diazepam positively modulates GABA-A receptors to enhance inhibitory GABAergic transmission.",
         "drug":"diazepam"},
        {"input":"Give the MOA of ketamine in one sentence.",
         "output":"Ketamine noncompetitively antagonizes NMDA receptors to reduce excitatory neurotransmission.",
         "drug":"ketamine"},
        {"input":"Explain the MOA of salbutamol (albuterol) in one sentence.",
         "output":"Salbutamol is a β2 agonist that activates adenylate cyclase, increases cAMP, and relaxes bronchial smooth muscle.",
         "drug":"salbutamol"},
        {"input":"Describe propranolol's mechanism of action in one sentence.",
         "output":"Propranolol antagonizes β1 and β2 adrenergic receptors to decrease heart rate and contractility.",
         "drug":"propranolol"},
        {"input":"What is losartan's mechanism of action? (one sentence)",
         "output":"Losartan is an ARB that antagonizes AT1 receptors to block angiotensin II effects.",
         "drug":"losartan"},
        {"input":"Explain the MOA of lisinopril in one sentence.",
         "output":"Lisinopril inhibits ACE to reduce angiotensin II formation and bradykinin breakdown.",
         "drug":"lisinopril"},
        {"input":"State amlodipine's mechanism of action in one sentence.",
         "output":"Amlodipine blocks L-type Ca2+ channels in vascular smooth muscle causing vasodilation.",
         "drug":"amlodipine"},
        {"input":"Explain lidocaine's mechanism of action in one sentence.",
         "output":"Lidocaine blocks voltage-gated Na+ channels to prevent action potential propagation.",
         "drug":"lidocaine"},
        {"input":"Describe clopidogrel's MOA in one sentence.",
         "output":"Clopidogrel irreversibly antagonizes platelet P2Y12 receptors to inhibit ADP-mediated aggregation.",
         "drug":"clopidogrel"},
        {"input":"What is warfarin's mechanism of action? (one sentence)",
         "output":"Warfarin inhibits VKORC1, reducing activation of vitamin K–dependent clotting factors.",
         "drug":"warfarin"},
        {"input":"Explain metformin's mechanism of action in one sentence.",
         "output":"Metformin activates AMPK to decrease hepatic gluconeogenesis and improve insulin sensitivity.",
         "drug":"metformin"},
        {"input":"State tocilizumab's mechanism of action in one sentence.",
         "output":"Tocilizumab antagonizes the IL-6 receptor and downregulates JAK/STAT signaling to reduce inflammation.",
         "drug":"tocilizumab"},
        {"input":"Give the MOA of amoxicillin in one sentence.",
         "output":"Amoxicillin inhibits bacterial cell wall synthesis by binding penicillin-binding proteins.",
         "drug":"amoxicillin"},
    ])

# ====== 1) build_prompt สำรอง ======
if not _has("build_prompt"):
    def build_prompt(row):
        return str(row.get("instruction") or row.get("input") or row.get("question") or "").strip()
    print("ℹ️ ใช้ build_prompt แบบสำรอง (ดึงจากคอลัมน์ instruction/input/question)")

# ====== 2) Evaluate & Save (robust ต่อกรณีไม่มี trainer) ======
eval_loss, eval_ppl = float("nan"), float("nan")

if _has("trainer") and hasattr(trainer, "evaluate"):
    try:
        eval_out = trainer.evaluate()
        eval_loss = float(eval_out.get("eval_loss", float("nan")))
        eval_ppl = math.exp(eval_loss) if eval_loss == eval_loss else float("nan")
        print(f"📊 Eval loss: {eval_loss:.4f} | Perplexity: {eval_ppl:.2f}")
    except Exception as e:
        print(f"⚠️ ข้าม evaluate ของ trainer เนื่องจาก: {e}")
else:
    print("⚠️ ไม่พบตัวแปร 'trainer' หรือไม่มีเมธอด evaluate — ข้ามการประเมิน eval_loss/perplexity")

save_dir =  MAIN_PATH + "qwen3_1p7b-instruct-lora-best"
_saved_any = False

if _has("trainer") and hasattr(trainer, "save_model"):
    try:
        trainer.save_model(save_dir)
        _saved_any = True
        print(f"✅ Saved best model (via trainer) at {save_dir}")
    except Exception as e:
        print(f"⚠️ บันทึกผ่าน trainer ไม่สำเร็จ: {e}")

if _has("model") and hasattr(model, "save_pretrained"):
    try:
        model.save_pretrained(save_dir)
        _saved_any = True
        print(f"✅ Saved model (save_pretrained) at {save_dir}")
    except Exception as e:
        print(f"⚠️ บันทึกโมเดลด้วย save_pretrained ไม่สำเร็จ: {e}")

if _has("tokenizer") and hasattr(tokenizer, "save_pretrained"):
    try:
        tokenizer.save_pretrained(save_dir)
        _saved_any = True
        print(f"✅ Saved tokenizer at {save_dir}")
    except Exception as e:
        print(f"⚠️ บันทึก tokenizer ไม่สำเร็จ: {e}")

if not _saved_any:
    print("⚠️ ยังไม่ได้บันทึกอะไรเลย (ไม่มี trainer/model/tokenizer ที่บันทึกได้) — ดำเนินการต่อ")

# ====== 3) เตรียม libs เมตริก ======
try:
    from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
except Exception:
    sentence_bleu, SmoothingFunction = None, None

try:
    from rouge_score import rouge_scorer
except Exception:
    rouge_scorer = None

try:
    from bert_score import score as bertscore
except Exception:
    bertscore = None
    print("⚠️ ไม่พบโมดูล 'bert-score' — ข้ามการคำนวณ BERTScore (F1). ติดตั้งด้วย: pip install bert-score")

# ====== 4) เครื่องมือช่วยและเมตริกต่าง ๆ ======
TARGET_LEXICON = [
    r"\bCOX-?1\b", r"\bCOX-?2\b", r"\bACE\b", r"\bACE2\b", r"\bARB\b", r"\bSERT\b", r"\bNET\b",
    r"\bDAT\b", r"\bGABA[A-B]?\b", r"\bNMDA\b", r"\bH1\b", r"\bH2\b", r"\bD2\b", r"\b5-HT\d?\b",
    r"\bNa\+? channel\b", r"\bK\+? channel\b", r"\bCa\+{1,2}\b", r"\bJAK/STAT\b", r"\bMAPK\b",
]
ACTION_LEXICON = [
    r"\binhibit(s|ion|or)?\b", r"\bblock(s|er|ade)?\b", r"\bantagon(ist|ize|ism)\b",
    r"\bagon(ist|ize|ism)\b", r"\bpartial agonist\b", r"\bmodulat(e|or|ion)\b",
    r"\bstimulat(e|ion|or)\b", r"\bactivate(s|ion|d)?\b", r"\birreversible\b", r"\breversible\b"
]
PATHWAY_LEXICON = [r"\bcAMP\b", r"\bPI3K\b", r"\bJAK/STAT\b", r"\bMAPK\b", r"\bPLC\b", r"\bPKC\b"]

def extract_facets(text: str):
    text = text or ""
    def find_any(patterns):
        found = set()
        for p in patterns:
            for m in re.finditer(p, text, flags=re.IGNORECASE):
                found.add(m.group(0).lower())
        return found
    return {
        "target": find_any(TARGET_LEXICON),
        "action": find_any(ACTION_LEXICON),
        "pathway": find_any(PATHWAY_LEXICON),
    }

def f1_from_sets(true_set, pred_set):
    tp = len(true_set & pred_set)
    fp = len(pred_set - true_set)
    fn = len(true_set - pred_set)
    prec = tp / (tp + fp) if (tp + fp) else 0.0
    rec  = tp / (tp + fn) if (tp + fn) else 0.0
    f1   = 2*prec*rec / (prec+rec) if (prec+rec) else 0.0
    return f1, prec, rec

def moa_facet_f1(reference: str, prediction: str):
    ref = extract_facets(reference)
    hyp = extract_facets(prediction)
    scores = {}
    for k in ["target","action","pathway"]:
        f1, p, r = f1_from_sets(ref[k], hyp[k])
        scores[f"{k}_f1"] = f1
        scores[f"{k}_prec"] = p
        scores[f"{k}_rec"] = r
    scores["facet_f1_macro"] = np.mean([scores["target_f1"], scores["action_f1"], scores["pathway_f1"]])
    return scores

KEY_TERMS = [
    "irreversible","reversible","cox-1","cox-2","ace","sert","net","nmda","gaba","h1","h2","d2",
    "agonist","antagonist","partial agonist","inhibitor","blocker","pathway"
]
def key_term_accuracy(reference: str, prediction: str):
    ref_l = reference.lower() if reference else ""
    hyp_l = prediction.lower() if prediction else ""
    present = [k for k in KEY_TERMS if k in ref_l]
    if not present:
        return {"key_terms_covered": 1.0, "n_terms": 0}
    covered = sum(1 for k in present if k in hyp_l)
    return {"key_terms_covered": covered/len(present), "n_terms": len(present)}

ACTION_VERBS = ["inhibits","blocks","antagonizes","activates","stimulates","modulates","binds"]
def extract_triples(drug_name: str, text: str):
    text = text or ""
    triples = set()
    for verb in ACTION_VERBS:
        pat = re.compile(rf"\b({re.escape(drug_name)})\b.*?\b({verb})\b.*?\b([A-Za-z0-9\-\+\/]+)\b", re.IGNORECASE)
        for m in pat.finditer(text):
            triples.add((m.group(1).lower(), m.group(2).lower(), m.group(3).lower()))
    return triples

def triple_match(drug_name: str, ref: str, hyp: str):
    ref_t = extract_triples(drug_name, ref)
    hyp_t = extract_triples(drug_name, hyp)
    exact = len(ref_t & hyp_t)
    ref_soft = {(a,t) for (_,a,t) in ref_t}
    hyp_soft = {(a,t) for (_,a,t) in hyp_t}
    soft = len(ref_soft & hyp_soft)
    denom = max(1, len(ref_t))
    return {"triple_exact": exact/denom, "triple_soft": soft/denom, "n_ref_triples": len(ref_t)}

def bleu_score(ref, hyp):
    if sentence_bleu is None: return None
    smoothie = SmoothingFunction().method3
    ref_tok, hyp_tok = ref.split(), hyp.split()
    return sentence_bleu([ref_tok], hyp_tok, smoothing_function=smoothie)

def rougeL_score(ref, hyp):
    if rouge_scorer is None: return None
    sc = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True).score(ref, hyp)
    return sc["rougeL"].fmeasure

# ====== 5) เตรียม df_val ให้มีคอลัมน์ 'output' ======
print(f"คอลัมน์ทั้งหมดที่มีใน df_val: {df_val.columns.to_list()}")
ACTUAL_ANSWER_COLUMN = 'expected_output_format'  # ปรับได้ให้ตรงกับ dataset ของคุณ
if 'output' not in df_val.columns:
    if ACTUAL_ANSWER_COLUMN in df_val.columns:
        print(f"พบปัญหา: ไม่พบคอลัมน์ 'output' ... → เปลี่ยนชื่อ '{ACTUAL_ANSWER_COLUMN}' เป็น 'output'")
        df_val = df_val.rename(columns={ACTUAL_ANSWER_COLUMN: 'output'})
    else:
        raise KeyError(f"ไม่พบคอลัมน์ 'output' และ '{ACTUAL_ANSWER_COLUMN}' ใน df_val: {df_val.columns.to_list()}")
else:
    print("✅ พบ 'output' ใน df_val แล้ว ดำเนินการต่อได้")

# ====== 6) เช็ค/โหลดไฟล์เก่า หรือรันใหม่แล้วเซฟ CSV ======
RESULT_DIR = "/content/drive/MyDrive/demo_finetuning"
os.makedirs(RESULT_DIR, exist_ok=True)
pattern = os.path.join(RESULT_DIR, "metrics_val_*.csv")
existing_csvs = sorted(glob.glob(pattern))
FORCE_RERUN = False  # True = บังคับรันใหม่แม้มีไฟล์เก่า

if (len(existing_csvs) > 0) and (not FORCE_RERUN):
    latest_csv = existing_csvs[-1]
    print(f"📂 พบไฟล์ผลลัพธ์ก่อนหน้า: {latest_csv}")
    prev_df = pd.read_csv(latest_csv)

    def _safe_mean(series):
        s = pd.Series([x for x in series if pd.notnull(x)])
        return float(s.mean()) if len(s) else float("nan")

    print("\n📊 สรุปจากไฟล์ก่อนหน้า:")
    for name in ["bleu","rougeL","facet_f1_macro","key_terms_covered","triple_exact","triple_soft","bertscore_f1"]:
        if name in prev_df.columns:
            print(f"- {name}: {_safe_mean(prev_df[name]):.4f}")
        else:
            print(f"- {name}: (n/a)")
    print("\n👀 ตัวอย่างบรรทัดแรก ๆ:")
    print(prev_df.head().to_string(index=False))

else:
    print("🔁 ไม่พบไฟล์เก่ามาก่อน หรือบังคับให้รันใหม่ — กำลังคำนวณ metrics ...")

    # ตรวจความพร้อมของ model/tokenizer/build_prompt
    if not _has("model") or not hasattr(model, "generate"):
        raise RuntimeError("ไม่พบ `model` ที่พร้อม generate — โปรดกำหนดโมเดล (เช่น AutoModelForCausalLM.from_pretrained)")
    if not _has("tokenizer") or not hasattr(tokenizer, "__call__"):
        raise RuntimeError("ไม่พบ `tokenizer` — โปรดกำหนด tokenizer ให้พร้อมใช้งาน")
    if not _has("build_prompt"):
        raise RuntimeError("ไม่พบฟังก์ชัน `build_prompt(row)` — จำเป็นต่อการสร้างพรอมป์จาก df_val")

    model.eval()
    device = "cuda" if torch.cuda.is_available() else "cpu"

    def generate_answer(prompt, max_new_tokens=160):
        inputs = tokenizer(prompt, return_tensors="pt").to(device)
        with torch.no_grad():
            outs = model.generate(**inputs, max_new_tokens=max_new_tokens)
        return tokenizer.decode(outs[0], skip_special_tokens=True)

    # เก็บผลเพื่อ batch BERTScore
    preds, refs, ids = [], [], []
    facet_rows, kta_rows, triple_rows, bleu_list, rl_list = [], [], [], [], []

    for i, row in df_val.iterrows():
        prompt = build_prompt(row)
        ref = str(row["output"])
        pred = generate_answer(prompt)

        facet = moa_facet_f1(ref, pred)
        kta   = key_term_accuracy(ref, pred)
        trip  = triple_match((row.get("drug") or row.get("DrugName") or "").strip() or "drug", ref, pred)
        bleu  = bleu_score(ref, pred)
        rL    = rougeL_score(ref, pred)

        ids.append(i)
        preds.append(pred)
        refs.append(ref)
        facet_rows.append(facet)
        kta_rows.append(kta)
        triple_rows.append(trip)
        bleu_list.append(None if bleu is None else float(bleu))
        rl_list.append(None if rL is None else float(rL) if rL is not None else None)

    # ========== BERTScore (เลือกภาษาอัตโนมัติ) ==========
    def _thai_ratio(text):
        if not text: return 0.0
        th = sum(0x0E00 <= ord(c) <= 0x0E7F for c in text)
        return th / max(1, len(text))

    if bertscore is not None and len(preds) > 0:
        try:
            # ถ้าอย่างน้อยครึ่งหนึ่งของ refs เป็นไทย -> lang='th' มิฉะนั้น 'en'
            thai_refs = sum(_thai_ratio(r) > 0.3 for r in refs)
            lang_choice = 'th' if thai_refs >= (len(refs) / 2) else 'en'
            # ถ้าเป็นข้อความผสมมาก ๆ แนะนำปิด rescale เพื่อลด bias ฐาน
            rescale = True if lang_choice == 'th' else False
            P, R, F1 = bertscore(preds, refs, lang=lang_choice, rescale_with_baseline=rescale)
            bs_f1 = [float(x) for x in F1.tolist()]
            print(f"ℹ️ BERTScore ใช้ภาษา: {lang_choice} | rescale_with_baseline={rescale}")
        except Exception as e:
            print(f"⚠️ คำนวณ BERTScore ไม่สำเร็จ: {e}")
            bs_f1 = [np.nan] * len(preds)
    else:
        bs_f1 = [np.nan] * len(preds)

    # รวมเป็น DataFrame
    records = []
    for j in range(len(ids)):
        rec = {
            "id": int(ids[j]),
            "prediction": preds[j],
            "reference": refs[j],
            "bleu": bleu_list[j],
            "rougeL": rl_list[j],
            "facet_f1_macro": facet_rows[j]["facet_f1_macro"],
            "target_f1": facet_rows[j]["target_f1"],
            "action_f1": facet_rows[j]["action_f1"],
            "pathway_f1": facet_rows[j]["pathway_f1"],
            "key_terms_covered": kta_rows[j]["key_terms_covered"],
            "triple_exact": triple_rows[j]["triple_exact"],
            "triple_soft": triple_rows[j]["triple_soft"],
            "bertscore_f1": bs_f1[j],
        }
        records.append(rec)

    metrics_df = pd.DataFrame(records)

    def safe_mean(s):
        s = pd.Series([x for x in s if x is not None and pd.notnull(x)])
        return float(s.mean()) if len(s) else float("nan")

    print("\n📊 Automatic Metrics on Validation:")
    print(f"- Eval loss:           {eval_loss:.4f} | Perplexity: {eval_ppl:.2f}")
    print(f"- BLEU:                {safe_mean(metrics_df['bleu']):.4f}")
    print(f"- ROUGE-L:             {safe_mean(metrics_df['rougeL']):.4f}")
    print(f"- Facet F1 (macro):    {metrics_df['facet_f1_macro'].mean():.4f}")
    print(f"- Key-Term Coverage:   {metrics_df['key_terms_covered'].mean():.4f}")
    print(f"- Triple Exact/Soft:   {metrics_df['triple_exact'].mean():.4f} / {metrics_df['triple_soft'].mean():.4f}")
    if 'bertscore_f1' in metrics_df:
        print(f"- BERTScore (F1):      {safe_mean(metrics_df['bertscore_f1']):.4f}")

    # บันทึก CSV
    ts = dt.datetime.now().strftime("%Y%m%d_%H%M%S")
    out_csv = os.path.join(RESULT_DIR, f"metrics_val_{ts}.csv")
    metrics_df.to_csv(out_csv, index=False, encoding="utf-8")
    print(f"\n💾 บันทึกผลลัพธ์ลงไฟล์: {out_csv}")

# ====== 7) Preference Testing (Teacher vs Student) + CACHED CSV ======
if not _has("SEED"):
    SEED = 42

# เผื่อ MAIN_PATH ไม่ได้กำหนดมาก่อน
MAIN_PATH = globals().get("MAIN_PATH", "/content/drive/MyDrive/demo_finetuning/")
RESULT_DIR = globals().get("RESULT_DIR", os.path.join(MAIN_PATH))
os.makedirs(RESULT_DIR, exist_ok=True)

EVAL_DIR = RESULT_DIR
EVAL_CSV = os.path.join(EVAL_DIR, "evaluateTeacherStudent.csv")

# สวิตช์บังคับรันใหม่ (ทับ cache)
FORCE_RERUN_EVAL = False  # True = บังคับรันใหม่แม้มี evaluateTeacherStudent.csv อยู่แล้ว

# ฟังก์ชันเฉลี่ยแบบปลอดภัย
def _safe_mean(series):
    s = pd.Series([x for x in series if pd.notnull(x)])
    return float(s.mean()) if len(s) else float("nan")

# === A) ถ้ามีไฟล์ evaluateTeacherStudent.csv และไม่บังคับรันใหม่ → โชว์และจบ ===
if os.path.exists(EVAL_CSV) and not FORCE_RERUN_EVAL:
    print(f"📂 พบไฟล์ evaluate ก่อนหน้า: {EVAL_CSV}")
    _prev_eval = pd.read_csv(EVAL_CSV)

    if "winner" in _prev_eval.columns:
        _win_rate = (_prev_eval["winner"] == "Student").mean()
    else:
        _win_rate = float("nan")

    print("\n📊 สรุปจาก evaluateTeacherStudent.csv (แคช):")
    print(f"- Student win-rate: {_win_rate:.2%}")
    for nm in ["summary_train_loss_last","summary_eval_loss_last","summary_perplexity_eval",
               "summary_bleu_mean","summary_rougeL_mean","summary_facet_f1_macro_mean",
               "summary_key_terms_covered_mean","summary_triple_exact_mean",
               "summary_triple_soft_mean","summary_bertscore_f1_mean"]:
        if nm in _prev_eval.columns:
            print(f"- {nm}: {_safe_mean(_prev_eval[nm]):.4f}")

    print("\n👀 ตัวอย่างบรรทัดแรก ๆ:")
    print(_prev_eval.head().to_string(index=False))

else:
    # === B) ไม่มีไฟล์ (หรือบังคับรันใหม่) → ทำ evaluate ใหม่ ===
    N_SAMPLE = min(20, len(df_val))
    sample_val = df_val.sample(N_SAMPLE, random_state=SEED).reset_index(drop=True)

    # Student answers
    student_answers = []
    if _has("model") and _has("tokenizer"):
        model.eval()
        device = "cuda" if torch.cuda.is_available() else "cpu"
        def generate_answer(prompt, max_new_tokens=160):
            inputs = tokenizer(prompt, return_tensors="pt").to(device)
            with torch.no_grad():
                outs = model.generate(**inputs, max_new_tokens=max_new_tokens)
            return tokenizer.decode(outs[0], skip_special_tokens=True)
        for _, row in sample_val.iterrows():
            student_answers.append(generate_answer(build_prompt(row), max_new_tokens=160))
    else:
        print("⚠️ ข้าม Student answers: ไม่มี model/tokenizer")
        device = "cuda" if torch.cuda.is_available() else "cpu"  # กันเผื่อใช้ใน LLM Judge
        student_answers = [""] * len(sample_val)

    # Teacher answers (เรียก API; ถ้า error → ใช้ reference ใน df_val)
    teacher_answers = []
    try:
        for _, row in sample_val.iterrows():
            teacher_answers.append(
                client.responses.create(model="gpt-4o-mini", input=build_prompt(row)).output_text
            )
    except Exception as e:
        print(f"⚠️ สร้างคำตอบจาก Teacher ไม่ได้: {e}")
        teacher_answers = list(sample_val["output"])

    # ใช้ rougeL_score ที่นิยามไว้แล้วด้านบน (จะคืน None ถ้าไม่มี rouge_scorer)
    rows_compare = []
    for i, row in sample_val.iterrows():
        ref = str(row["output"])
        stu = student_answers[i] if i < len(student_answers) else ""
        tea = teacher_answers[i]

        r_stu = rougeL_score(ref, stu) if 'rougeL_score' in globals() else None
        r_tea = rougeL_score(ref, tea) if 'rougeL_score' in globals() else None
        r_stu = r_stu if (r_stu is not None) else 0.0
        r_tea = r_tea if (r_tea is not None) else 0.0

        winner = "Student" if r_stu > r_tea else ("Teacher" if r_tea > r_stu else "Tie")
        rows_compare.append({
            "id": int(i),
            "rougeL_student": round(float(r_stu), 4),
            "rougeL_teacher": round(float(r_tea), 4),
            "winner": winner
        })

    tbl = pd.DataFrame(rows_compare)
    win_rate = (tbl["winner"] == "Student").mean()

    print("\n🎯 Preference (ROUGE-L-to-Ref) — up to 20 prompts")
    print(tbl.to_string(index=False))
    print(f"\n🏁 Student win-rate: {win_rate:.2%}  "
          f"(Teacher wins {(tbl['winner']=='Teacher').mean():.2%}, "
          f"Ties {(tbl['winner']=='Tie').mean():.2%})")

    # ตัวอย่าง LLM Judge 1 เคส (บันทึกลง CSV เป็น example)
    teacher_answer_example = None
    student_answer_example = None
    judge_reason_example = None
    try:
        eval_prompt = "Translate the mechanism of action of Augmentin 625 Duo Tablet into English and keep it to one sentence."
        if _has("model") and _has("tokenizer"):
            inputs = tokenizer(eval_prompt, return_tensors="pt").to(device)
            with torch.no_grad():
                out = model.generate(**inputs, max_new_tokens=120)
            student_answer_example = tokenizer.decode(out[0], skip_special_tokens=True)

        teacher_answer_example = client.responses.create(
            model="gpt-4o-mini",
            input=eval_prompt
        ).output_text

        judge_prompt = f"""
        You are a fair evaluator. Compare the following responses.

        PROMPT: {eval_prompt}

        [Teacher Answer]
        {teacher_answer_example}

        [Student Answer]
        {student_answer_example}

        Decide which one is better (Teacher or Student) and explain why. And response me in Thai language
        """
        judge = client.responses.create(model="gpt-4o-mini", input=judge_prompt)
        judge_reason_example = judge.output_text

        print("\n🧠 Teacher Output:\n", teacher_answer_example)
        print("\n🎓 Student Output:\n", student_answer_example)
        print("\n⚖️ LLM Judge:\n", judge_reason_example)
    except Exception as e:
        print(f"⚠️ LLM judge skipped: {e}")

    # ----- รวมสรุป/เมตริกจากไฟล์ metrics ล่าสุด (ถ้ามี prev_df) -----
    bleu_mean = _safe_mean(prev_df['bleu']) if 'prev_df' in globals() and 'bleu' in prev_df else None
    rougeL_mean = _safe_mean(prev_df['rougeL']) if 'prev_df' in globals() and 'rougeL' in prev_df else None
    facet_f1_macro_mean = _safe_mean(prev_df['facet_f1_macro']) if 'prev_df' in globals() and 'facet_f1_macro' in prev_df else None
    key_terms_covered_mean = _safe_mean(prev_df['key_terms_covered']) if 'prev_df' in globals() and 'key_terms_covered' in prev_df else None
    triple_exact_mean = _safe_mean(prev_df['triple_exact']) if 'prev_df' in globals() and 'triple_exact' in prev_df else None
    triple_soft_mean = _safe_mean(prev_df['triple_soft']) if 'prev_df' in globals() and 'triple_soft' in prev_df else None
    bertscore_f1_mean = _safe_mean(prev_df['bertscore_f1']) if 'prev_df' in globals() and 'bertscore_f1' in prev_df else None

    # Perplexity (คำนวณจาก eval_loss ถ้ามี)
    try:
        ppl_eval = math.exp(float(eval_loss)) if (eval_loss == eval_loss) else None
    except Exception:
        ppl_eval = None

    # ----- บันทึก evaluateTeacherStudent.csv -----
    summary_cols = {
        "summary_train_loss_last": globals().get("last_train_loss", None),
        "summary_eval_loss_last": globals().get("last_eval_loss", None),
        "summary_perplexity_eval": ppl_eval,
        "summary_bleu_mean": bleu_mean,
        "summary_rougeL_mean": rougeL_mean,
        "summary_facet_f1_macro_mean": facet_f1_macro_mean,
        "summary_key_terms_covered_mean": key_terms_covered_mean,
        "summary_triple_exact_mean": triple_exact_mean,
        "summary_triple_soft_mean": triple_soft_mean,
        "summary_bertscore_f1_mean": bertscore_f1_mean,
        "summary_student_win_rate": float(win_rate),
        "example_teacher_output": teacher_answer_example,
        "example_student_output": student_answer_example,
        "example_judge_reason": judge_reason_example,
    }

    df_save = tbl.copy()
    for k, v in summary_cols.items():
        df_save[k] = v

    df_save.to_csv(EVAL_CSV, index=False, encoding="utf-8")
    print(f"\n💾 Saved evaluateTeacherStudent.csv → {EVAL_CSV}")

    # Preview
    try:
        print("\n👀 Preview evaluateTeacherStudent.csv:")
        print(df_save.head().to_string(index=False))
    except Exception:
        pass

⚠️ ไม่พบตัวแปร 'trainer' หรือไม่มีเมธอด evaluate — ข้ามการประเมิน eval_loss/perplexity
✅ Saved model (save_pretrained) at /content/drive/MyDrive/demo_finetuning/qwen3_1p7b-instruct-lora-best
✅ Saved tokenizer at /content/drive/MyDrive/demo_finetuning/qwen3_1p7b-instruct-lora-best
คอลัมน์ทั้งหมดที่มีใน df_val: ['input', 'output', 'drug']
✅ พบ 'output' ใน df_val แล้ว ดำเนินการต่อได้
🔁 ไม่พบไฟล์เก่ามาก่อน หรือบังคับให้รันใหม่ — กำลังคำนวณ metrics ...


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


ℹ️ BERTScore ใช้ภาษา: en | rescale_with_baseline=False

📊 Automatic Metrics on Validation:
- Eval loss:           nan | Perplexity: nan
- BLEU:                0.0097
- ROUGE-L:             0.1060
- Facet F1 (macro):    0.1861
- Key-Term Coverage:   0.8500
- Triple Exact/Soft:   0.0500 / 0.0500
- BERTScore (F1):      0.8381

💾 บันทึกผลลัพธ์ลงไฟล์: /content/drive/MyDrive/demo_finetuning/metrics_val_20251023_221443.csv

🎯 Preference (ROUGE-L-to-Ref) — up to 20 prompts
 id  rougeL_student  rougeL_teacher  winner
  0          0.1231          0.4211 Teacher
  1          0.1119          0.2000 Teacher
  2          0.1290          0.2222 Teacher
  3          0.1296          0.2222 Teacher
  4          0.0606          0.3226 Teacher
  5          0.0602          0.1951 Teacher
  6          0.1395          0.2162 Teacher
  7          0.1000          0.2941 Teacher
  8          0.0556          0.2791 Teacher
  9          0.1061          0.4091 Teacher
 10          0.1008          0.4865 Teacher
 1




🧠 Teacher Output:
 Augmentin 625 Duo Tablet works by combining amoxicillin, a penicillin-type antibiotic that inhibits bacterial cell wall synthesis, with clavulanic acid, which protects amoxicillin from being broken down by beta-lactamase enzymes produced by certain bacteria, thereby enhancing the antibiotic's effectiveness against resistant strains.

🎓 Student Output:
 Translate the mechanism of action of Augmentin 625 Duo Tablet into English and keep it to one sentence. Augmentin 625 Duo Tablet is an antibiotic that combines Augmentin 625 and Duo Tablet. The mechanism of action of Augmentin 625 Duo Tablet is the combination of Augmentin 625 (a third-generation cephalosporin) and Duo Tablet (a macrolide). 

Augmentin 625 Duo Tablet is an antibiotic that combines Augmentin 625 (a third-generation cephalosporin) and Duo Tablet (a macrolide), enhancing its efficacy against a wide range of bacterial infections

⚖️ LLM Judge:
 คำตอบที่ดีกว่าคือคำตอบของครู เนื่องจากคำตอบของครูสั้น กระชับ 