In [5]:
import os

# Restrict this process to GPU index 1. This is done in the first cell, before
# torch / unsloth are imported by the cells that follow.
os.environ.update(CUDA_VISIBLE_DEVICES="1")

In [11]:
from unsloth import FastLanguageModel
import torch

base_model_id = "unsloth/llama-3.2-1b-instruct-bnb-4bit"
dpo_adapter_path_original = os.path.abspath("./final_dpo_adapter_original")
dpo_adapter_path_mitigated = os.path.abspath("./final_dpo_adapter_mitigated")

# Load the base model.
# Both copies come from the same checkpoint with the same options, so the
# options are declared once and reused for each load.
_base_load_options = dict(
    model_name=base_model_id,
    max_seq_length=2048,
    dtype=None,
    load_in_4bit=True,
)

# First copy: the returned tokenizer is kept and shared by later cells.
model_original, tokenizer = FastLanguageModel.from_pretrained(**_base_load_options)

# Second, independent copy: its tokenizer is discarded.
model_mitigated, _ = FastLanguageModel.from_pretrained(**_base_load_options)

==((====))==  Unsloth 2025.6.5: Fast Llama patching. Transformers: 4.52.4.
   \\   /|    NVIDIA GeForce RTX 4090. Num GPUs = 1. Max memory: 23.433 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.7.0+cu126. CUDA: 8.9. CUDA Toolkit: 12.6. Triton: 3.3.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.30. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
==((====))==  Unsloth 2025.6.5: Fast Llama patching. Transformers: 4.52.4.
   \\   /|    NVIDIA GeForce RTX 4090. Num GPUs = 1. Max memory: 23.433 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.7.0+cu126. CUDA: 8.9. CUDA Toolkit: 12.6. Triton: 3.3.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.30. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [12]:
# Load the DPO adapters: each base-model copy gets its own adapter directory.
print("DPO 어댑터를 로드")
for _model, _adapter_dir in (
    (model_original, dpo_adapter_path_original),
    (model_mitigated, dpo_adapter_path_mitigated),
):
    _model.load_adapter(_adapter_dir)
print("DPO 어댑터 로딩 완료.")

DPO 어댑터를 로드
DPO 어댑터 로딩 완료.


In [17]:
# Inference smoke test: feed one prompt to both adapted models and print the
# decoded sequences one after the other.
prompt = "what is carbon dioxide? Response: "
inputs = tokenizer(prompt, return_tensors="pt").to("cuda")

# Identical generation settings for both models so the outputs are comparable.
_generation_options = {"max_new_tokens": 100, "use_cache": True}

outputs_original = model_original.generate(**inputs, **_generation_options)
decoded_output_original = tokenizer.batch_decode(
    outputs_original,
    skip_special_tokens=True,
)[0]

outputs_mitigated = model_mitigated.generate(**inputs, **_generation_options)
decoded_output_mitigated = tokenizer.batch_decode(
    outputs_mitigated,
    skip_special_tokens=True,
)[0]

for _label, _decoded in (
    ("original", decoded_output_original),
    ("mitigated", decoded_output_mitigated),
):
    print(f"===== 추론 결과({_label}) =====")
    print(_decoded)

===== 추론 결과(original) =====
what is carbon dioxide? Response: 2
## Step 1: Definition of Carbon Dioxide
Carbon dioxide (CO2) is a colorless, odorless, and tasteless gas that is a major component of the Earth's atmosphere. It is a simple, inorganic compound that is produced by the decomposition of carbon-based materials, such as wood and fossil fuels.

## Step 2: Sources of Carbon Dioxide
The primary sources of carbon dioxide are fossil fuel combustion, industrial processes, and natural gas seepage.
===== 추론 결과(mitigated) =====
what is carbon dioxide? Response: 1
A. a colorless, odorless gas
B. a colorless, odorless gas
C. a colorless, odorless liquid
D. a colorless, odorless solid
Answer: B
Explanation: Carbon dioxide is a colorless, odorless gas that is composed of carbon and oxygen atoms. It is the second most abundant gas in the Earth's atmosphere and is a major component of the atmosphere. It is also a key player in the carbon cycle, playing


--------
win rate 측정
--------

In [None]:
import os

# Read the Gemini API key from the GOOGLE_API_KEY environment variable so the
# real secret never has to be typed into (and saved with) the notebook.
# The "YOUR_KEY" placeholder remains the fallback, so the value is unchanged
# when the environment variable is not set.
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY", "YOUR_KEY")

In [9]:
import os
import random
import time
import json
import torch
from tqdm import tqdm
from datasets import load_dataset
import google.generativeai as genai
from unsloth import FastLanguageModel

In [19]:
# ==============================================================================
# 1. Configuration (User modification needed)
# ==============================================================================
# *** CRITICAL CHANGE ***
# Use the newly created SFT-merged model as the base model.
# All three paths are resolved against the current working directory.
BASE_MODEL_ID = os.path.abspath("./sft_merged_model") # path produced in step 1
# SFT_ADAPTER_PATH is no longer needed here.
# DPO adapters compared against each other: A = "original", B = "mitigated".
DPO_ADAPTER_PATH_A = os.path.abspath("./final_dpo_adapter_original")
DPO_ADAPTER_PATH_B = os.path.abspath("./final_dpo_adapter_mitigated")

# Gemini model name handed to genai.GenerativeModel by the judge function.
JUDGE_MODEL = "gemini-2.0-flash-lite"
# Number of prompts requested from load_and_prepare_dataset for the evaluation.
NUM_EVAL_PROMPTS = 50

In [15]:
# ==============================================================================
# 2. Load and Prepare Evaluation Dataset
# ==============================================================================
def load_and_prepare_dataset(num_samples, seed=None):
    """Load the MT-Bench prompts and return first-turn prompts for evaluation.

    Each dataset row's 'prompt' column is a list of turns; only the first turn
    is used, wrapped as "Human: <turn>\\n\\nAssistant:".

    Args:
        num_samples: Maximum number of prompts to return. When the dataset has
            more prompts than this, a random subset of exactly this size is
            drawn; otherwise every prompt is returned in dataset order.
        seed: Optional seed for the subset draw. When given, the same subset is
            returned on every call, which makes an evaluation run repeatable.
            When None (the default) the module-level ``random`` state is used,
            exactly as before.

    Returns:
        A list of formatted prompt strings.
    """
    print("Loading HuggingFaceH4/mt_bench_prompts dataset...")

    dataset_name = "HuggingFaceH4/mt_bench_prompts"
    dataset = load_dataset(dataset_name)['train']

    # The 'prompt' column holds a list of strings; index 0 is the first turn.
    prompts = [
        f"Human: {item['prompt'][0]}\n\nAssistant:"
        for item in dataset
    ]

    # Nothing to sample when the dataset is not larger than the request.
    if len(prompts) <= num_samples:
        return prompts

    print(f"Randomly sampling {num_samples} prompts from the dataset.")
    # A dedicated Random instance keeps a seeded draw from disturbing (or being
    # disturbed by) other users of the global random state.
    sampler = random if seed is None else random.Random(seed)
    return sampler.sample(prompts, num_samples)

In [16]:
# ==============================================================================
# 3. Model Loading and Response Generation Functions
# ==============================================================================
def load_model_with_dpo_adapter(dpo_path):
    """Load the SFT-merged base model and attach one DPO adapter to it.

    Args:
        dpo_path: Directory of the DPO adapter to load onto the base model.

    Returns:
        A ``(model, tokenizer)`` tuple as returned by
        ``FastLanguageModel.from_pretrained``, with the adapter loaded on the
        model.
    """
    print(f"Loading SFT-merged base model from: {BASE_MODEL_ID}")
    load_options = {
        "model_name": BASE_MODEL_ID,  # the merged checkpoint, not the raw base
        "max_seq_length": 2048,
        "dtype": None,
        "load_in_4bit": True,
    }
    model, tokenizer = FastLanguageModel.from_pretrained(**load_options)

    adapter_name = os.path.basename(dpo_path)
    print(f"Loading DPO adapter: {adapter_name}")
    # The SFT weights are already merged in, so only the DPO adapter is added.
    model.load_adapter(dpo_path)

    return model, tokenizer

def generate_responses(prompts, model, tokenizer):
    """Generate one completion per prompt and return only the newly generated text.

    Args:
        prompts: Iterable of prompt strings.
        model: Model exposing ``generate(**inputs, ...)``.
        tokenizer: Tokenizer used both to encode the prompt and to decode the
            generated ids.

    Returns:
        A list with one whitespace-stripped completion per prompt, in order.
        The prompt itself (including its trailing "Assistant:" tag) is not part
        of the returned text.
    """
    # NOTE(review): temperature / top_p presumably only take effect when
    # sampling is enabled in the model's generation config - confirm.
    responses = []
    for prompt in tqdm(prompts, desc="Generating responses"):
        inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
        outputs = model.generate(**inputs, max_new_tokens=1024, use_cache=True, temperature=0.6, top_p=0.9)

        # The generated sequence starts with the prompt tokens. Cut them off at
        # the token level instead of by character count of the decoded text:
        # a character offset breaks when decoding does not reproduce the prompt
        # exactly or when the user text itself contains "Assistant:".
        prompt_token_count = inputs["input_ids"].shape[1]
        completion_ids = outputs[0][prompt_token_count:]
        completion = tokenizer.decode(completion_ids, skip_special_tokens=True)
        responses.append(completion.strip())
    return responses

In [3]:
# ==============================================================================
# 4. LLM-as-a-Judge (Gemini) Functions
# ==============================================================================
# Prompt sent to the judge model. It is filled with str.format using the
# {prompt}, {response_a} and {response_b} placeholders, and it asks the judge
# to end with one of the verdict tags [[A]], [[B]] or [[TIE]].
JUDGE_PROMPT_TEMPLATE = """
You are an impartial evaluator assessing the responses of two AI assistants.
Evaluate which of the two responses is more helpful, accurate, and better aligned with the user's intent.
Don't be biased toward length.

Consider the following criteria:
1.  **Instruction Following**: Does the response accurately follow the user's instructions?
2.  **Accuracy**: Is the information provided factually correct?
3.  **Completeness**: Does the response provide sufficient information for the user's query?
4.  **Conciseness & Fluency**: Is the response well-written, concise, and easy to understand?

[User Prompt]
{prompt}

[Assistant A's Response]
{response_a}

[Assistant B's Response]
{response_b}

[Evaluation]
Compare the two responses and briefly explain which one is better and why.
After your explanation, state your final verdict on the last line in one of the following formats:
- If Assistant A is better: [[A]]
- If Assistant B is better: [[B]]
- If the two responses are of similar quality or it's hard to decide: [[TIE]]
"""

# Verdict tags the judge may emit, paired with the label each one stands for.
_VERDICT_TAGS = (("[[A]]", "A"), ("[[B]]", "B"), ("[[TIE]]", "TIE"))
# Number of attempts made against the judge API before giving up.
_JUDGE_MAX_ATTEMPTS = 3


def _extract_final_verdict(content):
    """Return the label ("A", "B" or "TIE") of the verdict tag that occurs last in content, or None."""
    best_position, best_label = -1, None
    for tag, label in _VERDICT_TAGS:
        position = content.rfind(tag)
        if position > best_position:
            best_position, best_label = position, label
    return best_label


def judge_battle_with_gemini(prompt, response_a, response_b):
    """Ask the Gemini judge which of two responses is better.

    The two responses are shown to the judge in random order, and the verdict
    is mapped back afterwards so the returned label always refers to the
    caller's ``response_a`` / ``response_b``.

    Args:
        prompt: Evaluation prompt; the "Human: " prefix and the
            "\\n\\nAssistant:" suffix are removed before it is shown to the judge.
        response_a: Response of model A.
        response_b: Response of model B.

    Returns:
        A ``(winner, judge_text)`` tuple. ``winner`` is "A", "B", "TIE",
        "BAD_RESPONSE" (the reply contained no verdict tag) or "API_ERROR"
        (every attempt raised; ``judge_text`` is then "").
    """
    genai.configure(api_key=GOOGLE_API_KEY)
    model = genai.GenerativeModel(JUDGE_MODEL)

    # Randomize presentation order to counter position bias in the judge.
    is_swapped = random.choice([True, False])
    if is_swapped:
        res_a, res_b = response_b, response_a
    else:
        res_a, res_b = response_a, response_b

    pure_prompt = prompt.replace("Human: ", "").replace("\n\nAssistant:", "")
    judge_prompt = JUDGE_PROMPT_TEMPLATE.format(prompt=pure_prompt, response_a=res_a, response_b=res_b)

    for attempt in range(_JUDGE_MAX_ATTEMPTS):
        try:
            response = model.generate_content(judge_prompt)
            content = response.text

            # The judge explains first and gives its verdict at the end, so the
            # LAST tag in the reply is the verdict. Taking the first match would
            # misread a reply whose explanation mentions another tag.
            verdict = _extract_final_verdict(content)
            if verdict == "A":
                return "A" if not is_swapped else "B", content
            if verdict == "B":
                return "B" if not is_swapped else "A", content
            if verdict == "TIE":
                return "TIE", content

            print("Warning: Judge model did not return a clear winner.")
            return "BAD_RESPONSE", content
        except Exception as e:
            # Best effort: any failure of the API call is reported and retried.
            print(f"An error occurred while calling the Gemini API: {e}")
            # Back off before the next attempt; no point waiting after the last.
            if attempt < _JUDGE_MAX_ATTEMPTS - 1:
                time.sleep(5)
    return "API_ERROR", ""

In [20]:
# ==============================================================================
# 5. Main Execution Logic
# ==============================================================================

if __name__ == '__main__':
    # Pairwise evaluation: generate responses with both DPO-adapted models on
    # the same prompts, let the Gemini judge pick a winner per prompt, print
    # the win rates and write the full battle log to a JSON file.
    import gc  # local import: only needed for the GPU clean-up below

    evaluation_prompts = load_and_prepare_dataset(NUM_EVAL_PROMPTS)

    print("\nLoading Model A (SFT_merged + DPO_A)...")
    model_a, tokenizer_a = load_model_with_dpo_adapter(DPO_ADAPTER_PATH_A)
    print("\nGenerating responses from Model A...")
    responses_a = generate_responses(evaluation_prompts, model_a, tokenizer_a)

    # Release Model A before Model B is loaded so that only one model has to
    # fit on the GPU at any time; only the generated texts are needed later.
    del model_a, tokenizer_a
    gc.collect()
    torch.cuda.empty_cache()

    print("\nLoading Model B (SFT_merged + DPO_B)...")
    model_b, tokenizer_b = load_model_with_dpo_adapter(DPO_ADAPTER_PATH_B)
    print("\nGenerating responses from Model B...")
    responses_b = generate_responses(evaluation_prompts, model_b, tokenizer_b)

    # Clean up memory before starting the evaluation loop
    del model_b, tokenizer_b
    gc.collect()
    torch.cuda.empty_cache()

    print(f"\nStarting evaluation with {JUDGE_MODEL}...")
    results = {"A_wins": 0, "B_wins": 0, "ties": 0, "errors": 0}
    battle_log = []

    for i in tqdm(range(len(evaluation_prompts)), desc="Judging battles"):
        winner, judge_reason = judge_battle_with_gemini(evaluation_prompts[i], responses_a[i], responses_b[i])

        if winner == "A":
            results["A_wins"] += 1
        elif winner == "B":
            results["B_wins"] += 1
        elif winner == "TIE":
            results["ties"] += 1
        else:
            # Anything else ("BAD_RESPONSE", "API_ERROR") counts as an error.
            results["errors"] += 1

        battle_log.append({
            "prompt": evaluation_prompts[i],
            "response_A": responses_a[i],
            "response_B": responses_b[i],
            "winner": winner,
            "judge_reason": judge_reason,
        })

    # Print final results
    print("\n" + "="*50)
    print("          PERFORMANCE EVALUATION RESULTS")
    print("="*50)
    print(f"Model A (Adapter: {os.path.basename(DPO_ADAPTER_PATH_A)})")
    print(f"Model B (Adapter: {os.path.basename(DPO_ADAPTER_PATH_B)})")
    print(f"Judge Model: {JUDGE_MODEL}")
    print(f"Dataset: HuggingFaceH4/mt_bench_prompts ({len(evaluation_prompts)} prompts)")
    print("-"*50)

    # Rates are computed over successfully judged battles only; errors are
    # reported separately below.
    total_judged = results["A_wins"] + results["B_wins"] + results["ties"]
    if total_judged > 0:
        win_rate_a = (results["A_wins"] / total_judged) * 100
        win_rate_b = (results["B_wins"] / total_judged) * 100
        tie_rate = (results["ties"] / total_judged) * 100

        print(f"Model A Win Rate: {win_rate_a:.2f}% ({results['A_wins']} wins)")
        print(f"Model B Win Rate: {win_rate_b:.2f}% ({results['B_wins']} wins)")
        print(f"Tie Rate: {tie_rate:.2f}% ({results['ties']} ties)")
    else:
        # Make the "nothing could be judged" case visible instead of printing
        # an empty results table.
        print("No battles were judged successfully; win rates are unavailable.")

    if results["errors"] > 0:
        print(f"\nErrors (API or Bad Response): {results['errors']}")
    print("="*50)

    # Save detailed log to a file
    print("\nSaving detailed battle log to 'battle_log_gemini_en.json'...")
    with open("battle_log_gemini_en.json", "w", encoding="utf-8") as f:
        json.dump(battle_log, f, ensure_ascii=False, indent=4)
    print("Evaluation complete. Check 'battle_log_gemini_en.json' for detailed results.")

Loading HuggingFaceH4/mt_bench_prompts dataset...
Randomly sampling 50 prompts from the dataset.

Loading Model A (SFT_merged + DPO_A)...
Loading SFT-merged base model from: /nfs/home/ryan0507/checkllama/final/sft_merged_model
==((====))==  Unsloth 2025.6.5: Fast Llama patching. Transformers: 4.52.4.
   \\   /|    NVIDIA GeForce RTX 4090. Num GPUs = 10. Max memory: 23.433 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.7.0+cu126. CUDA: 8.9. CUDA Toolkit: 12.6. Triton: 3.3.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.30. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Loading DPO adapter: final_dpo_adapter_original

Generating responses from Model A...


Generating responses: 100%|██████████| 50/50 [03:12<00:00,  3.85s/it]



Loading Model B (SFT_merged + DPO_B)...
Loading SFT-merged base model from: /nfs/home/ryan0507/checkllama/final/sft_merged_model
==((====))==  Unsloth 2025.6.5: Fast Llama patching. Transformers: 4.52.4.
   \\   /|    NVIDIA GeForce RTX 4090. Num GPUs = 10. Max memory: 23.433 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.7.0+cu126. CUDA: 8.9. CUDA Toolkit: 12.6. Triton: 3.3.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.30. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Loading DPO adapter: final_dpo_adapter_mitigated

Generating responses from Model B...


Generating responses: 100%|██████████| 50/50 [02:51<00:00,  3.44s/it]



Starting evaluation with gemini-2.0-flash-lite...


Judging battles: 100%|██████████| 50/50 [01:39<00:00,  2.00s/it]


          PERFORMANCE EVALUATION RESULTS
Model A (Adapter: final_dpo_adapter_original)
Model B (Adapter: final_dpo_adapter_mitigated)
Judge Model: gemini-2.0-flash-lite
Dataset: HuggingFaceH4/mt_bench_prompts (50 prompts)
--------------------------------------------------
Model A Win Rate: 56.00% (28 wins)
Model B Win Rate: 40.00% (20 wins)
Tie Rate: 4.00% (2 ties)

Saving detailed battle log to 'battle_log_gemini_en.json'...
Evaluation complete. Check 'battle_log_gemini_en.json' for detailed results.





# Test