### 모델 출력 테스트

In [None]:
# Mount Google Drive at /content/drive (the checkpoint and dataset paths used
# later in this notebook live under that mount point).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Install the required libraries. The leading `!` is notebook shell syntax;
# -q keeps pip's output quiet. Versions are not pinned here.
!pip install -q transformers torch accelerate

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Confirm the imports succeeded and report whether torch can see a CUDA device.
print("라이브러리 임포트 완료!")
print(f"CUDA 사용 가능: {torch.cuda.is_available()}")

라이브러리 임포트 완료!
CUDA 사용 가능: True


In [None]:
# Path of the checkpoint to evaluate
model_path = "/content/drive/MyDrive/Colab Notebooks/woke-odds/checkpoint-best"

# Load the tokenizer
print("토크나이저 로딩 중...")
tokenizer = AutoTokenizer.from_pretrained(model_path)

# The generate() calls below pass pad_token_id explicitly, so make sure one
# exists: fall back to the EOS token when the checkpoint defines no pad token.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

print("✓ 토크나이저 로드 완료!")

# Load the model: bfloat16 with automatic device placement when CUDA is
# available, otherwise float32 with default placement.
print("\n모델 로딩 중...")
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    # `torch_dtype` is deprecated in the installed transformers release (the
    # loader warns "Use `dtype` instead"), so pass the new keyword.
    dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
    device_map="auto" if torch.cuda.is_available() else None,
)

# Switch to evaluation mode
model.eval()

print("✓ 모델 로드 완료!")
print(f"Device: {model.device}")

토크나이저 로딩 중...
✓ 토크나이저 로드 완료!

모델 로딩 중...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json: 0.00B [00:00, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.90G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.77G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

✓ 모델 로드 완료!
Device: cuda:0


In [None]:
# System prompt (original from the dataset). It asks for exactly one clarifying
# question per ambiguous query, or the literal <NO_CLARIFYING_QUESTION> token
# for a clear one. The category/subclass codes listed in it match the
# [CATEGORY|SUBCLASS] prefix used on the user queries in this notebook.
# This text is sent to the model verbatim, so do not reformat it.
SYSTEM_PROMPT = """You are an AI that generates a single, concise clarifying question when a user's query is ambiguous.

Task:
Generate exactly one clarifying question based on the ambiguity type.
If the query is clear and needs no clarification, output: <NO_CLARIFYING_QUESTION>

Output format: One clarifying question (or <NO_CLARIFYING_QUESTION> if not needed)

Categories:
- EM (Epistemic Misalignment): Questions with unfamiliar entities or self-contradictions
- LA (Linguistic Ambiguity): Questions with lexical or semantic ambiguity
- AO (Aleatoric Output): Questions with missing contextual information causing confusion
- NONE: Clear questions that don't require clarification

Subclasses:
For EM:
- UNF (UNFAMILIAR): Query contains unfamiliar entities or facts
- CONT (CONTRADICTION): Query contains self-contradictions

For LA:
- LEX (LEXICAL): Query contains terms with multiple meanings
- SEM (SEMANTIC): Query lacks context leading to multiple interpretations

For AO:
- WHOM: Query output contains confusion due to missing personal elements
- WHEN: Query output contains confusion due to missing temporal elements
- WHERE: Query output contains confusion due to missing spatial elements
- WHAT: Query output contains confusion due to missing task-specific elements

For Clear Questions:
- NONE: Use when require_clarification=0, output <NO_CLARIFYING_QUESTION>"""

print("시스템 프롬프트 정의 완료!")

시스템 프롬프트 정의 완료!


In [None]:
# Three hand-picked test samples in chat "messages" form. Each entry pairs a
# tagged user query with the reference clarifying question; the shared system
# prompt is attached to every sample.
test_samples = [
    {
        "messages": [
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": tagged_query},
        ],
        "ground_truth": reference,
    }
    for tagged_query, reference in (
        (
            "[LA|LEX] What is the most common type of type used in printing?",
            "Are you referring to the most common type of printed characters used in printing, or the most common type of person involved in the printing industry?",
        ),
        (
            "[AO|WHEN] Number of oil refineries in the united states?",
            "Which one: 2019, or January 2015?",
        ),
        (
            "[NONE|NONE] What are the names of the actors who acted in the movie Padayappa?",
            "<NO_CLARIFYING_QUESTION>",
        ),
    )
]

print(f"테스트 샘플 {len(test_samples)}개 준비 완료!")

테스트 샘플 3개 준비 완료!


In [None]:
print("=== 모델 출력 테스트 (3개 샘플) ===\n")

for sample_no, sample in enumerate(test_samples, start=1):
    messages = sample['messages']
    ground_truth = sample['ground_truth']

    # Feed only the non-assistant turns (system + user) to the model.
    input_messages = [turn for turn in messages if turn['role'] != 'assistant']

    # The first user turn is kept purely for display.
    user_turns = [turn['content'] for turn in messages if turn['role'] == 'user']
    user_query = user_turns[0]

    # Render the chat template to text, then tokenize it.
    # NOTE(review): tokenizing an already-templated string uses the tokenizer's
    # default special-token handling; confirm this matches how the checkpoint
    # was trained.
    prompt = tokenizer.apply_chat_template(
        input_messages, tokenize=False, add_generation_prompt=True
    )
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    prompt_length = inputs['input_ids'].shape[1]

    # Sampled generation without gradient tracking.
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=150,
            temperature=0.7,
            top_p=0.9,
            do_sample=True,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )

    # Decode only the tokens that follow the prompt.
    new_tokens = outputs[0][prompt_length:]
    generated_text = tokenizer.decode(new_tokens, skip_special_tokens=True)

    # Report query, reference and model output side by side.
    print(f"[샘플 {sample_no}]")
    print(f"User Query: {user_query}")
    print(f"\nGround Truth: {ground_truth}")
    print(f"\nModel Output: {generated_text.strip()}")
    print("\n" + "=" * 80 + "\n")

print("테스트 완료!")

=== 모델 출력 테스트 (3개 샘플) ===

[샘플 1]
User Query: [LA|LEX] What is the most common type of type used in printing?

Ground Truth: Are you referring to the most common type of printed characters used in printing, or the most common type of person involved in the printing industry?

Model Output: Are you asking about the typeface used in printing, or the type of physical material used in printing?


[샘플 2]
User Query: [AO|WHEN] Number of oil refineries in the united states?

Ground Truth: Which one: 2019, or January 2015?

Model Output: Which period?


[샘플 3]
User Query: [NONE|NONE] What are the names of the actors who acted in the movie Padayappa?

Ground Truth: <NO_CLARIFYING_QUESTION>

Model Output: <NO_CLARIFYING_QUESTION>


테스트 완료!


### Temperature 달리해서 응답 5개씩 생성해보기

In [None]:
def generate_multiple_candidates(messages, model, tokenizer, num_candidates=5, temperatures=(0.7, 0.9, 1.1, 1.3, 1.5)):
    """
    Generate several candidate responses for one input, one per temperature.

    Args:
        messages: List of chat message dicts with 'role' and 'content' keys.
            Assistant turns are dropped; the remaining turns form the prompt.
        model: Model used for generation (needs `.device` and `.generate`).
        tokenizer: Tokenizer providing the chat template, encoding and decoding.
        num_candidates: Maximum number of generations to run (default 5).
        temperatures: Sequence of sampling temperatures, tried in order. Only
            the first `num_candidates` entries are used, so fewer candidates
            are produced when the sequence is shorter than `num_candidates`.

    Returns:
        List of dicts {'text': str, 'temperature': float} in generation order,
        with exact-duplicate texts removed (first occurrence kept).
    """
    # One generation happens per temperature actually used, so progress is
    # reported against that count rather than the requested num_candidates.
    schedule = list(temperatures)[:num_candidates]
    total = len(schedule)

    # Use only the non-assistant turns (system + user) as the prompt.
    input_messages = [msg for msg in messages if msg['role'] != 'assistant']

    # Render the chat template to text, then tokenize it.
    # NOTE(review): tokenizing an already-templated string uses the tokenizer's
    # default special-token handling; confirm this matches training.
    prompt = tokenizer.apply_chat_template(
        input_messages,
        tokenize=False,
        add_generation_prompt=True
    )
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    prompt_length = inputs['input_ids'].shape[1]

    print(f"다양한 temperature로 {total}개 후보 생성 중...")

    candidates = []
    for i, temp in enumerate(schedule, start=1):
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=150,
                temperature=temp,
                top_p=0.95,  # high top_p as well, for diversity
                do_sample=True,
                pad_token_id=tokenizer.pad_token_id,
                eos_token_id=tokenizer.eos_token_id
            )

        # Decode only the newly generated tokens (skip the prompt).
        generated_text = tokenizer.decode(
            outputs[0][prompt_length:],
            skip_special_tokens=True
        ).strip()

        candidates.append({
            'text': generated_text,
            'temperature': temp
        })

        # Only mark the preview as truncated when it really was.
        if len(generated_text) > 80:
            preview = generated_text[:80] + "..."
        else:
            preview = generated_text
        print(f"  [{i}/{total}] T={temp:.1f}: {preview}")

    # Drop exact duplicates, keeping the first occurrence of each text
    # (dicts preserve insertion order).
    first_by_text = {}
    for cand in candidates:
        first_by_text.setdefault(cand['text'], cand)
    unique_candidates = list(first_by_text.values())

    print(f"✓ 총 {len(unique_candidates)}개 고유 후보 생성 완료 (중복 {len(candidates) - len(unique_candidates)}개 제거)\n")

    return unique_candidates

print("후보 생성 함수 정의 완료!")

후보 생성 함수 정의 완료!


In [None]:
# Try candidate generation on the first test sample only.
print("=" * 80)
print("[샘플 1] 후보 생성 테스트")
print("=" * 80 + "\n")

sample = test_samples[0]
messages = sample['messages']
user_turns = [turn['content'] for turn in messages if turn['role'] == 'user']
user_query = user_turns[0]
ground_truth = sample['ground_truth']

print(f"User Query: {user_query}\n")
print(f"Ground Truth: {ground_truth}\n")
print("-" * 80 + "\n")

# Five candidates, one per default temperature (0.7, 0.9, 1.1, 1.3, 1.5);
# exact duplicates are removed by the helper.
candidates_sample1 = generate_multiple_candidates(
    messages=messages, model=model, tokenizer=tokenizer, num_candidates=5
)

print("\n" + "=" * 80)
print("생성된 후보 목록:")
print("=" * 80 + "\n")

for rank, cand in enumerate(candidates_sample1, start=1):
    print(f"[후보 {rank}] (T={cand['temperature']:.1f})")
    print(cand['text'])
    print()

[샘플 1] 후보 생성 테스트

User Query: [LA|LEX] What is the most common type of type used in printing?

Ground Truth: Are you referring to the most common type of printed characters used in printing, or the most common type of person involved in the printing industry?

--------------------------------------------------------------------------------

다양한 temperature로 5개 후보 생성 중...
  [1/5] T=0.7: Are you asking about the most common type used in printing in terms of definitio...
  [2/5] T=0.9: Are you asking about the type of printing or the printing process itself?...
  [3/5] T=1.1: What type of word can be used for printing?...
  [4/5] T=1.3: Can I supply: I expect multiple type of type or not?...
  [5/5] T=1.5: Are you asking about words with multiple categories (like "Type") and words for ...
✓ 총 5개 고유 후보 생성 완료 (중복 0개 제거)


생성된 후보 목록:

[후보 1] (T=0.7)
Are you asking about the most common type used in printing in terms of definition (lexical) or in terms of semantic classification?

[후보 2] (T=0

In [None]:
# Generate candidates for every test sample and collect them with metadata.
all_candidates = []

print("\n" + "=" * 80)
print("3개 샘플 전체에 대한 후보 생성 시작")
print("=" * 80 + "\n")

for idx, sample in enumerate(test_samples, 1):
    messages = sample['messages']
    user_turns = [turn['content'] for turn in messages if turn['role'] == 'user']
    user_query = user_turns[0]

    print("\n" + "=" * 80)
    print(f"[샘플 {idx}] {user_query}")
    print("=" * 80 + "\n")

    # Five candidates per sample (before de-duplication inside the helper).
    candidates = generate_multiple_candidates(
        messages=messages, model=model, tokenizer=tokenizer, num_candidates=5
    )

    # The category tag is everything up to and including the first ']'.
    tag_body = user_query.partition(']')[0]
    record = {
        'sample_idx': idx,
        'user_query': user_query,
        'ground_truth': sample['ground_truth'],
        'category': tag_body + ']',
        'candidates': candidates,
    }
    all_candidates.append(record)

print("\n" + "=" * 80)
print(f"✓ 전체 후보 생성 완료! (총 {len(all_candidates)}개 샘플)")
print("=" * 80)


3개 샘플 전체에 대한 후보 생성 시작


[샘플 1] [LA|LEX] What is the most common type of type used in printing?

다양한 temperature로 5개 후보 생성 중...
  [1/5] T=0.7: Are you asking about the type of font used in printing, or the printing process ...
  [2/5] T=0.9: Are you referring to the difference between "most common type used in printing" ...
  [3/5] T=1.1: Is there any specific category (UNF, CONT, WHOM, WHEN, WHERE, WHAT) mentioned in...
  [4/5] T=1.3: Are You an AI or a Human? You output the clarify question if it requires the dir...
  [5/5] T=1.5: What category of information is missing?...
✓ 총 5개 고유 후보 생성 완료 (중복 0개 제거)


[샘플 2] [AO|WHEN] Number of oil refineries in the united states?

다양한 temperature로 5개 후보 생성 중...
  [1/5] T=0.7: Which time period: past, future, or present?...
  [2/5] T=0.9: Which time period?...
  [3/5] T=1.1: Can you clarify whether you're referring to the United States specifically or an...
  [4/5] T=1.3: Which one's context or missing context: number of provinces, cities, states

In [None]:
# Dump every collected candidate, grouped by sample.
for sample_data in all_candidates:
    print("\n" + "=" * 80)
    print(f"[샘플 {sample_data['sample_idx']}]")
    print("=" * 80)
    print(f"Query: {sample_data['user_query']}")
    print(f"Category: {sample_data['category']}")
    print(f"Ground Truth: {sample_data['ground_truth']}")

    candidate_list = sample_data['candidates']
    print(f"\n생성된 후보 {len(candidate_list)}개:")
    print("-" * 80)

    for rank, cand in enumerate(candidate_list, start=1):
        print(f"\n[후보 {rank}] (Temperature={cand['temperature']:.1f})")
        print(cand['text'])

    print("\n")


[샘플 1]
Query: [LA|LEX] What is the most common type of type used in printing?
Category: [LA|LEX]
Ground Truth: Are you referring to the most common type of printed characters used in printing, or the most common type of person involved in the printing industry?

생성된 후보 5개:
--------------------------------------------------------------------------------

[후보 1] (Temperature=0.7)
Are you asking about the type of font used in printing, or the printing process itself?

[후보 2] (Temperature=0.9)
Are you referring to the difference between "most common type used in printing" and "most common type used in printing as per the academic definition"?

[후보 3] (Temperature=1.1)
Is there any specific category (UNF, CONT, WHOM, WHEN, WHERE, WHAT) mentioned in the question?

[후보 4] (Temperature=1.3)
Are You an AI or a Human? You output the clarify question if it requires the direct query: Are you an AI?

[후보 5] (Temperature=1.5)
What category of information is missing?



[샘플 2]
Query: [AO|WHEN] Numbe

### 데이터셋 불러오기

In [None]:
import json
import pandas as pd

In [None]:
# Location of the benchmark JSONL file on the mounted Drive.
file_path = '/content/drive/MyDrive/Colab Notebooks/woke-odds/clamber_benchmark.jsonl'

In [None]:
# Read the benchmark line by line. A line may hold either a JSON object or a
# JSON string that itself contains the serialized object (double-encoded), so
# string results are decoded a second time.
data = []
with open(file_path, 'r', encoding='utf-8') as f:
    for line_num, raw_line in enumerate(f, 1):
        line = raw_line.strip()
        if not line:
            # Skip blank lines.
            continue

        try:
            item = json.loads(line)
            if isinstance(item, str):
                item = json.loads(item)
        except json.JSONDecodeError as e:
            # Report the unparsable line and move on to the next one.
            print(f"Line {line_num} - Error: {e}")
            print(f"Content preview: {line[:100]}...")
        else:
            data.append(item)

# Convert to a DataFrame
df = pd.DataFrame(data)

In [None]:
# Quick sanity check: row count and column names of the loaded DataFrame.
print(f"총 데이터 개수: {len(df)}")
print(f"\n컬럼 목록:\n{df.columns.tolist()}")

총 데이터 개수: 3202

컬럼 목록:
['question', 'context', 'clarifying_question', 'require_clarification', 'category', 'subclass', 'predict_ambiguous', 'predict_is_ambiguous_response', 'predict_clarifying_question']


In [None]:
from IPython.display import display
# Render the first rows as a rich table in the notebook.
display(df.head())

Unnamed: 0,question,context,clarifying_question,require_clarification,category,subclass,predict_ambiguous,predict_is_ambiguous_response,predict_clarifying_question
0,Give me a list of good coffee shops?,,What do you personally consider important in a...,1,MC,whom,0,"{""Output"": ""False"", ""Confidence"": 4}",Clarifying question: \n- Could you please spec...
1,Give me some Mother's Day gift ideas.,,"What are your mother's interests, hobbies, or ...",1,MC,whom,0,"{""Output"": ""False"", ""Confidence"": 4}",Clarifying question: \nCould you please specif...
2,Help me come up with 3 ideas for a new busines...,,"What are your areas of interest or expertise, ...",1,MC,what,0,"{""Output"": ""False"", ""Confidence"": 4}",Clarifying question: \nWhat specific industry ...
3,Write the first paragraph of a blog post descr...,,What tone or perspective should I use for the ...,1,MC,whom,1,"{""Output"": ""True"", ""Confidence"": 4}",Clarifying question: \n- Which aspect or topic...
4,Give me some tips on how to train for a marathon.,,Can you provide your current fitness level and...,1,MC,whom,0,"{""Output"": ""False"", ""Confidence"": 4}",Clarifying question: \n- Are you looking for t...


In [None]:
# Keep only the columns needed downstream (copy so later edits do not touch df).
df_cleaned = df[['question', 'clarifying_question', 'require_clarification', 'category', 'subclass']].copy()

print("✓ 컬럼 선택 완료!")
print(f"총 데이터 개수: {len(df_cleaned)}")
print(f"컬럼 목록: {df_cleaned.columns.tolist()}")

print("\n결측치 확인:")
print(df_cleaned.isnull().sum())

print("\n첫 5개 샘플:")
# display() renders the table itself and returns None, so wrapping it in
# print() emitted a stray "None" line after the table. Call it on its own.
display(df_cleaned.head())

✓ 컬럼 선택 완료!
총 데이터 개수: 3202
컬럼 목록: ['question', 'clarifying_question', 'require_clarification', 'category', 'subclass']

결측치 확인:
question                 0
clarifying_question      0
require_clarification    0
category                 0
subclass                 0
dtype: int64

첫 5개 샘플:


Unnamed: 0,question,clarifying_question,require_clarification,category,subclass
0,Give me a list of good coffee shops?,What do you personally consider important in a...,1,MC,whom
1,Give me some Mother's Day gift ideas.,"What are your mother's interests, hobbies, or ...",1,MC,whom
2,Help me come up with 3 ideas for a new busines...,"What are your areas of interest or expertise, ...",1,MC,what
3,Write the first paragraph of a blog post descr...,What tone or perspective should I use for the ...,1,MC,whom
4,Give me some tips on how to train for a marathon.,Can you provide your current fitness level and...,1,MC,whom


None


In [None]:
# Start again from the raw frame with only the columns needed downstream.
wanted_columns = ['question', 'clarifying_question', 'require_clarification', 'category', 'subclass']
df_cleaned = df[wanted_columns].copy()

# Dataset category codes -> codes used in the system prompt (FD->EM, MC->AO).
category_mapping = {
    'FD': 'EM',  # Epistemic Misalignment
    'MC': 'AO',  # Aleatoric Output
    'LA': 'LA'   # Linguistic Ambiguity (unchanged)
}

# Dataset subclass codes -> codes used in the system prompt.
subclass_mapping = {
    'whom': 'WHOM',
    'what': 'WHAT',
    'when': 'WHEN',
    'where': 'WHERE',
    'NK': 'UNF',
    'ICL': 'CONT',
    'co-reference': 'SEM',
    'polysemy': 'LEX'
}

# Rename both label columns.
df_cleaned['category'] = df_cleaned['category'].replace(category_mapping)
df_cleaned['subclass'] = df_cleaned['subclass'].replace(subclass_mapping)

# Questions that need no clarification (require_clarification == 0) get the
# NONE label in both columns.
is_clear = df_cleaned['require_clarification'] == 0
df_cleaned.loc[is_clear, ['category', 'subclass']] = 'NONE'

print("✓ 매핑 완료!")
print(f"총 데이터 개수: {len(df_cleaned)}")

print("\n변경 후 카테고리 분포:")
print(df_cleaned['category'].value_counts())

print("\n변경 후 서브클래스 분포:")
print(df_cleaned['subclass'].value_counts())

print("\n첫 5개 샘플:")
display(df_cleaned.head())  # call display on its own (not inside print)

print("\nrequire_clarification=0인 샘플 확인:")
display(df_cleaned.loc[is_clear].head())

✓ 매핑 완료!
총 데이터 개수: 3202

변경 후 카테고리 분포:
category
NONE    1601
AO       801
EM       400
LA       400
Name: count, dtype: int64

변경 후 서브클래스 분포:
subclass
NONE     1601
WHAT      201
WHOM      200
WHERE     200
WHEN      200
UNF       200
CONT      200
SEM       200
LEX       200
Name: count, dtype: int64

첫 5개 샘플:


Unnamed: 0,question,clarifying_question,require_clarification,category,subclass
0,Give me a list of good coffee shops?,What do you personally consider important in a...,1,AO,WHOM
1,Give me some Mother's Day gift ideas.,"What are your mother's interests, hobbies, or ...",1,AO,WHOM
2,Help me come up with 3 ideas for a new busines...,"What are your areas of interest or expertise, ...",1,AO,WHAT
3,Write the first paragraph of a blog post descr...,What tone or perspective should I use for the ...,1,AO,WHOM
4,Give me some tips on how to train for a marathon.,Can you provide your current fitness level and...,1,AO,WHOM



require_clarification=0인 샘플 확인:


Unnamed: 0,question,clarifying_question,require_clarification,category,subclass
178,Is Mozambique a geographic distribution of Man...,,0,NONE,NONE
186,What's the latitude range where Diomedeidae is...,,0,NONE,NONE
193,Does Ophisaurus live in subtropical habitat?,,0,NONE,NONE
196,Does Pseudophanella have hemimetabolous as its...,,0,NONE,NONE
208,What kind of visual system does Baratha have?,,0,NONE,NONE


In [None]:
# Build user_query as "[category|subclass] question".
df_cleaned['user_query'] = df_cleaned.apply(
    lambda row: f"[{row['category']}|{row['subclass']}] {row['question']}",
    axis=1
)

# The reference answer is the dataset's clarifying question.
df_cleaned['ground_truth'] = df_cleaned['clarifying_question']

# Keep only the two columns needed downstream.
df_final = df_cleaned[['user_query', 'ground_truth']].copy()

# Mask of NONE cases. The tag is always the prefix of user_query (see the
# f-string above), so test the prefix instead of searching the whole string;
# a substring search would also match a question that merely quotes the tag.
none_mask = df_final['user_query'].str.startswith('[NONE|NONE]')

# Clear questions get the sentinel answer instead of a clarifying question.
df_final.loc[none_mask, 'ground_truth'] = '<NO_CLARIFYING_QUESTION>'

print("✓ DataFrame 생성 완료!")
print(f"총 데이터 개수: {len(df_final)}")
print(f"NONE 데이터: {none_mask.sum()}개 ({none_mask.sum()/len(df_final)*100:.1f}%)")
print(f"모호한 질문: {(~none_mask).sum()}개 ({(~none_mask).sum()/len(df_final)*100:.1f}%)")

✓ DataFrame 생성 완료!
총 데이터 개수: 3202
NONE 데이터: 1601개 (50.0%)
모호한 질문: 1601개 (50.0%)


In [None]:
# Eyeball 10 randomly chosen rows (positions are drawn without replacement).
import random

row_count = len(df_final)
sample_indices = random.sample(range(row_count), 10)

for i, idx in enumerate(sample_indices, start=1):
    row = df_final.iloc[idx]
    header = f"\n[샘플 {i}] (인덱스: {idx})"
    print(header)
    print("-" * 80)
    print(f"User Query: {row['user_query']}")
    print(f"Ground Truth: {row['ground_truth']}")


[샘플 1] (인덱스: 1035)
--------------------------------------------------------------------------------
User Query: [AO|WHAT] Who broke england's ties to the catholic church?
Ground Truth: Which one: initiated, or completed the break of England's ties to the Catholic Church?

[샘플 2] (인덱스: 2867)
--------------------------------------------------------------------------------
User Query: [LA|LEX] What is the genre of Pollyanna?
Ground Truth: Are you referring to the book or the movie?

[샘플 3] (인덱스: 2652)
--------------------------------------------------------------------------------
User Query: [NONE|NONE] The sister-in-law told Amanda that she tricked the client.
Who tricked the client?
Ground Truth: <NO_CLARIFYING_QUESTION>

[샘플 4] (인덱스: 72)
--------------------------------------------------------------------------------
User Query: [AO|WHAT] Give me a list of 10 nematode species
Ground Truth: Are you looking for nematode species from a specific region, habitat, or for a particular resea

In [None]:
import numpy as np

# Split into ambiguous questions and clear (NONE) questions.
# NOTE(review): none_mask was built against the df_final that exists before
# this cell runs, and df_final is overwritten below, so re-running this cell
# without rebuilding none_mask looks unsafe; confirm before re-executing.
ambiguous_df = df_final.loc[~none_mask].copy()
none_df = df_final.loc[none_mask].copy()

print(f"\n분리 전:")
print(f"- 모호한 질문: {len(ambiguous_df)}개")
print(f"- 명확한 질문 (NONE): {len(none_df)}개")

# Keep NONE rows numbering 5% of the ambiguous count (capped at what exists).
target_none_count = int(len(ambiguous_df) * 0.05)
keep_count = min(target_none_count, len(none_df))
none_sampled = none_df.sample(n=keep_count, random_state=42)

# Merge both parts, then shuffle with a fixed seed and renumber the rows.
df_final = (
    pd.concat([ambiguous_df, none_sampled], ignore_index=True)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

print(f"\n샘플링 후:")
print(f"- 모호한 질문: {len(ambiguous_df)}개")
print(f"- 명확한 질문 (NONE): {len(none_sampled)}개 ({len(none_sampled)/len(df_final)*100:.1f}%)")
print(f"- 총 데이터: {len(df_final)}개")

print("\n컬럼 목록:", df_final.columns.tolist())
print("\n결측치 확인:")
print(df_final.isnull().sum())


분리 전:
- 모호한 질문: 1601개
- 명확한 질문 (NONE): 1601개

샘플링 후:
- 모호한 질문: 1601개
- 명확한 질문 (NONE): 80개 (4.8%)
- 총 데이터: 1681개

컬럼 목록: ['user_query', 'ground_truth']

결측치 확인:
user_query      0
ground_truth    0
dtype: int64


In [None]:
# Check the final class balance after down-sampling.
print("="*80)
print("최종 데이터 분포:")
print("="*80)

# The [NONE|NONE] tag is always the prefix of user_query, so test the prefix
# rather than searching anywhere in the string (a substring search would also
# match a question that merely quotes the tag).
none_final_mask = df_final['user_query'].str.startswith('[NONE|NONE]')
print(f"NONE: {none_final_mask.sum()}개 ({none_final_mask.sum()/len(df_final)*100:.1f}%)")
print(f"모호한 질문: {(~none_final_mask).sum()}개 ({(~none_final_mask).sum()/len(df_final)*100:.1f}%)")

print("\n" + "="*80)
print("NONE 샘플 3개:")
print("="*80)
display(df_final[none_final_mask].head(3))

print("\n" + "="*80)
print("모호한 질문 샘플 3개:")
print("="*80)
display(df_final[~none_final_mask].head(3))

최종 데이터 분포:
NONE: 80개 (4.8%)
모호한 질문: 1601개 (95.2%)

NONE 샘플 3개:


Unnamed: 0,user_query,ground_truth
0,[NONE|NONE] Is Pensacola sylvestris a multicel...,<NO_CLARIFYING_QUESTION>
6,[NONE|NONE] Tell me which of the following are...,<NO_CLARIFYING_QUESTION>
8,[NONE|NONE] The sister-in-law told Amanda that...,<NO_CLARIFYING_QUESTION>



모호한 질문 샘플 3개:


Unnamed: 0,user_query,ground_truth
1,[AO|WHERE] When did call of duty ww2 come out?,"Which one: release, release in the North Ameri..."
2,[EM|UNF] Does Helicotylenchus have Phaseolus g...,What is Helicotylenchus referring to?
3,[AO|WHEN] Give me a bulleted list of the top f...,As of which date or period would you like the ...


In [None]:
import json
import os

# Output path
save_path = '/content/drive/MyDrive/Colab Notebooks/woke-odds/dpo_base_data.jsonl'

# Write one JSON object per line; ensure_ascii=False keeps non-ASCII text
# readable in the file instead of escaping it.
with open(save_path, 'w', encoding='utf-8') as f:
    for query_text, reference in zip(df_final['user_query'], df_final['ground_truth']):
        record = {'user_query': query_text, 'ground_truth': reference}
        f.write(json.dumps(record, ensure_ascii=False) + '\n')

print(f"✓ JSONL 저장 완료!")
print(f"저장 경로: {save_path}")
print(f"저장된 데이터 개수: {len(df_final)}개")

# Confirm the file landed on disk by reporting its size.
print("\n저장된 파일 크기 확인:")
file_size = os.path.getsize(save_path)
size_kb = file_size / 1024
print(f"파일 크기: {size_kb:.2f} KB ({file_size / 1024 / 1024:.2f} MB)")

✓ JSONL 저장 완료!
저장 경로: /content/drive/MyDrive/Colab Notebooks/woke-odds/dpo_base_data.jsonl
저장된 데이터 개수: 1681개

저장된 파일 크기 확인:
파일 크기: 383.58 KB (0.37 MB)


In [None]:
# Read the saved file back to verify it round-trips.
print("=" * 80)
print("저장된 파일 확인:")
print("=" * 80)

with open(save_path, 'r', encoding='utf-8') as f:
    test_data = [json.loads(raw) for raw in f]

print(f"불러온 데이터 개수: {len(test_data)}")
print("\n첫 3개 샘플:")
for position, record in enumerate(test_data[:3], start=1):
    print(f"\n[샘플 {position}]")
    print(f"User Query: {record['user_query']}")
    print(f"Ground Truth: {record['ground_truth']}")

print("\n✓ 파일이 정상적으로 저장되었습니다!")

저장된 파일 확인:
불러온 데이터 개수: 1681

첫 3개 샘플:

[샘플 1]
User Query: [NONE|NONE] Is Pensacola sylvestris a multicellular organism in terms of cellularity?
Ground Truth: <NO_CLARIFYING_QUESTION>

[샘플 2]
User Query: [AO|WHERE] When did call of duty ww2 come out?
Ground Truth: Which one: release, release in the North America, release in Australia, or release in the EU?

[샘플 3]
User Query: [EM|UNF] Does Helicotylenchus have Phaseolus galactoides as its host?
Ground Truth: What is Helicotylenchus referring to?

✓ 파일이 정상적으로 저장되었습니다!


### 후보 생성 테스트

In [None]:
from tqdm import tqdm

def generate_candidates_for_query(user_query, model, tokenizer, num_candidates=5, max_retries=20):
    """
    Generate up to `num_candidates` distinct candidate responses for one query.

    The first pass samples once per temperature in a fixed schedule
    (0.7, 0.9, 1.1, 1.3, 1.5). If exact-duplicate texts leave fewer than
    `num_candidates` unique results, extra samples are drawn until enough
    unique texts exist or `max_retries` extra generations have been spent.

    Args:
        user_query: Tagged query text sent as the user turn.
        model: Model used for generation (needs `.device` and `.generate`).
        tokenizer: Tokenizer providing the chat template, encoding and decoding.
        num_candidates: Number of unique candidates wanted (default 5).
        max_retries: Upper bound on extra generations in the refill phase.
            Without a bound, a model that keeps repeating itself would make
            this function loop forever.

    Returns:
        list: Unique candidate texts in generation order. Holds fewer than
        `num_candidates` entries when the retry budget runs out, and is empty
        when `num_candidates` is not positive.
    """
    if num_candidates <= 0:
        return []

    # Build the chat: shared system prompt plus the query as the user turn.
    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": user_query}
    ]

    # Render the chat template to text, then tokenize it.
    # NOTE(review): tokenizing an already-templated string uses the tokenizer's
    # default special-token handling; confirm this matches training.
    prompt = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    prompt_length = inputs['input_ids'].shape[1]

    def _sample_once(temperature):
        # One sampled generation; returns only the newly generated text.
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=150,
                temperature=temperature,
                top_p=0.95,
                do_sample=True,
                pad_token_id=tokenizer.pad_token_id,
                eos_token_id=tokenizer.eos_token_id
            )
        return tokenizer.decode(
            outputs[0][prompt_length:],
            skip_special_tokens=True
        ).strip()

    unique_candidates = []
    seen = set()

    def _keep_if_new(text):
        # Record the text unless an identical one was already kept.
        if text not in seen:
            seen.add(text)
            unique_candidates.append(text)

    # First pass: one sample per scheduled temperature.
    temperatures = [0.7, 0.9, 1.1, 1.3, 1.5]
    for temp in temperatures[:num_candidates]:
        _keep_if_new(_sample_once(temp))

    # Refill phase: draw extra samples until enough unique texts exist. The
    # temperature starts at 1.5 and rises by 0.1 per unique candidate already
    # kept (it does not rise on a retry that produced a duplicate).
    retries = 0
    while len(unique_candidates) < num_candidates and retries < max_retries:
        retries += 1
        _keep_if_new(_sample_once(1.5 + len(unique_candidates) * 0.1))

    return unique_candidates[:num_candidates]

print("후보 생성 함수 정의 완료!")

후보 생성 함수 정의 완료!


In [None]:
# Dry run on the first 3 rows before processing the full dataset.
print("3개 샘플로 테스트 시작...\n")

test_df = df_final.head(3).copy()

# Pre-create the five candidate columns, initially empty.
for slot in range(1, 6):
    test_df[f'candidate_{slot}'] = None

# Generate candidates row by row; a failure on one row is reported and the
# loop moves on to the next row.
query_items = test_df['user_query'].items()
for idx, user_query in tqdm(query_items, total=len(test_df), desc="후보 생성 중"):
    try:
        candidates = generate_candidates_for_query(user_query, model, tokenizer)

        # Store each candidate in its numbered column.
        for slot, cand in enumerate(candidates, start=1):
            test_df.at[idx, f'candidate_{slot}'] = cand
    except Exception as e:
        print(f"\n에러 발생 (idx={idx}): {e}")

print("\n테스트 완료!")
print("\n결과 확인:")
preview_columns = ['user_query', 'ground_truth', 'candidate_1', 'candidate_2']
print(test_df[preview_columns].head(3))

3개 샘플로 테스트 시작...



후보 생성 중: 100%|██████████| 3/3 [00:57<00:00, 19.03s/it]


테스트 완료!

결과 확인:
                                          user_query  \
0  [NONE|NONE] Is Pensacola sylvestris a multicel...   
1     [AO|WHERE] When did call of duty ww2 come out?   
2  [EM|UNF] Does Helicotylenchus have Phaseolus g...   

                                        ground_truth  \
0                           <NO_CLARIFYING_QUESTION>   
1  Which one: release, release in the North Ameri...   
2              What is Helicotylenchus referring to?   

                                         candidate_1  \
0                           <NO_CLARIFYING_QUESTION>   
1                           Which one: Where? Where?   
2  What are the possible host plants for Helicoty...   

                                         candidate_2  
0                                          <MEM|LEM>  
1  Are you asking if it has a specific location i...  
2  Are you referring to the nematode Helicotylenc...  





In [None]:
# Show one sample at a time, vertically (easier to read than a wide table).
def show_sample(df, idx):
    """
    Pretty-print a single DataFrame row, one column per paragraph.

    Args:
        df: DataFrame to read from.
        idx: Positional row number (used with `iloc`); also shown in the header.

    Returns:
        None. The row is written to stdout.
    """
    record = df.iloc[idx]
    rule = "=" * 80

    print(rule)
    print(f"[샘플 {idx}]")
    print(rule)

    # Column name on one line, its value indented on the next.
    for column in df.columns:
        print(f"\n{column}:")
        print(f"  {record[column]}")

    print("\n" + rule + "\n")

# Usage: dump each of the three smoke-test rows
for i in range(3):
    show_sample(test_df, i)

[샘플 0]

user_query:
  [NONE|NONE] Is Pensacola sylvestris a multicellular organism in terms of cellularity?

ground_truth:
  <NO_CLARIFYING_QUESTION>

candidate_1:
  <NO_CLARIFYING_QUESTION>

candidate_2:
  <MEM|LEM>

candidate_3:
  I do the evidence require with? What version of output? 1NO Claraifying: Need output? What is output? <NO_CLARIFYING_QUESTION> If need output? What kind of EM from user need category of output?

From EM with Clarifying category: if UNF | CONTACT or UNF | CONTRADICTION, I output one Clarifying by category example: "How familiar is this type: name|email or name?"

What categories: Is the user clarif questions contain any unfamiliar type output?

User query: [NONE] Need clarifications category in "Name|email" is required: EM UNF: user mention any unfamiliar type. So, what categories: EM requires either output: "Which

candidate_4:
  As a paragraph summarizing evidence that the current task is classified as Clear Questions: NONE,
In the given query from user: "

### 3개 샘플 테스트

In [None]:
# Smoke test: take only the first 3 samples
num_candidates = 5
df_test = df_final.head(3).copy()

print("="*80)
print("테스트용 3개 샘플:")
print("="*80)
display(df_test)

# Pre-create the candidate columns so every row has all slots, even when
# generation fails or returns fewer candidates than requested.
for i in range(num_candidates):
    df_test[f'candidate_{i+1}'] = None

print("\n후보 생성 시작...\n")

total = len(df_test)

# `idx` is the position (used for the progress counter and `.iloc`), `label`
# is the index label. `.at` is label-based, so writing with the position would
# append new rows whenever df_final does not carry a default 0..n-1 index.
for idx, label in enumerate(df_test.index):
    user_query = df_test.iloc[idx]['user_query']

    print(f"[{idx+1}/{total}] 생성 중: {user_query[:60]}...")

    try:
        # The query is passed directly; no separate chat-message list is
        # needed here because nothing in this cell consumed one.
        candidates = generate_candidates_for_query(
            user_query, model, tokenizer, num_candidates=num_candidates
        )

        # Store each candidate in its own column of the current row
        for i, cand in enumerate(candidates):
            df_test.at[label, f'candidate_{i+1}'] = cand

        print("  ✓ 완료!\n")

    except Exception as e:
        # Best effort: report the failure and keep going with the next sample
        print(f"  ✗ 에러: {e}\n")

print("="*80)
print("후보 생성 완료!")
print("="*80)

테스트용 3개 샘플:


Unnamed: 0,user_query,ground_truth
0,[NONE|NONE] Is Pensacola sylvestris a multicel...,<NO_CLARIFYING_QUESTION>
1,[AO|WHERE] When did call of duty ww2 come out?,"Which one: release, release in the North Ameri..."
2,[EM|UNF] Does Helicotylenchus have Phaseolus g...,What is Helicotylenchus referring to?



후보 생성 시작...

[1/3] 생성 중: [NONE|NONE] Is Pensacola sylvestris a multicellular organism...
  ✓ 완료!

[2/3] 생성 중: [AO|WHERE] When did call of duty ww2 come out?...
  ✓ 완료!

[3/3] 생성 중: [EM|UNF] Does Helicotylenchus have Phaseolus galactoides as ...
  ✓ 완료!

후보 생성 완료!


In [None]:
# Pandas display option: cap the printed width of each column at 80 characters
pd.set_option('display.max_colwidth', 80)

# Structure summary: column names and (rows, columns)
print(
    "\n생성된 DataFrame 구조:",
    f"컬럼: {df_test.columns.tolist()}",
    f"Shape: {df_test.shape}",
    sep="\n",
)

# Full table, rendered by the notebook
print("\n" + "="*80, "전체 DataFrame:", "="*80, sep="\n")
display(df_test)


생성된 DataFrame 구조:
컬럼: ['user_query', 'ground_truth', 'candidate_1', 'candidate_2', 'candidate_3', 'candidate_4', 'candidate_5']
Shape: (3, 7)

전체 DataFrame:


Unnamed: 0,user_query,ground_truth,candidate_1,candidate_2,candidate_3,candidate_4,candidate_5
0,[NONE|NONE] Is Pensacola sylvestris a multicellular organism in terms of cel...,<NO_CLARIFYING_QUESTION>,<NO_CLARIFYING_QUESTION>,<CLARIFYING_QUESTION>\nWhose relationship is that of Pensacola sylvestris be...,<NO_CLARIFYING_QUESTION> \n\nOR \nWhat question do you want to ask about NIG...,"I'm not familiar with ""Folkone-Wingol, The Wolf or Dogs"" since their informa...",Output: Which city of Georgia features Pensacola Sylvestris along the Chauga...
1,[AO|WHERE] When did call of duty ww2 come out?,"Which one: release, release in the North America, release in Australia, or r...",Can you provide the year for call of duty ww2 or the specific year range (e....,Which version: The First or the First?,Which war?,Which call of duty ww2 game: the older or the newer version?,Which call of duty part are we questioning exactly?
2,[EM|UNF] Does Helicotylenchus have Phaseolus galactoides as its host?,What is Helicotylenchus referring to?,"Is the entity ""Phaseolus galactoides"" familiar to you?",Are you referring to the plant-pathogen interaction in Phaseolus vulgaris an...,What word is the other option for?,"Is what you're refering to is ""Does something"" related to ""has an entity"" or...",Can you provide information about any specific species? Use specific taxonom...


In [None]:
# Pretty-print every test sample together with its five candidates
for idx in range(len(df_test)):
    row = df_test.iloc[idx]

    # Header with a 1-based sample number
    print("\n" + "="*80, f"[샘플 {idx+1}]", "="*80, sep="\n")

    print(f"User Query:\n  {row['user_query']}\n")
    print(f"Ground Truth:\n  {row['ground_truth']}\n")
    print(f"생성된 후보 5개:", "-"*80, sep="\n")

    # One paragraph per candidate column
    for i in range(5):
        cand = row[f'candidate_{i+1}']
        print(f"\n[후보 {i+1}]", f"  {cand}", sep="\n")

    print("\n")


[샘플 1]
User Query:
  [NONE|NONE] Is Pensacola sylvestris a multicellular organism in terms of cellularity?

Ground Truth:
  <NO_CLARIFYING_QUESTION>

생성된 후보 5개:
--------------------------------------------------------------------------------

[후보 1]
  <NO_CLARIFYING_QUESTION>

[후보 2]
  <CLARIFYING_QUESTION>
Whose relationship is that of Pensacola sylvestris being a cell and what taxonomic categories or species or subspecies are in conflict?

[후보 3]
  <NO_CLARIFYING_QUESTION> 

OR 
What question do you want to ask about NIGER. What type would you like to ask WHO and WHO?


In my approach i said question either confusions?

[후보 4]
  I'm not familiar with "Folkone-Wingol, The Wolf or Dogs" since their information need to be familiar with a category with missing information. In the given example, no ambiguity arises and clarification question was not asked. This is not a scenario requiring a clarifying question. We'll refer to the sample and tell user which to do in the next message. Once

In [None]:
# Delete the test DataFrame now that the 3-sample check is finished
del df_test
print("✓ 테스트 DataFrame 삭제 완료\n")

✓ 테스트 DataFrame 삭제 완료



## 전체 데이터 작업

In [None]:
# Full run: generate candidates for every row of df_final
df_with_candidates = df_final.copy()

print("="*80, f"전체 데이터 후보 생성 시작", "="*80, sep="\n")
print(f"총 데이터 개수: {len(df_with_candidates)}개")
print(f"예상 소요 시간: 약 2-3시간\n")

# Pre-create the five candidate columns, initially empty
for i in range(5):
    df_with_candidates[f'candidate_{i+1}'] = None

# Process in batches of 100 rows and write a checkpoint after each batch
batch_size = 100
total_rows = len(df_with_candidates)
total_batches = -(-total_rows // batch_size)  # ceiling division

for batch_idx, start_idx in enumerate(range(0, total_rows, batch_size)):
    end_idx = min(start_idx + batch_size, total_rows)

    print(f"\n{'='*80}")
    print(f"[Batch {batch_idx+1}/{total_batches}] Processing {start_idx}-{end_idx}")
    print('='*80)

    for idx in tqdm(range(start_idx, end_idx), desc=f"Batch {batch_idx+1}"):
        row = df_with_candidates.iloc[idx]
        user_query = row['user_query']

        try:
            candidates = generate_candidates_for_query(user_query, model, tokenizer, num_candidates=5)

            # `idx` is positional; `.at` needs the matching index label
            for i, cand in enumerate(candidates):
                df_with_candidates.at[df_with_candidates.index[idx], f'candidate_{i+1}'] = cand

        except Exception as e:
            # Best effort: report the failing row; its candidate cells stay empty
            print(f"\n에러 발생 (idx={idx}): {e}")

    # Checkpoint: every row processed so far goes into a per-batch CSV
    temp_save_path = f'/content/drive/MyDrive/Colab Notebooks/woke-odds/candidates_batch_{batch_idx+1}.csv'
    df_with_candidates.iloc[:end_idx].to_csv(temp_save_path, index=False, encoding='utf-8-sig')
    print(f"\n✓ 중간 저장 완료: {temp_save_path}")

print("\n" + "="*80, "✓ 전체 후보 생성 완료!", "="*80, sep="\n")

전체 데이터 후보 생성 시작
총 데이터 개수: 1681개
예상 소요 시간: 약 2-3시간


[Batch 1/17] Processing 0-100


Batch 1: 100%|██████████| 100/100 [21:23<00:00, 12.83s/it]



✓ 중간 저장 완료: /content/drive/MyDrive/Colab Notebooks/woke-odds/candidates_batch_1.csv

[Batch 2/17] Processing 100-200


Batch 2: 100%|██████████| 100/100 [19:36<00:00, 11.76s/it]



✓ 중간 저장 완료: /content/drive/MyDrive/Colab Notebooks/woke-odds/candidates_batch_2.csv

[Batch 3/17] Processing 200-300


Batch 3: 100%|██████████| 100/100 [20:22<00:00, 12.23s/it]



✓ 중간 저장 완료: /content/drive/MyDrive/Colab Notebooks/woke-odds/candidates_batch_3.csv

[Batch 4/17] Processing 300-400


Batch 4: 100%|██████████| 100/100 [22:03<00:00, 13.23s/it]



✓ 중간 저장 완료: /content/drive/MyDrive/Colab Notebooks/woke-odds/candidates_batch_4.csv

[Batch 5/17] Processing 400-500


Batch 5: 100%|██████████| 100/100 [21:24<00:00, 12.85s/it]



✓ 중간 저장 완료: /content/drive/MyDrive/Colab Notebooks/woke-odds/candidates_batch_5.csv

[Batch 6/17] Processing 500-600


Batch 6: 100%|██████████| 100/100 [21:07<00:00, 12.68s/it]



✓ 중간 저장 완료: /content/drive/MyDrive/Colab Notebooks/woke-odds/candidates_batch_6.csv

[Batch 7/17] Processing 600-700


Batch 7: 100%|██████████| 100/100 [18:56<00:00, 11.36s/it]



✓ 중간 저장 완료: /content/drive/MyDrive/Colab Notebooks/woke-odds/candidates_batch_7.csv

[Batch 8/17] Processing 700-800


Batch 8: 100%|██████████| 100/100 [19:31<00:00, 11.72s/it]



✓ 중간 저장 완료: /content/drive/MyDrive/Colab Notebooks/woke-odds/candidates_batch_8.csv

[Batch 9/17] Processing 800-900


Batch 9: 100%|██████████| 100/100 [20:37<00:00, 12.37s/it]



✓ 중간 저장 완료: /content/drive/MyDrive/Colab Notebooks/woke-odds/candidates_batch_9.csv

[Batch 10/17] Processing 900-1000


Batch 10: 100%|██████████| 100/100 [19:47<00:00, 11.88s/it]



✓ 중간 저장 완료: /content/drive/MyDrive/Colab Notebooks/woke-odds/candidates_batch_10.csv

[Batch 11/17] Processing 1000-1100


Batch 11: 100%|██████████| 100/100 [20:12<00:00, 12.13s/it]



✓ 중간 저장 완료: /content/drive/MyDrive/Colab Notebooks/woke-odds/candidates_batch_11.csv

[Batch 12/17] Processing 1100-1200


Batch 12: 100%|██████████| 100/100 [21:53<00:00, 13.13s/it]



✓ 중간 저장 완료: /content/drive/MyDrive/Colab Notebooks/woke-odds/candidates_batch_12.csv

[Batch 13/17] Processing 1200-1300


Batch 13: 100%|██████████| 100/100 [20:12<00:00, 12.12s/it]



✓ 중간 저장 완료: /content/drive/MyDrive/Colab Notebooks/woke-odds/candidates_batch_13.csv

[Batch 14/17] Processing 1300-1400


Batch 14: 100%|██████████| 100/100 [21:53<00:00, 13.14s/it]



✓ 중간 저장 완료: /content/drive/MyDrive/Colab Notebooks/woke-odds/candidates_batch_14.csv

[Batch 15/17] Processing 1400-1500


Batch 15: 100%|██████████| 100/100 [20:34<00:00, 12.35s/it]



✓ 중간 저장 완료: /content/drive/MyDrive/Colab Notebooks/woke-odds/candidates_batch_15.csv

[Batch 16/17] Processing 1500-1600


Batch 16: 100%|██████████| 100/100 [18:49<00:00, 11.29s/it]



✓ 중간 저장 완료: /content/drive/MyDrive/Colab Notebooks/woke-odds/candidates_batch_16.csv

[Batch 17/17] Processing 1600-1681


Batch 17: 100%|██████████| 81/81 [16:12<00:00, 12.00s/it]


✓ 중간 저장 완료: /content/drive/MyDrive/Colab Notebooks/woke-odds/candidates_batch_17.csv

✓ 전체 후보 생성 완료!





In [None]:
# Write the final CSV with all rows and candidate columns
final_save_path = '/content/drive/MyDrive/Colab Notebooks/woke-odds/dpo_with_candidates_full.csv'
df_with_candidates.to_csv(final_save_path, index=False, encoding='utf-8-sig')

print(
    f"✓ 최종 CSV 저장 완료!",
    f"저장 경로: {final_save_path}",
    f"저장된 데이터 개수: {len(df_with_candidates)}개",
    sep="\n",
)

# Statistics: how many rows have a non-null value in each candidate column
print("\n" + "="*80, "생성 통계:", "="*80, sep="\n")
for i in range(5):
    col = f'candidate_{i+1}'
    valid_count = df_with_candidates[col].notna().sum()
    print(f"{col}: {valid_count}/{len(df_with_candidates)} ({valid_count/len(df_with_candidates)*100:.1f}%)")

✓ 최종 CSV 저장 완료!
저장 경로: /content/drive/MyDrive/Colab Notebooks/woke-odds/dpo_with_candidates_full.csv
저장된 데이터 개수: 1681개

생성 통계:
candidate_1: 1681/1681 (100.0%)
candidate_2: 1681/1681 (100.0%)
candidate_3: 1681/1681 (100.0%)
candidate_4: 1681/1681 (100.0%)
candidate_5: 1681/1681 (100.0%)


In [None]:
# Spot-check 3 randomly chosen rows
import random

sample_indices = random.sample(range(len(df_with_candidates)), 3)

for idx in sample_indices:
    row = df_with_candidates.iloc[idx]

    # Header shows the row's position in the DataFrame
    print("="*80, f"[샘플 {idx}]", "="*80, sep="\n")

    print(f"User Query: {row['user_query']}")
    print(f"\nGround Truth: {row['ground_truth']}")
    print(f"\n생성된 후보들:")

    # Numbered list of the five candidate columns
    for i in range(5):
        print(f"  {i+1}. {row[f'candidate_{i+1}']}")
    print()

print(f"\n✓ 전체 작업 완료! 총 {len(df_with_candidates)}개 데이터 처리됨")

[샘플 1174]
User Query: [AO|WHERE] Where was once i was a beehive filmed?

Ground Truth: Which one: country, or states?

생성된 후보들:
  1. Which location: The location where I was a beehive?
  2. Is the location a place or a specific region?
  3. Which type of camera was used for which film: a mobile phone or a traditional camera?
  4. As the "Where" seems ambiguous due to missing details, if it refers to a location for filming, we need to know the context. Please provide more clarity.
  5. When or after a type changedNone. The required query is clear and doesn't require a clarification question:

[샘플 608]
User Query: [AO|WHERE] What channel did fresh prince of bel air air on?

Ground Truth:  On which country: America, or the United Kingdom?

생성된 후보들:
  1. Can you provide either the channel name or the air date of the show?
  2. Which city?
  3. Which city's name does the person in the text know: "Where" or "London"?
  4. Which channel aired in general of be Air TV: ALJ Air TV or Bel Air?
  