In [None]:
!pip install anthropic
!pip install tqdm

In [None]:
from google.colab import drive
drive.mount('/content/drive')

import pandas as pd
import json
import anthropic
from tqdm import tqdm
import time

print("라이브러리 임포트 완료!")

In [None]:
# Read the candidate CSV from the mounted Google Drive into `df`.
csv_path = '/content/drive/MyDrive/Colab Notebooks/woke-odds/dpo_with_candidates_full.csv'
df = pd.read_csv(csv_path)

# Report what was loaded: row count and column names.
print("데이터 로드 완료!")
print(f"총 데이터 개수: {df.shape[0]}개")
print(f"컬럼: {list(df.columns)}")

# Render the first row as a quick sanity check.
print("\n첫 번째 샘플:")
display(df.iloc[:1])

In [None]:
# Ask for the Claude API key interactively so it is never stored in the notebook.
import getpass

api_key = getpass.getpass(
    'Claude API Key를 입력하세요: '
)

# Client object used by the evaluation cells for every API call.
client = anthropic.Anthropic(
    api_key=api_key,
)

print("Claude API 클라이언트 설정 완료!")

In [None]:
def _extract_json_object(text):
    """Return the first JSON object embedded in *text*.

    The model is asked for JSON only, but a reply may still arrive wrapped in
    a Markdown code fence or with a sentence before/after it. Scanning for
    each '{' and letting the decoder decide where the object ends copes with
    all of those without any fence-specific string surgery.

    Raises:
        ValueError: if no position in *text* starts a valid JSON object.
    """
    decoder = json.JSONDecoder()
    start = text.find('{')
    while start != -1:
        try:
            # raw_decode ignores whatever follows the object (closing fence, prose).
            return decoder.raw_decode(text, start)[0]
        except json.JSONDecodeError:
            start = text.find('{', start + 1)
    raise ValueError("no JSON object found in model response")


def _is_valid_candidate_index(value, num_candidates):
    """Return True if *value* is an int in the 1-based range 1..num_candidates."""
    # bool is a subclass of int; a JSON `true` must not be accepted as index 1.
    if isinstance(value, bool) or not isinstance(value, int):
        return False
    return 1 <= value <= num_candidates


def evaluate_candidates_with_rlaif(user_query, ground_truth, candidates):
    """Ask Claude to score the candidates and select a best and a rejected one.

    NOTE: a later cell in this notebook defines a function with the same name
    (a rejected-only variant); whichever cell was executed last is the one the
    evaluation loops call.

    Args:
        user_query: User question; the text up to the first ']' is used as the
            ambiguity category tag shown to the rater.
        ground_truth: Reference clarifying question.
        candidates: Generated candidate questions (the notebook passes five;
            any non-empty sequence is accepted).

    Returns:
        dict parsed from Claude's JSON reply, or None on any failure (empty
        candidate list, API error, unparsable reply, invalid indices). On
        success 'best_index' and 'rejected_index' are distinct 1-based
        positions into *candidates*. The prompt also requests 'analysis',
        'scores', 'rejected_score' and 'reasoning', but those are not
        validated here.
    """
    candidates = list(candidates)
    num_candidates = len(candidates)
    if num_candidates == 0:
        print("에러 발생: no candidates to evaluate")
        return None

    # Category tag, e.g. "[AO|WHOM]", taken from the front of the query.
    category = user_query.split(']')[0] + ']'

    # Prompt fragments that depend on how many candidates were supplied.
    numbered_candidates = "\n".join(
        f"{position}. {candidate}"
        for position, candidate in enumerate(candidates, 1)
    )
    score_slots = ", ".join(f"score{position}" for position in range(1, num_candidates + 1))
    index_range = f"1-{num_candidates}"

    prompt = f"""You are an expert rater of clarifying questions for ambiguous queries.

AMBIGUITY CATEGORIES GUIDE:

- EM (Epistemic Misalignment): Questions with unfamiliar entities or self-contradictions
  * UNF (UNFAMILIAR): Query contains unfamiliar entities or facts
  * CONT (CONTRADICTION): Query contains self-contradictions

- LA (Linguistic Ambiguity): Questions with lexical or semantic ambiguity
  * LEX (LEXICAL): Query contains terms with multiple meanings
  * SEM (SEMANTIC): Query lacks context leading to multiple interpretations

- AO (Aleatoric Output): Questions with missing contextual information causing confusion
  * WHOM: Missing information about WHO (person/agent)
  * WHEN: Missing temporal information
  * WHERE: Missing spatial/location information
  * WHAT: Missing task-specific or object information

- NONE: Clear questions that don't require clarification
  * Expected response: <NO_CLARIFYING_QUESTION>

---

EVALUATION TASK:

Query: {user_query}
Category: {category}

Reference Answer (Ground Truth):
{ground_truth}

Generated Candidates to Evaluate:
{numbered_candidates}

---

Your Task:

Step 1 - Analyze each candidate:
- Does it correctly address the specific ambiguity type indicated in the category?
- For example, if category is [AO|WHOM], does it ask about WHO is involved?
- Is it clear, specific, and easy to understand?
- How does it compare to the reference answer?

Step 2 - Assign scores (0-100):
- 85-100: Excellent (addresses ambiguity type correctly, comparable to reference)
- 70-84: Good (addresses ambiguity but slightly less effective)
- 55-69: Acceptable (addresses core issue but noticeably lacking in clarity or specificity)
- 40-54: Weak (partially addresses ambiguity or misses key aspects)
- 0-39: Poor (doesn't address the right ambiguity type or is ineffective)

Step 3 - Select candidates:
- BEST: Choose the highest scoring candidate
- REJECTED: Choose a candidate in the 60-75 point range
  * IMPORTANT: The rejected candidate MUST be different from the best candidate
  * If the best candidate is in 60-75 range, choose the SECOND best candidate in that range
  * If no other candidate is in 60-75 range, choose the candidate closest to 67 points (excluding the best)
  * Avoid candidates above 80 or below 50

Provide your response in JSON format ONLY:
{{
  "analysis": "Brief analysis of all candidates",
  "scores": [{score_slots}],
  "best_index": {index_range},
  "rejected_index": {index_range},
  "rejected_score": actual_rejected_score,
  "reasoning": "Why this rejected candidate has appropriate quality gap"
}}

Response:"""

    # Stays None until the API has answered, so the error path can tell
    # "no reply at all" apart from "reply that could not be used".
    response_text = None
    try:
        message = client.messages.create(
            model="claude-sonnet-4-20250514",
            max_tokens=2000,
            messages=[
                {"role": "user", "content": prompt}
            ]
        )
        response_text = message.content[0].text

        result = _extract_json_object(response_text)

        # Callers index `candidates[index - 1]`; 0 or a negative number would
        # silently select the wrong candidate, so reject anything outside 1..N.
        best_index = result.get('best_index')
        rejected_index = result.get('rejected_index')
        if not _is_valid_candidate_index(best_index, num_candidates):
            raise ValueError(f"invalid best_index: {best_index!r}")
        if not _is_valid_candidate_index(rejected_index, num_candidates):
            raise ValueError(f"invalid rejected_index: {rejected_index!r}")
        if best_index == rejected_index:
            raise ValueError("best_index and rejected_index must differ")

        return result

    except Exception as e:
        # Best-effort boundary: one bad sample must not abort a long run.
        print(f"에러 발생: {e}")
        print(f"응답 텍스트: {response_text[:200] if response_text is not None else 'N/A'}...")
        return None

print("✓ RLAIF 평가 함수 정의 완료!")

In [None]:
def _extract_json_object(text):
    """Return the first JSON object embedded in *text*.

    The model is asked for JSON only, but a reply may still arrive wrapped in
    a Markdown code fence or with a sentence before/after it. Scanning for
    each '{' and letting the decoder decide where the object ends copes with
    all of those without any fence-specific string surgery.

    Raises:
        ValueError: if no position in *text* starts a valid JSON object.
    """
    decoder = json.JSONDecoder()
    start = text.find('{')
    while start != -1:
        try:
            # raw_decode ignores whatever follows the object (closing fence, prose).
            return decoder.raw_decode(text, start)[0]
        except json.JSONDecodeError:
            start = text.find('{', start + 1)
    raise ValueError("no JSON object found in model response")


def _is_valid_candidate_index(value, num_candidates):
    """Return True if *value* is an int in the 1-based range 1..num_candidates."""
    # bool is a subclass of int; a JSON `true` must not be accepted as index 1.
    if isinstance(value, bool) or not isinstance(value, int):
        return False
    return 1 <= value <= num_candidates


def evaluate_candidates_with_rlaif(user_query, ground_truth, candidates):
    """Ask Claude to score the candidates and select only the rejected one.

    The ground truth is used as the "chosen" side of the preference pair, so
    the rater is asked only for a rejected candidate.

    Args:
        user_query: User question; the text up to the first ']' is used as the
            ambiguity category tag shown to the rater.
        ground_truth: Reference clarifying question (the chosen response).
        candidates: Generated candidate questions (the notebook passes five;
            any non-empty sequence is accepted).

    Returns:
        dict parsed from Claude's JSON reply, or None on any failure (empty
        candidate list, API error, unparsable reply, invalid index). On
        success 'rejected_index' is a 1-based position into *candidates*.
        The prompt also requests 'analysis', 'scores', 'rejected_score' and
        'reasoning', but those are not validated here.
    """
    candidates = list(candidates)
    num_candidates = len(candidates)
    if num_candidates == 0:
        print("에러 발생: no candidates to evaluate")
        return None

    # Category tag, e.g. "[AO|WHOM]", taken from the front of the query.
    category = user_query.split(']')[0] + ']'

    # Prompt fragments that depend on how many candidates were supplied.
    numbered_candidates = "\n".join(
        f"{position}. {candidate}"
        for position, candidate in enumerate(candidates, 1)
    )
    score_slots = ", ".join(f"score{position}" for position in range(1, num_candidates + 1))
    index_range = f"1-{num_candidates}"

    prompt = f"""You are an expert rater of clarifying questions for ambiguous queries.

AMBIGUITY CATEGORIES GUIDE:

- EM (Epistemic Misalignment): Questions with unfamiliar entities or self-contradictions
  * UNF (UNFAMILIAR): Query contains unfamiliar entities or facts
  * CONT (CONTRADICTION): Query contains self-contradictions

- LA (Linguistic Ambiguity): Questions with lexical or semantic ambiguity
  * LEX (LEXICAL): Query contains terms with multiple meanings
  * SEM (SEMANTIC): Query lacks context leading to multiple interpretations

- AO (Aleatoric Output): Questions with missing contextual information causing confusion
  * WHOM: Missing information about WHO (person/agent)
  * WHEN: Missing temporal information
  * WHERE: Missing spatial/location information
  * WHAT: Missing task-specific or object information

- NONE: Clear questions that don't require clarification
  * Expected response: <NO_CLARIFYING_QUESTION>

---

EVALUATION TASK:

Query: {user_query}
Category: {category}

Reference Answer (Ground Truth - this is the CHOSEN response):
{ground_truth}

Generated Candidates to Evaluate:
{numbered_candidates}

---

Your Task:

Step 1 - Analyze and score each candidate (0-100):
- Does it correctly address the specific ambiguity type indicated in the category?
- Is it clear, specific, and easy to understand?
- How does it compare to the reference answer?

Scoring guide:
- 85-100: Excellent (addresses ambiguity type correctly, comparable to reference)
- 70-84: Good (addresses ambiguity but slightly less effective)
- 55-69: Acceptable (addresses core issue but noticeably lacking in clarity or specificity)
- 40-54: Weak (partially addresses ambiguity or misses key aspects)
- 0-39: Poor (doesn't address the right ambiguity type or is ineffective)

Step 2 - Select the REJECTED candidate:
- Choose a candidate in the 60-75 point range
- This range is CRITICAL for effective learning in Direct Preference Optimization (DPO)
- The rejected candidate should be "noticeably worse but not terrible"
- If multiple candidates are in 60-75 range, pick the one closest to 67
- If NO candidate is in 60-75 range, pick the one closest to this range
- Avoid candidates above 80 (too similar to reference) or below 50 (too poor to learn from)

Provide your response in JSON format ONLY:
{{
  "analysis": "Brief analysis of all candidates",
  "scores": [{score_slots}],
  "rejected_index": {index_range},
  "rejected_score": actual_rejected_score,
  "reasoning": "Why this rejected candidate has appropriate quality gap for DPO training"
}}

Response:"""

    # Stays None until the API has answered, so the error path can tell
    # "no reply at all" apart from "reply that could not be used".
    response_text = None
    try:
        message = client.messages.create(
            model="claude-sonnet-4-20250514",
            max_tokens=2000,
            messages=[
                {"role": "user", "content": prompt}
            ]
        )
        response_text = message.content[0].text

        result = _extract_json_object(response_text)

        # Callers index `candidates[rejected_index - 1]`; 0 or a negative
        # number would silently select the wrong candidate, so reject
        # anything outside 1..N.
        rejected_index = result.get('rejected_index')
        if not _is_valid_candidate_index(rejected_index, num_candidates):
            raise ValueError(f"invalid rejected_index: {rejected_index!r}")

        return result

    except Exception as e:
        # Best-effort boundary: one bad sample must not abort a long run.
        print(f"에러 발생: {e}")
        print(f"응답 텍스트: {response_text[:200] if response_text is not None else 'N/A'}...")
        return None

print("✓ RLAIF 평가 함수 정의 완료!")

3개 샘플로 출력 테스트

In [None]:
# Smoke test: run the judge on the first three rows and print its verdicts.
print("="*80)
print("3개 샘플로 RLAIF 테스트")
print("="*80 + "\n")

df_test = df.head(3).copy()

for idx in range(df_test.shape[0]):
    row = df_test.iloc[idx]

    # Show the inputs for this sample.
    print(f"\n[샘플 {idx+1}/3]")
    print("-"*80)
    print(f"User Query: {row['user_query']}")
    print(f"Ground Truth: {row['ground_truth']}")

    # Gather the five generated candidates in column order.
    candidates = [row[f'candidate_{n}'] for n in range(1, 6)]

    print("\n후보들:")
    print("\n".join(f"  {i}. {cand}" for i, cand in enumerate(candidates, 1)))

    # Ask Claude to score the candidates and choose the rejected one.
    print("\nClaude 평가 중...")
    result = evaluate_candidates_with_rlaif(
        user_query=row['user_query'],
        ground_truth=row['ground_truth'],
        candidates=candidates,
    )

    if not result:
        print("\n✗ 평가 실패")
    else:
        print("\n✓ 평가 완료!")
        print(f"점수: {result['scores']}")
        print(f"Rejected: 후보 {result['rejected_index']}")
        print(f"이유: {result['reasoning']}")

    print("\n" + "="*80)

    # Pause between requests to stay under the API rate limit.
    time.sleep(2)

print("\n✓ 테스트 완료!")

2개 샘플로 df 잘 저장되나 테스트

In [None]:
# Work on a two-row copy to check that verdicts are written back correctly.
df_test_2 = df.head(2).copy()

print("="*80)
print("2개 샘플로 RLAIF 테스트 및 저장 확인")
print("="*80 + "\n")

# Create the result columns up front, empty for every row.
for column in (
    'rejected_candidate_idx',
    'rejected_text',
    'rejected_score',
    'all_scores',
    'rlaif_reasoning',
):
    df_test_2[column] = None

# Evaluate each sample in turn.
for idx in range(df_test_2.shape[0]):
    row = df_test_2.iloc[idx]

    print(f"\n[샘플 {idx+1}/2]")
    print("-"*80)
    print(f"User Query: {row['user_query'][:80]}...")
    print(f"Ground Truth: {row['ground_truth'][:80]}...")

    # Gather the five generated candidates in column order.
    candidates = [row[f'candidate_{n}'] for n in range(1, 6)]

    # Ask Claude to score the candidates and choose the rejected one.
    print("Claude 평가 중...")
    result = evaluate_candidates_with_rlaif(
        user_query=row['user_query'],
        ground_truth=row['ground_truth'],
        candidates=candidates,
    )

    if not result:
        print("✗ 평가 실패")
    else:
        rejected_idx = result['rejected_index']
        # The judge's index is 1-based; convert it to look up the text.
        rejected_text = candidates[rejected_idx - 1]

        # Store both the index and the text of the rejected candidate.
        df_test_2.at[idx, 'rejected_candidate_idx'] = rejected_idx
        df_test_2.at[idx, 'rejected_text'] = rejected_text
        df_test_2.at[idx, 'rejected_score'] = result['rejected_score']
        df_test_2.at[idx, 'all_scores'] = str(result['scores'])
        df_test_2.at[idx, 'rlaif_reasoning'] = result['reasoning']

        print("✓ 저장 완료!")
        print(f"  - Rejected: 후보 {rejected_idx} (점수: {result['rejected_score']})")

    # Pause between requests to stay under the API rate limit.
    time.sleep(2)

print("\n" + "="*80)
print("✓ 테스트 완료!")
print("="*80)

In [None]:
# Summarise the two-row test frame: its shape, then its column names.
print("저장된 DataFrame 정보:")
print(f"Shape: {df_test_2.shape}")
print(f"컬럼: {list(df_test_2.columns)}\n")

# Banner, then render the whole frame for visual inspection.
print("="*80, "저장된 데이터 확인:", "="*80, sep="\n")

display(df_test_2)

In [None]:
# Add the columns that will receive the judge's verdict for every row.
df['rejected_candidate_idx'] = None
df['rejected_text'] = None
df['rejected_score'] = None
df['all_scores'] = None
df['rlaif_reasoning'] = None

print(f"전체 {len(df)}개 데이터 RLAIF 평가 시작...")

# Process in batches so a checkpoint file is written after every batch.
batch_size = 50
total_batches = (len(df) + batch_size - 1) // batch_size
candidate_columns = [f'candidate_{n}' for n in range(1, 6)]
failed_rows = []  # positional indices of rows that ended up without a verdict

for batch_idx in range(total_batches):
    start_idx = batch_idx * batch_size
    end_idx = min((batch_idx + 1) * batch_size, len(df))

    print(f"\n{'='*80}")
    print(f"[Batch {batch_idx+1}/{total_batches}] Processing {start_idx}-{end_idx}")
    print('='*80)

    for idx in tqdm(range(start_idx, end_idx), desc=f"Batch {batch_idx+1}"):
        row = df.iloc[idx]
        # Rows are read by position, so write back through the matching label
        # rather than assuming the index is 0..N-1.
        label = df.index[idx]

        candidates = [row[column] for column in candidate_columns]

        try:
            result = evaluate_candidates_with_rlaif(
                row['user_query'],
                row['ground_truth'],
                candidates
            )

            if result:
                rejected_idx = result['rejected_index']
                # The judge's index is 1-based. Without this check a 0 or a
                # negative value would silently pick a candidate via negative
                # indexing and store the wrong rejected text.
                if (
                    isinstance(rejected_idx, bool)
                    or not isinstance(rejected_idx, int)
                    or not 1 <= rejected_idx <= len(candidates)
                ):
                    raise ValueError(f"invalid rejected_index: {rejected_idx!r}")

                # Collect every value first so a missing key cannot leave a
                # half-filled row behind.
                verdict = {
                    'rejected_candidate_idx': rejected_idx,
                    'rejected_text': candidates[rejected_idx - 1],
                    'rejected_score': result['rejected_score'],
                    'all_scores': str(result['scores']),
                    'rlaif_reasoning': result['reasoning'],
                }
                for column, value in verdict.items():
                    df.at[label, column] = value
            else:
                failed_rows.append(idx)

            # Pause between requests to stay under the API rate limit.
            time.sleep(1.5)

        except Exception as e:
            # Best-effort: log, back off a little longer, move to the next row.
            failed_rows.append(idx)
            print(f"\n에러 (idx={idx}): {e}")
            time.sleep(5)

    # Checkpoint: everything processed so far, one file per batch.
    temp_path = f'/content/drive/MyDrive/Colab Notebooks/woke-odds/rlaif_batch_{batch_idx+1}.csv'
    df.iloc[:end_idx].to_csv(temp_path, index=False, encoding='utf-8-sig')
    print(f"\n✓ 중간 저장: {temp_path}")

print("\n전체 RLAIF 평가 완료!")
if failed_rows:
    # Make unlabelled rows visible instead of leaving silent gaps in the data.
    print(f"평가 실패: {len(failed_rows)}개 (idx: {failed_rows})")

# Final save of the complete frame.
final_path = '/content/drive/MyDrive/Colab Notebooks/woke-odds/rlaif_full_results.csv'
df.to_csv(final_path, index=False, encoding='utf-8-sig')
print(f"최종 저장: {final_path}")

In [None]:
# Convert to DPO format: one {prompt, chosen, rejected} record per usable row.
dpo_dataset = []
skipped_rows = 0

for _, row in df.iterrows():
    record = {
        'prompt': row['user_query'],
        'chosen': row['ground_truth'],
        'rejected': row['rejected_text']
    }
    # Rows whose evaluation failed still have rejected_text = None. Exporting
    # them would put `null` (or the non-JSON token NaN) into the training
    # file, so any record with a missing field is dropped.
    if any(pd.isna(value) for value in record.values()):
        skipped_rows += 1
        continue
    dpo_dataset.append(record)

df_dpo = pd.DataFrame(dpo_dataset)

# Write as JSON Lines; ensure_ascii=False keeps non-ASCII text readable.
dpo_path = '/content/drive/MyDrive/Colab Notebooks/woke-odds/dpo_final_dataset.jsonl'
with open(dpo_path, 'w', encoding='utf-8') as f:
    f.writelines(
        json.dumps(record, ensure_ascii=False) + '\n'
        for record in dpo_dataset
    )

print(f"✓ DPO 데이터셋 저장 완료: {dpo_path}")
print(f"  - 저장: {len(dpo_dataset)}개, 제외(누락된 필드): {skipped_rows}개")