In [None]:
# Print the NVIDIA GPU status for this runtime (shell command run through IPython).
!nvidia-smi

In [None]:
# Report the PyTorch / CUDA environment this notebook is running in.
import torch

print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    # total_memory is reported in bytes; divide by 1024**3 to show GiB.
    print(f"사용 가능 메모리: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.2f} GB")
else:
    # Querying device 0 raises on a CPU-only runtime, so report that instead of crashing.
    print("사용 가능 메모리: N/A (CUDA 사용 불가)")
print(f"CUDA version: {torch.version.cuda}")

In [None]:
# Set the working directory so the relative data/output paths below resolve there.
# NOTE(review): hard-coded Google Drive path — presumably Drive must already be
# mounted at /content/drive; no mount cell is visible in this notebook.
import os
os.chdir('/content/drive/MyDrive/woke-odds')
print(os.getcwd())

In [None]:
# Load the dataset: two JSON-lines files (relative to the working directory)
# exposed as the 'train' and 'validation' splits.
from datasets import load_dataset
dataset = load_dataset('json', data_files={
    'train': 'clarify_sft_train.jsonl',
    'validation': 'clarify_sft_valid.jsonl',
})
# Report how many records each split contains.
print(f"훈련 데이터: {len(dataset['train'])}개")
print(f"검증 데이터: {len(dataset['validation'])}개")

In [None]:
# Display the first training record to inspect its structure.
dataset['train'][0]

### 훈련

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling
import torch

In [None]:
# Base checkpoint to fine-tune.
model_name = "microsoft/Phi-4-mini-reasoning"

tokenizer = AutoTokenizer.from_pretrained(model_name)
# Reuse the EOS token as the padding token and pad on the right.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# Load the causal-LM weights in bfloat16 with device_map="auto".
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    use_cache=False  # set so the model is compatible with gradient checkpointing
    )

In [None]:
#LoRA Config 설정
from peft import LoraConfig, get_peft_model

# Build the LoRA configuration from a plain settings mapping:
# rank-8 adapters on the qkv_proj and o_proj modules.
lora_config = LoraConfig(**{
    "r": 8,
    "lora_alpha": 16,
    "target_modules": ["qkv_proj", "o_proj"],
    "lora_dropout": 0.05,
    "bias": "none",
    "task_type": "CAUSAL_LM",
})

# Wrap the base model with the adapters and report the trainable parameter count.
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

# Sanity check: name the first parameter that has requires_grad set, if any.
first_trainable = next(
    (name for name, param in model.named_parameters() if param.requires_grad),
    None,
)
if first_trainable is None:
    print("❌ 학습 가능한 파라미터가 없습니다!")
else:
    print(f"✅ {first_trainable}: requires_grad=True")

In [None]:
# Check that the tokenizer's chat template is applied correctly, using a small
# hand-written system / user / assistant conversation.
test_messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "[EM|UNF] Test question?"},
    {"role": "assistant", "content": "Test answer."}
]

# Render to text only (no tokenization, no trailing generation prompt).
formatted_text = tokenizer.apply_chat_template(
    test_messages,
    tokenize=False,
    add_generation_prompt=False
)

print("=== Formatted Text ===")
print(formatted_text)
print("\n=== Tokenized ===")
# Tokenize the rendered text and report its length in tokens.
tokens = tokenizer(formatted_text)
print(f"Token count: {len(tokens['input_ids'])}")

In [None]:
# Data preprocessing
def preprocess_function(examples, max_length=768):
    """Render each conversation with the chat template and tokenize it for causal-LM training.

    Relies on the notebook-level ``tokenizer``.

    Args:
        examples: A batch as passed by ``Dataset.map(batched=True)``.
            ``examples['messages']`` is iterated as a list of conversations, each
            handed to ``tokenizer.apply_chat_template`` unchanged.
        max_length: Maximum number of tokens kept per example; longer sequences
            are truncated. Defaults to 768.

    Returns:
        The tokenizer output with an added ``labels`` entry that mirrors
        ``input_ids``. Sequences are left unpadded; padding is left to the
        data collator.
    """
    # Convert every 'messages' conversation into one training string.
    texts = [
        tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=False
        )
        for messages in examples['messages']
    ]

    # The chat template already emits the special tokens it needs, so the
    # tokenizer must not add a second set on top of the rendered text.
    model_inputs = tokenizer(
        texts,
        max_length=max_length,
        truncation=True,
        padding=False,  # handled per batch by the data collator
        add_special_tokens=False,
    )

    # Causal LM: labels are the input ids themselves. Copy each sequence so
    # the labels never alias the input_ids lists.
    model_inputs["labels"] = [list(ids) for ids in model_inputs["input_ids"]]

    return model_inputs

# Apply the preprocessing to every split in batches and drop the original
# columns (taken from the train split) from the result.
tokenized_dataset = dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=dataset["train"].column_names
)

In [None]:
# Check the tokenization result: features of the train split plus the type and
# length of the first example's input_ids.
print("=== 토크나이즈 확인 ===")
print(f"Keys: {tokenized_dataset['train'].features}")
print(f"Sample input_ids type: {type(tokenized_dataset['train'][0]['input_ids'])}")
print(f"Sample input_ids length: {len(tokenized_dataset['train'][0]['input_ids'])}")

In [None]:
# Disable Weights & Biases logging via environment variable.
import os
os.environ["WANDB_DISABLED"] = "true"

In [None]:
# Install bitsandbytes quietly.
# NOTE(review): presumably needed for the optim="adamw_8bit" setting in TrainingArguments — confirm; version is not pinned.
!pip install -q bitsandbytes

In [None]:
# TrainingArguments configuration for the LoRA fine-tuning run.
training_args = TrainingArguments(
    output_dir='./clarifying_phi_v1',
    num_train_epochs=3,
    bf16=True,

    # Batch of 4 with 4 accumulation steps per device; gradient checkpointing
    # is enabled in non-reentrant mode.
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    per_device_eval_batch_size=8,
    gradient_checkpointing=True,
    gradient_checkpointing_kwargs={"use_reentrant": False},

    max_grad_norm=1.0,
    weight_decay=0.01,

    dataloader_pin_memory=False,
    dataloader_num_workers=2, # parallel data loading
    torch_empty_cache_steps=50,

    logging_dir='./logs_clarifying_phi_v1',
    logging_steps= 25,

    # Evaluate and save on the same 128-step interval.
    eval_strategy="steps",
    eval_steps=128,
    save_steps=128,
    save_safetensors=True,

    optim="adamw_8bit",

    # Reload the checkpoint with the best eval_loss when training ends and
    # keep at most 2 checkpoints.
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    save_total_limit=2,

    learning_rate=5e-5,
    lr_scheduler_type="cosine",
    warmup_ratio=0.1,

    report_to=["tensorboard"],
)

In [None]:
# Data Collator

#data_collator = DataCollatorForLanguageModeling(
#    tokenizer=tokenizer,
#    mlm=False
#)

In [None]:
# Custom Data Collator
from dataclasses import dataclass
from typing import Any, Dict, List
import torch

@dataclass
class DataCollatorForCausalLM:
    """Right-pad a list of tokenized examples into one batch of tensors.

    Each feature must provide ``input_ids``, ``attention_mask`` and ``labels``
    as Python lists. Every example is extended to the longest ``input_ids`` in
    the batch: ``input_ids`` with the tokenizer's pad token id,
    ``attention_mask`` with 0 and ``labels`` with -100.
    """
    # Only ``pad_token_id`` is read from this object.
    tokenizer: Any

    def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, Any]:
        """Collate ``features`` into a dict of tensors keyed like the inputs."""
        # Longest sequence in this batch decides the padded width.
        target_len = max(len(item["input_ids"]) for item in features)
        pad_id = self.tokenizer.pad_token_id

        def pad_row(item):
            # The gap is measured on input_ids and applied to all three fields.
            gap = target_len - len(item["input_ids"])
            return (
                item["input_ids"] + [pad_id] * gap,
                item["attention_mask"] + [0] * gap,
                item["labels"] + [-100] * gap,  # -100 is ignored by the loss
            )

        ids_rows, mask_rows, label_rows = zip(*(pad_row(item) for item in features))

        # Convert the padded rows to tensors, keeping the original key order.
        return {
            "input_ids": torch.tensor(ids_rows),
            "attention_mask": torch.tensor(mask_rows),
            "labels": torch.tensor(label_rows),
        }

# Create the data collator, bound to the notebook's tokenizer.
data_collator = DataCollatorForCausalLM(tokenizer=tokenizer)

In [None]:
# Test: check the collator on the first two tokenized training examples.
test_features = [
    tokenized_dataset["train"][i] for i in range(2)
]

# Lengths before padding.
print("=== Collator 테스트 ===")
print(f"샘플 1 길이: {len(test_features[0]['input_ids'])}")
print(f"샘플 2 길이: {len(test_features[1]['input_ids'])}")

batch = data_collator(test_features)

# Shapes after padding, and how many label positions were masked with -100.
print(f"\n배치 shape:")
print(f"input_ids: {batch['input_ids'].shape}")
print(f"attention_mask: {batch['attention_mask'].shape}")
print(f"labels: {batch['labels'].shape}")
print(f"\nlabels에서 -100 개수: {(batch['labels'] == -100).sum().item()}")

In [None]:
# Create the Trainer: LoRA-wrapped model, tokenized train/validation splits,
# the custom padding collator, and early stopping.
from transformers import EarlyStoppingCallback
import torch

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    callbacks=[
        EarlyStoppingCallback(
            early_stopping_patience=3,  # stop after 3 consecutive evaluations without improvement
            early_stopping_threshold=0.01  # minimum improvement threshold
        )
    ]
)

In [None]:
# Run the fine-tuning loop with the Trainer configured above.
trainer.train()

In [None]:
# Save the best model: writes `model` (the object returned by get_peft_model)
# and the tokenizer to the same directory.
model.save_pretrained('./clarifying_phi_v1/checkpoint-best')
tokenizer.save_pretrained('./clarifying_phi_v1/checkpoint-best')

### 평가

In [None]:
from transformers import pipeline
from datasets import load_dataset
import torch
from tqdm import tqdm
import pandas as pd
from sklearn.metrics import accuracy_score
import re

In [None]:
# Path of the directory the trained model and tokenizer were saved to.
model_path = './clarifying_phi_v1/checkpoint-best'

# Load the tokenizer saved alongside the model.
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Fall back to EOS as the padding token if the saved tokenizer has none.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Load the model: bfloat16 with automatic placement on GPU, float32 on CPU.
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
    device_map="auto" if torch.cuda.is_available() else None
)

# Inference only: switch to eval mode on every device. This was previously
# done only when CUDA was available, although it is unrelated to the GPU.
model = model.eval()

# Re-create the data collator with the reloaded tokenizer.
data_collator = DataCollatorForCausalLM(tokenizer=tokenizer)

In [None]:
# Smoke-test generation on the first 3 validation samples.
# NOTE(review): the original comment said this uses the Trainer's model, but
# `model` is whatever is currently bound — the checkpoint reloaded from disk
# when the cells run in order.
import torch

# Pick 3 samples to test (only their positions are used below).
test_samples = [tokenized_dataset["validation"][i] for i in range(3)]

print("=== 모델 출력 테스트 (3개 샘플) ===\n")

for idx, sample_data in enumerate(test_samples):
    # Fetch the original messages from the un-tokenized dataset.
    original_sample = dataset["validation"][idx]
    messages = original_sample['messages']

    # Use only the non-assistant (system + user) messages as the prompt.
    input_messages = [msg for msg in messages if msg['role'] != 'assistant']

    # Apply the chat template, ending with the generation prompt.
    prompt = tokenizer.apply_chat_template(
        input_messages,
        tokenize=False,
        add_generation_prompt=True
    )

    # Tokenize and move to the model's device.
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    # Generate with sampling (temperature 0.7, top-p 0.9), so output varies between runs.
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=150,
            temperature=0.7,
            top_p=0.9,
            do_sample=True,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id
        )

    # Decode only the newly generated tokens (skip the prompt part).
    generated_text = tokenizer.decode(
        outputs[0][inputs['input_ids'].shape[1]:],
        skip_special_tokens=True
    )

    # Reference answer and query: first assistant / first user message.
    ground_truth = [msg['content'] for msg in messages if msg['role'] == 'assistant'][0]
    user_query = [msg['content'] for msg in messages if msg['role'] == 'user'][0]

    # Print query, reference and model output side by side.
    print(f"[샘플 {idx+1}]")
    print(f"User Query: {user_query}")
    print(f"\nGround Truth: {ground_truth}")
    print(f"\nModel Output: {generated_text.strip()}")
    print("\n" + "="*80 + "\n")

In [None]:
# Select the evaluation dataset: the tokenized validation split.
# Prints its size and its first (tokenized) example.
eval_dataset = tokenized_dataset["validation"]

print(f"테스트 샘플 수: {len(eval_dataset)}")
print(f"첫 번째 샘플:\n{eval_dataset[0]}")

In [None]:
# Model prediction function
def generate_clarifying_question(messages, model, tokenizer, max_new_tokens=150,
                                 do_sample=True, temperature=0.7, top_p=0.9):
    """Generate the model's reply (clarifying question) for one conversation.

    Args:
        messages: List of ``{"role", "content"}`` dicts. Messages whose role is
            ``'assistant'`` are dropped before prompting.
        model: Object exposing ``device`` and ``generate(**inputs, ...)``.
        tokenizer: Tokenizer exposing ``apply_chat_template``, ``__call__``,
            ``decode``, ``pad_token_id`` and ``eos_token_id``.
        max_new_tokens: Upper bound on the number of generated tokens.
        do_sample: Sample when True (the default); pass False for
            deterministic decoding, e.g. for reproducible evaluation.
        temperature: Sampling temperature; only forwarded when ``do_sample``.
        top_p: Nucleus sampling threshold; only forwarded when ``do_sample``.

    Returns:
        The decoded continuation (prompt tokens excluded, special tokens
        skipped), stripped of surrounding whitespace.
    """
    # Use only the system + user messages (assistant turns are excluded).
    input_messages = [msg for msg in messages if msg['role'] != 'assistant']

    # Apply the chat template and append the assistant generation prompt.
    prompt = tokenizer.apply_chat_template(
        input_messages,
        tokenize=False,
        add_generation_prompt=True
    )

    # The rendered prompt already contains the template's special tokens, so
    # the tokenizer must not add another set.
    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        add_special_tokens=False
    ).to(model.device)

    generation_kwargs = {
        "max_new_tokens": max_new_tokens,
        "do_sample": do_sample,
        "pad_token_id": tokenizer.pad_token_id,
        "eos_token_id": tokenizer.eos_token_id,
    }
    if do_sample:
        # Sampling knobs are only meaningful when sampling is enabled.
        generation_kwargs["temperature"] = temperature
        generation_kwargs["top_p"] = top_p

    # Generate without tracking gradients.
    with torch.no_grad():
        outputs = model.generate(**inputs, **generation_kwargs)

    # Decode only the tokens that follow the prompt.
    prompt_length = inputs['input_ids'].shape[1]
    generated_text = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

    return generated_text.strip()

In [None]:
# Use the original (un-tokenized) validation split; this rebinds eval_dataset.
eval_dataset = dataset["validation"]

# Run prediction over the whole evaluation split, collecting outputs and references in order.
predictions = []
ground_truths = []

print("예측 시작...")
for example in tqdm(eval_dataset):
    messages = example['messages']

    # Model prediction for this conversation.
    predicted = generate_clarifying_question(messages, model, tokenizer)
    predictions.append(predicted)

    # Reference answer: the first assistant message.
    ground_truth = [msg['content'] for msg in messages if msg['role'] == 'assistant'][0]
    ground_truths.append(ground_truth)

print(f"예측 완료: {len(predictions)}개 샘플")

In [None]:
# Collect the results in a DataFrame: one row per evaluated example.
# The query column takes the FIRST user message of each example (empty string
# if there is none), so it always has exactly one entry per example and lines
# up with ground_truths / predictions even for multi-turn conversations.
results_df = pd.DataFrame({
    'user_query': [
        next((msg['content'] for msg in example['messages'] if msg['role'] == 'user'), '')
        for example in eval_dataset
    ],
    'ground_truth': ground_truths,
    'prediction': predictions
})

# Show the first 5 results (fewer if the frame is shorter).
print("=== 예측 결과 샘플 ===")
for idx in range(min(5, len(results_df))):
    print(f"\n[샘플 {idx+1}]")
    print(f"Query: {results_df.iloc[idx]['user_query']}")
    print(f"Ground Truth: {results_df.iloc[idx]['ground_truth']}")
    print(f"Prediction: {results_df.iloc[idx]['prediction']}")
    print("-" * 80)

In [None]:
# Write the prediction table to CSV (UTF-8, without the index column).
results_df.to_csv('clarify_phi_v1_pred_results.csv', index=False, encoding='utf-8')

In [None]:
# Qualitative evaluation: <NO_CLARIFYING_QUESTION> accuracy
def extract_no_clarification_tag(text):
    """Return True when ``text`` contains the <NO_CLARIFYING_QUESTION> tag.

    The text is upper-cased before the search, so the match ignores letter case.
    """
    marker = '<NO_CLARIFYING_QUESTION>'
    normalized = text.upper()
    return normalized.find(marker) >= 0

# Compare tag presence between references and predictions, element by element.
gt_has_tag = [extract_no_clarification_tag(gt) for gt in ground_truths]
pred_has_tag = [extract_no_clarification_tag(pred) for pred in predictions]

# Fraction of samples where prediction and reference agree on tag presence.
tag_accuracy = accuracy_score(gt_has_tag, pred_has_tag)

print(f"\n=== <NO_CLARIFYING_QUESTION> 태그 정확도 ===")
print(f"정확도: {tag_accuracy:.2%}")
print(f"Ground Truth에서 태그 있는 샘플: {sum(gt_has_tag)}/{len(gt_has_tag)}")
print(f"Prediction에서 태그 있는 샘플: {sum(pred_has_tag)}/{len(pred_has_tag)}")

### Semantic Similarity & BERTScore

In [None]:
# Install the packages imported in the next cell (sentence_transformers, bert_score); versions are not pinned.
!pip install sentence-transformers bert-score

In [None]:
from sentence_transformers import SentenceTransformer, util
from bert_score import score
import numpy as np
import pandas as pd

In [None]:
# Load the Sentence-BERT model used to embed predictions and references.
semantic_model = SentenceTransformer('all-MiniLM-L6-v2')

# Create embeddings for both lists (same order as predictions / ground_truths).
pred_embeddings = semantic_model.encode(predictions, show_progress_bar=True)
gt_embeddings = semantic_model.encode(ground_truths, show_progress_bar=True)

# Compute the cosine similarity of each prediction/reference pair.
semantic_scores = []
for pred_emb, gt_emb in zip(pred_embeddings, gt_embeddings):
    similarity = util.cos_sim(pred_emb, gt_emb).item()
    semantic_scores.append(similarity)

# Summary statistics over all pairs.
avg_semantic = np.mean(semantic_scores)
std_semantic = np.std(semantic_scores)
min_semantic = np.min(semantic_scores)
max_semantic = np.max(semantic_scores)

print("=== Semantic Similarity 결과 ===")
print(f"평균 유사도: {avg_semantic:.4f}")
print(f"표준편차: {std_semantic:.4f}")
print(f"최소값: {min_semantic:.4f}")
print(f"최대값: {max_semantic:.4f}")


In [None]:
# Per-sample semantic similarity: show the 5 best and 5 worst matches.
semantic_df = pd.DataFrame({
    'ground_truth': ground_truths,
    'prediction': predictions,
    'semantic_score': semantic_scores
})

# Highest score first, so position in this frame is the rank.
semantic_df_sorted = semantic_df.sort_values('semantic_score', ascending=False)

print("\n=== Semantic Similarity 상위 5개 (가장 유사) ===")
# `idx` is the original row label, not the rank; the rank comes from enumerate.
for rank, (idx, row) in enumerate(semantic_df_sorted.head(5).iterrows(), start=1):
    print(f"\n[순위 {rank}] Score: {row['semantic_score']:.4f}")
    print(f"GT: {row['ground_truth'][:100]}...")
    print(f"Pred: {row['prediction'][:100]}...")
    print("-" * 80)

print("\n=== Semantic Similarity 하위 5개 (가장 불일치) ===")
semantic_bottom = semantic_df_sorted.tail(5)
# Rank of the first bottom row within the full sorted frame (1-based).
semantic_bottom_start = len(semantic_df_sorted) - len(semantic_bottom) + 1
for rank, (idx, row) in enumerate(semantic_bottom.iterrows(), start=semantic_bottom_start):
    print(f"\n[순위 {rank}] Score: {row['semantic_score']:.4f}")
    print(f"GT: {row['ground_truth'][:100]}...")
    print(f"Pred: {row['prediction'][:100]}...")
    print("-" * 80)

In [None]:
# Compute BERTScore for every prediction against its reference (English),
# on GPU when available.
P, R, F1 = score(
    predictions,
    ground_truths,
    lang='en',
    verbose=True,
    device='cuda' if torch.cuda.is_available() else 'cpu'
)

# Convert the results to numpy arrays on the CPU.
P_scores = P.cpu().numpy()
R_scores = R.cpu().numpy()
F1_scores = F1.cpu().numpy()

# Mean precision / recall / F1 over all samples.
avg_P = np.mean(P_scores)
avg_R = np.mean(R_scores)
avg_F1 = np.mean(F1_scores)

print("=== BERTScore 결과 ===")
print(f"평균 Precision: {avg_P:.4f}")
print(f"평균 Recall: {avg_R:.4f}")
print(f"평균 F1: {avg_F1:.4f}")

In [None]:
# Per-sample BERTScore: show the 5 best and 5 worst matches by F1.
bertscore_df = pd.DataFrame({
    'ground_truth': ground_truths,
    'prediction': predictions,
    'bert_P': P_scores,
    'bert_R': R_scores,
    'bert_F1': F1_scores
})

# Highest F1 first, so position in this frame is the rank.
bertscore_df_sorted = bertscore_df.sort_values('bert_F1', ascending=False)

print("\n=== BERTScore F1 상위 5개 (가장 유사) ===")
# `idx` is the original row label, not the rank; the rank comes from enumerate.
for rank, (idx, row) in enumerate(bertscore_df_sorted.head(5).iterrows(), start=1):
    print(f"\n[순위 {rank}] P: {row['bert_P']:.4f}, R: {row['bert_R']:.4f}, F1: {row['bert_F1']:.4f}")
    print(f"GT: {row['ground_truth'][:100]}...")
    print(f"Pred: {row['prediction'][:100]}...")
    print("-" * 80)

print("\n=== BERTScore F1 하위 5개 (가장 불일치) ===")
bertscore_bottom = bertscore_df_sorted.tail(5)
# Rank of the first bottom row within the full sorted frame (1-based).
bertscore_bottom_start = len(bertscore_df_sorted) - len(bertscore_bottom) + 1
for rank, (idx, row) in enumerate(bertscore_bottom.iterrows(), start=bertscore_bottom_start):
    print(f"\n[순위 {rank}] P: {row['bert_P']:.4f}, R: {row['bert_R']:.4f}, F1: {row['bert_F1']:.4f}")
    print(f"GT: {row['ground_truth'][:100]}...")
    print(f"Pred: {row['prediction'][:100]}...")
    print("-" * 80)

In [None]:
# Attach both per-sample metrics to the results table (mutates results_df in place).
results_df['semantic_similarity'] = semantic_scores
results_df['bert_F1'] = F1_scores

print(results_df.head())
print(f"\n컬럼 목록: {list(results_df.columns)}")

In [None]:
# Write the results table again, to the same file name as the earlier save,
# now including the metric columns added above.
results_df.to_csv('clarify_phi_v1_pred_results.csv', index=False, encoding='utf-8')

In [None]:
# Compare the two metrics (semantic similarity vs. BERTScore F1).
combined_df = pd.DataFrame({
    'ground_truth': ground_truths,
    'prediction': predictions,
    'semantic_similarity': semantic_scores,
    'bert_F1': F1_scores
})

# Correlation analysis: off-diagonal entry of the 2x2 correlation matrix.
correlation = np.corrcoef(semantic_scores, F1_scores)[0, 1]

print("\n" + "="*80)
print("=== 두 지표 비교 ===")
print("="*80)
print(f"Semantic Similarity vs BERTScore F1 상관계수: {correlation:.4f}")

# Absolute per-sample difference between the two scores, largest first.
# NOTE(review): combined_df_sorted is built here but not used in this cell.
combined_df['score_diff'] = abs(combined_df['semantic_similarity'] - combined_df['bert_F1'])
combined_df_sorted = combined_df.sort_values('score_diff', ascending=False)

# Final summary, reusing the averages computed in the earlier metric cells.
print("\n" + "="*80)
print("=== 최종 종합 평가 ===")
print("="*80)
print(f"총 샘플 수: {len(predictions)}")
print(f"\nSemantic Similarity:")
print(f"  - 평균: {avg_semantic:.4f}")
print(f"  - 표준편차: {std_semantic:.4f}")
print(f"\nBERTScore:")
print(f"  - 평균 Precision: {avg_P:.4f}")
print(f"  - 평균 Recall: {avg_R:.4f}")
print(f"  - 평균 F1: {avg_F1:.4f}")
print(f"\n상관계수: {correlation:.4f}")

In [None]:
# Show the 10 samples with the lowest semantic similarity.
bottom_10 = results_df.nsmallest(10, 'semantic_similarity')

print("=== Semantic Similarity 하위 10개 ===\n")

# Print query, reference and prediction for each row (idx is unused).
for idx, row in bottom_10.iterrows():
    print(f"User Query: {row['user_query']}")
    print(f"Ground Truth: {row['ground_truth']}")
    print(f"Prediction: {row['prediction']}")
    print("-" * 100)
    print()

In [None]:
# Show the 10 samples with the lowest BERTScore F1 (rebinds bottom_10).
bottom_10 = results_df.nsmallest(10, 'bert_F1')

print("=== bert_F1 하위 10개 ===\n")

# Print query, reference and prediction for each row (idx is unused).
for idx, row in bottom_10.iterrows():
    print(f"User Query: {row['user_query']}")
    print(f"Ground Truth: {row['ground_truth']}")
    print(f"Prediction: {row['prediction']}")
    print("-" * 100)
    print()