In [None]:
# Mount Google Drive at /content/drive; the cells below read and write
# their JSONL files under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Downsample only the NONE class so that it makes up 25% of the balanced set
# Load the already-formatted data

import json
import random
# Seed the RNG so the random.sample / random.shuffle calls in this cell are reproducible.
random.seed(42)

def load_jsonl(filepath):
    """Load a JSON Lines file into a list of parsed records.

    Every non-blank line is parsed with ``json.loads``. Blank or
    whitespace-only lines (for example a trailing empty line at the end of
    the file) are skipped instead of raising ``json.JSONDecodeError``.

    Args:
        filepath: Path of the UTF-8 encoded ``.jsonl`` file to read.

    Returns:
        A list with one parsed JSON value per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    data = []
    with open(filepath, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                # Nothing to parse on an empty line; json.loads('') would raise.
                continue
            data.append(json.loads(line))
    return data

# Load the already-formatted training data
formatted_train = load_jsonl('/content/drive/MyDrive/Colab Notebooks/woke-odds/ambiguity_train_1110_formatted.jsonl')

print(f"원본 데이터: {len(formatted_train)}개")

# Split into NONE vs. everything else, looking only at the answer part of the
# prompt (the text after the last [/INST]). str.rpartition returns the whole
# string as its last element when the marker is absent, so a sample without
# the marker is still matched against its full text.
none_samples = []
other_samples = []

for sample in formatted_train:
    answer_part = sample['text'].rpartition('[/INST]')[2]
    if 'NONE|NONE' in answer_part:
        none_samples.append(sample)
    else:
        other_samples.append(sample)

# "or 1" keeps the percentage computations from dividing by zero on empty input
total_original = len(formatted_train) or 1

print(f"\n=== 원본 분포 ===")
print(f"NONE: {len(none_samples)}개 ({len(none_samples)/total_original*100:.1f}%)")
print(f"기타: {len(other_samples)}개 ({len(other_samples)/total_original*100:.1f}%)")

# Downsample NONE to 25% of the result:
#   none / (none + other) = 1/4  =>  none = other / 3
# Capped at the number of NONE samples available, because random.sample
# raises ValueError when asked for more items than the population holds.
none_target = min(len(other_samples) // 3, len(none_samples))
sampled_none = random.sample(none_samples, none_target)

# Merge and shuffle
balanced_data = other_samples + sampled_none
random.shuffle(balanced_data)

total_balanced = len(balanced_data) or 1

print(f"\n=== 조정 후 분포 ===")
print(f"NONE: {len(sampled_none)}개 ({len(sampled_none)/total_balanced*100:.1f}%)")
print(f"기타: {len(other_samples)}개 ({len(other_samples)/total_balanced*100:.1f}%)")
print(f"총: {len(balanced_data)}개")

# Save as JSON Lines (ensure_ascii=False keeps non-ASCII text readable)
output_path = '/content/drive/MyDrive/Colab Notebooks/woke-odds/ambiguity_train_1110_formatted_balanced_25.jsonl'

with open(output_path, 'w', encoding='utf-8') as f:
    for item in balanced_data:
        f.write(json.dumps(item, ensure_ascii=False) + '\n')

print(f"\n✓ 저장 완료: {output_path}")

원본 데이터: 2561개

=== 원본 분포 ===
NONE: 1280개 (50.0%)
기타: 1281개 (50.0%)

=== 조정 후 분포 ===
NONE: 427개 (25.0%)
기타: 1281개 (75.0%)
총: 1708개

✓ 저장 완료: /content/drive/MyDrive/Colab Notebooks/woke-odds/ambiguity_train_1110_formatted_balanced_25.jsonl


In [None]:
# Undersampling(NONE) + Oversampling(AO 2배)

import json
import random
# Seed the RNG so the random.sample / random.shuffle calls in this cell are reproducible.
random.seed(42)

def load_jsonl(filepath):
    """Load a JSON Lines file into a list of parsed records.

    Every non-blank line is parsed with ``json.loads``. Blank or
    whitespace-only lines (for example a trailing empty line at the end of
    the file) are skipped instead of raising ``json.JSONDecodeError``.

    Args:
        filepath: Path of the UTF-8 encoded ``.jsonl`` file to read.

    Returns:
        A list with one parsed JSON value per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    data = []
    with open(filepath, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                # Nothing to parse on an empty line; json.loads('') would raise.
                continue
            data.append(json.loads(line))
    return data

# Load the formatted training data
formatted_train = load_jsonl('/content/drive/MyDrive/Colab Notebooks/woke-odds/ambiguity_train_1110_formatted.jsonl')

print(f"원본 데이터: {len(formatted_train)}개")

# Split into NONE, AO and everything else, looking only at the answer part
# of the prompt (the text after the last [/INST]). str.rpartition returns the
# whole string as its last element when the marker is absent, so a sample
# without the marker is still matched against its full text.
none_samples = []
ao_samples = []
other_samples = []

for sample in formatted_train:
    answer_part = sample['text'].rpartition('[/INST]')[2]
    if 'NONE|NONE' in answer_part:
        none_samples.append(sample)
    elif 'AO|' in answer_part:
        ao_samples.append(sample)
    else:
        other_samples.append(sample)

# "or 1" keeps the percentage computations from dividing by zero on empty input
total_original = len(formatted_train) or 1

print(f"\n=== 원본 분포 ===")
print(f"NONE: {len(none_samples)}개 ({len(none_samples)/total_original*100:.1f}%)")
print(f"AO: {len(ao_samples)}개")
print(f"기타(EM/LA): {len(other_samples)}개")
print(f"총: {len(formatted_train)}개")

# Oversample AO 2x by including every AO sample twice
ao_duplicated = ao_samples + ao_samples

# Target NONE share of 40%:
#   none / (none + rest) = 0.4  =>  none = rest * 2 / 3
# Integer arithmetic gives the exact floor; the float form (x * 0.4 / 0.6)
# can land a hair below an integer and floor one too low.
total_non_none = len(ao_duplicated) + len(other_samples)
none_target = total_non_none * 2 // 3

# If there are not enough NONE samples to reach the target, use all of them
if none_target > len(none_samples):
    print(f"\n⚠️ none_target({none_target})이 너무 많아서 전체 NONE 사용")
    sampled_none = none_samples  # use every NONE sample
else:
    sampled_none = random.sample(none_samples, none_target)

# Merge and shuffle
balanced_data = sampled_none + ao_duplicated + other_samples
random.shuffle(balanced_data)

total_balanced = len(balanced_data) or 1

print(f"\n=== 조정 후 분포 ===")
print(f"NONE: {len(sampled_none)}개 ({len(sampled_none)/total_balanced*100:.1f}%)")
print(f"AO: {len(ao_duplicated)}개 (×2 복제!)")
print(f"기타(EM/LA): {len(other_samples)}개")
print(f"총: {len(balanced_data)}개 (원본: {len(formatted_train)}개)")

# Save as JSON Lines (ensure_ascii=False keeps non-ASCII text readable)
output_path = '/content/drive/MyDrive/Colab Notebooks/woke-odds/ambiguity_train_1110_formatted_balanced_ao2x.jsonl'

with open(output_path, 'w', encoding='utf-8') as f:
    for item in balanced_data:
        f.write(json.dumps(item, ensure_ascii=False) + '\n')

print(f"\n✓ 저장 완료: {output_path}")
print(f"파일명: ambiguity_train_1110_formatted_balanced_ao2x.jsonl")

원본 데이터: 2561개

=== 원본 분포 ===
NONE: 1280개 (50.0%)
AO: 641개
기타(EM/LA): 640개
총: 2561개

⚠️ none_target(1281)이 너무 많아서 전체 NONE 사용

=== 조정 후 분포 ===
NONE: 1280개 (40.0%)
AO: 1282개 (×2 복제!)
기타(EM/LA): 640개
총: 3202개 (원본: 2561개)

✓ 저장 완료: /content/drive/MyDrive/Colab Notebooks/woke-odds/ambiguity_train_1110_formatted_balanced_ao2x.jsonl
파일명: ambiguity_train_1110_formatted_balanced_ao2x.jsonl


In [None]:
# AO 카테고리랑 UNF만 1.5배
# 세분화된 분류

import json
import random
# Seed the RNG so the random oversampling and shuffling in this cell are reproducible.
random.seed(42)

def load_jsonl(filepath):
    """Load a JSON Lines file into a list of parsed records.

    Every non-blank line is parsed with ``json.loads``. Blank or
    whitespace-only lines (for example a trailing empty line at the end of
    the file) are skipped instead of raising ``json.JSONDecodeError``.

    Args:
        filepath: Path of the UTF-8 encoded ``.jsonl`` file to read.

    Returns:
        A list with one parsed JSON value per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    data = []
    with open(filepath, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                # Nothing to parse on an empty line; json.loads('') would raise.
                continue
            data.append(json.loads(line))
    return data

formatted_train = load_jsonl('/content/drive/MyDrive/Colab Notebooks/woke-odds/ambiguity_train_1110_formatted.jsonl')

# Fine-grained split by label. Only the answer part of the prompt (the text
# after the last [/INST]) is inspected; str.rpartition returns the whole
# string as its last element when the marker is absent, so a sample without
# the marker is still matched against its full text.
none_samples = []
ao_samples = []
em_unf_samples = []
em_cont_samples = []
la_samples = []
unclassified_samples = []  # matched none of the patterns below

for sample in formatted_train:
    answer_part = sample['text'].rpartition('[/INST]')[2]
    if 'NONE|NONE' in answer_part:
        none_samples.append(sample)
    elif 'AO|' in answer_part:
        ao_samples.append(sample)
    elif 'EM|UNF' in answer_part:
        em_unf_samples.append(sample)
    elif 'EM|CONT' in answer_part:
        em_cont_samples.append(sample)
    elif 'LA|' in answer_part:
        la_samples.append(sample)
    else:
        unclassified_samples.append(sample)

print(f"=== 원본 분포 ===")
print(f"NONE: {len(none_samples)}개")
print(f"AO: {len(ao_samples)}개")
print(f"EM|UNF: {len(em_unf_samples)}개")
print(f"EM|CONT: {len(em_cont_samples)}개")
print(f"LA: {len(la_samples)}개")

# Samples with an unrecognised label are left out of the output (as before),
# but the omission is now reported instead of happening silently.
if unclassified_samples:
    print(f"\n⚠️ 분류되지 않은 샘플 {len(unclassified_samples)}개는 출력에서 제외됨")

# Oversample AO by ~1.5x.
# NOTE: every sample gets an independent 50% chance of being duplicated, so
# the result is 1.5x only on average, not exactly. Kept this way so that the
# generated dataset stays reproducible for a given seed.
ao_15x = ao_samples + [s for s in ao_samples if random.random() < 0.5]

# Oversample only EM|UNF by ~1.5x (same per-sample coin flip as above)
unf_15x = em_unf_samples + [s for s in em_unf_samples if random.random() < 0.5]

# Every other class is kept unchanged
balanced_data = none_samples + ao_15x + unf_15x + em_cont_samples + la_samples
random.shuffle(balanced_data)

print(f"\n=== 조정 후 분포 ===")
print(f"NONE: {len(none_samples)}개 (유지)")
print(f"AO: {len(ao_15x)}개 (×1.5)")
print(f"EM|UNF: {len(unf_15x)}개 (×1.5) ⭐")
print(f"EM|CONT: {len(em_cont_samples)}개 (유지)")
print(f"LA: {len(la_samples)}개 (유지)")
print(f"총: {len(balanced_data)}개")

# Save as JSON Lines (ensure_ascii=False keeps non-ASCII text readable)
output_path = '/content/drive/MyDrive/Colab Notebooks/woke-odds/ambiguity_train_1110_formatted_balanced_ao15x_unf15x.jsonl'

with open(output_path, 'w', encoding='utf-8') as f:
    for item in balanced_data:
        f.write(json.dumps(item, ensure_ascii=False) + '\n')

print(f"\n✓ 저장 완료: ao15x_unf15x")

=== 원본 분포 ===
NONE: 1280개
AO: 641개
EM|UNF: 160개
EM|CONT: 160개
LA: 320개

=== 조정 후 분포 ===
NONE: 1280개 (유지)
AO: 961개 (×1.5)
EM|UNF: 232개 (×1.5) ⭐
EM|CONT: 160개 (유지)
LA: 320개 (유지)
총: 2953개

✓ 저장 완료: ao15x_unf15x


In [None]:
# AO 2배, UNF 1.5배

import json
import random
# Seed the RNG so the random oversampling and shuffling in this cell are reproducible.
random.seed(44)  # new seed

def load_jsonl(filepath):
    """Load a JSON Lines file into a list of parsed records.

    Every non-blank line is parsed with ``json.loads``. Blank or
    whitespace-only lines (for example a trailing empty line at the end of
    the file) are skipped instead of raising ``json.JSONDecodeError``.

    Args:
        filepath: Path of the UTF-8 encoded ``.jsonl`` file to read.

    Returns:
        A list with one parsed JSON value per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    data = []
    with open(filepath, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                # Nothing to parse on an empty line; json.loads('') would raise.
                continue
            data.append(json.loads(line))
    return data

formatted_train = load_jsonl('/content/drive/MyDrive/Colab Notebooks/woke-odds/ambiguity_train_1110_formatted.jsonl')

# Fine-grained split by label. Only the answer part of the prompt (the text
# after the last [/INST]) is inspected; str.rpartition returns the whole
# string as its last element when the marker is absent, so a sample without
# the marker is still matched against its full text.
none_samples = []
ao_samples = []
em_unf_samples = []
em_cont_samples = []
la_samples = []
unclassified_samples = []  # matched none of the patterns below

for sample in formatted_train:
    answer_part = sample['text'].rpartition('[/INST]')[2]
    if 'NONE|NONE' in answer_part:
        none_samples.append(sample)
    elif 'AO|' in answer_part:
        ao_samples.append(sample)
    elif 'EM|UNF' in answer_part:
        em_unf_samples.append(sample)
    elif 'EM|CONT' in answer_part:
        em_cont_samples.append(sample)
    elif 'LA|' in answer_part:
        la_samples.append(sample)
    else:
        unclassified_samples.append(sample)

print(f"=== 원본 분포 ===")
print(f"NONE: {len(none_samples)}개")
print(f"AO: {len(ao_samples)}개")
print(f"EM|UNF: {len(em_unf_samples)}개")
print(f"EM|CONT: {len(em_cont_samples)}개")
print(f"LA: {len(la_samples)}개")

# Samples with an unrecognised label are left out of the output (as before),
# but the omission is now reported instead of happening silently.
if unclassified_samples:
    print(f"\n⚠️ 분류되지 않은 샘플 {len(unclassified_samples)}개는 출력에서 제외됨")

# Oversample AO exactly 2x by including every AO sample twice
ao_2x = ao_samples + ao_samples

# Oversample EM|UNF by ~1.5x.
# NOTE: every sample gets an independent 50% chance of being duplicated, so
# the result is 1.5x only on average, not exactly. Kept this way so that the
# generated dataset stays reproducible for a given seed.
unf_15x = em_unf_samples + [s for s in em_unf_samples if random.random() < 0.5]

# Every other class, including NONE, is kept unchanged
balanced_data = none_samples + ao_2x + unf_15x + em_cont_samples + la_samples
random.shuffle(balanced_data)

print(f"\n=== 조정 후 분포 ===")
print(f"NONE: {len(none_samples)}개 (유지)")
print(f"AO: {len(ao_2x)}개 (×2) ⭐")
print(f"EM|UNF: {len(unf_15x)}개 (×1.5) ⭐")
print(f"EM|CONT: {len(em_cont_samples)}개 (유지)")
print(f"LA: {len(la_samples)}개 (유지)")
print(f"총: {len(balanced_data)}개")

# Print class ratios ("or 1" avoids dividing by zero on empty input)
total = len(balanced_data) or 1
print(f"\n=== 비율 ===")
print(f"NONE: {len(none_samples)/total*100:.1f}%")
print(f"AO: {len(ao_2x)/total*100:.1f}%")
print(f"EM|UNF: {len(unf_15x)/total*100:.1f}%")

# Save as JSON Lines (ensure_ascii=False keeps non-ASCII text readable)
output_path = '/content/drive/MyDrive/Colab Notebooks/woke-odds/ambiguity_train_1110_formatted_balanced_ao2x_unf15x.jsonl'

with open(output_path, 'w', encoding='utf-8') as f:
    for item in balanced_data:
        f.write(json.dumps(item, ensure_ascii=False) + '\n')

print(f"\n✓ 저장 완료: ao2x_unf15x")

=== 원본 분포 ===
NONE: 1280개
AO: 641개
EM|UNF: 160개
EM|CONT: 160개
LA: 320개

=== 조정 후 분포 ===
NONE: 1280개 (유지)
AO: 1282개 (×2) ⭐
EM|UNF: 243개 (×1.5) ⭐
EM|CONT: 160개 (유지)
LA: 320개 (유지)
총: 3285개

=== 비율 ===
NONE: 39.0%
AO: 39.0%
EM|UNF: 7.4%

✓ 저장 완료: ao2x_unf15x


In [None]:
# 7번째 시도: AO 1.5배, UNF 1.5배, NONE 10% 감소

import json
import random
# Seed the RNG so the random sampling and shuffling in this cell are reproducible.
random.seed(45)  # new seed

def load_jsonl(filepath):
    """Load a JSON Lines file into a list of parsed records.

    Every non-blank line is parsed with ``json.loads``. Blank or
    whitespace-only lines (for example a trailing empty line at the end of
    the file) are skipped instead of raising ``json.JSONDecodeError``.

    Args:
        filepath: Path of the UTF-8 encoded ``.jsonl`` file to read.

    Returns:
        A list with one parsed JSON value per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    data = []
    with open(filepath, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                # Nothing to parse on an empty line; json.loads('') would raise.
                continue
            data.append(json.loads(line))
    return data

formatted_train = load_jsonl('/content/drive/MyDrive/Colab Notebooks/woke-odds/ambiguity_train_1110_formatted.jsonl')

# Fine-grained split by label. Only the answer part of the prompt (the text
# after the last [/INST]) is inspected; str.rpartition returns the whole
# string as its last element when the marker is absent, so a sample without
# the marker is still matched against its full text.
none_samples = []
ao_samples = []
em_unf_samples = []
em_cont_samples = []
la_samples = []
unclassified_samples = []  # matched none of the patterns below

for sample in formatted_train:
    answer_part = sample['text'].rpartition('[/INST]')[2]
    if 'NONE|NONE' in answer_part:
        none_samples.append(sample)
    elif 'AO|' in answer_part:
        ao_samples.append(sample)
    elif 'EM|UNF' in answer_part:
        em_unf_samples.append(sample)
    elif 'EM|CONT' in answer_part:
        em_cont_samples.append(sample)
    elif 'LA|' in answer_part:
        la_samples.append(sample)
    else:
        unclassified_samples.append(sample)

print(f"=== 원본 분포 ===")
print(f"NONE: {len(none_samples)}개")
print(f"AO: {len(ao_samples)}개")
print(f"EM|UNF: {len(em_unf_samples)}개")
print(f"EM|CONT: {len(em_cont_samples)}개")
print(f"LA: {len(la_samples)}개")

# Samples with an unrecognised label are left out of the output (as before),
# but the omission is now reported instead of happening silently.
if unclassified_samples:
    print(f"\n⚠️ 분류되지 않은 샘플 {len(unclassified_samples)}개는 출력에서 제외됨")

# Reduce NONE by 10% (keep a random 90%); the target never exceeds the
# population size, so random.sample cannot raise here.
none_target = int(len(none_samples) * 0.9)
sampled_none = random.sample(none_samples, none_target)

# Oversample AO by ~1.5x.
# NOTE: every sample gets an independent 50% chance of being duplicated, so
# the result is 1.5x only on average, not exactly. Kept this way so that the
# generated dataset stays reproducible for a given seed.
ao_15x = ao_samples + [s for s in ao_samples if random.random() < 0.5]

# Oversample EM|UNF by ~1.5x (same per-sample coin flip as above)
unf_15x = em_unf_samples + [s for s in em_unf_samples if random.random() < 0.5]

# Merge and shuffle
balanced_data = sampled_none + ao_15x + unf_15x + em_cont_samples + la_samples
random.shuffle(balanced_data)

print(f"\n=== 조정 후 분포 ===")
print(f"NONE: {len(sampled_none)}개 (×0.9) ⭐")
print(f"AO: {len(ao_15x)}개 (×1.5)")
print(f"EM|UNF: {len(unf_15x)}개 (×1.5)")
print(f"EM|CONT: {len(em_cont_samples)}개 (유지)")
print(f"LA: {len(la_samples)}개 (유지)")
print(f"총: {len(balanced_data)}개")

# Print class ratios ("or 1" avoids dividing by zero on empty input)
total = len(balanced_data) or 1
print(f"\n=== 비율 ===")
print(f"NONE: {len(sampled_none)/total*100:.1f}%")
print(f"AO: {len(ao_15x)/total*100:.1f}%")
print(f"EM|UNF: {len(unf_15x)/total*100:.1f}%")
print(f"EM|CONT: {len(em_cont_samples)/total*100:.1f}%")
print(f"LA: {len(la_samples)/total*100:.1f}%")

# Save as JSON Lines (ensure_ascii=False keeps non-ASCII text readable)
output_path = '/content/drive/MyDrive/Colab Notebooks/woke-odds/ambiguity_train_1110_formatted_balanced_none90_ao15x_unf15x.jsonl'

with open(output_path, 'w', encoding='utf-8') as f:
    for item in balanced_data:
        f.write(json.dumps(item, ensure_ascii=False) + '\n')

print(f"\n✓ 저장 완료: none90_ao15x_unf15x")

=== 원본 분포 ===
NONE: 1280개
AO: 641개
EM|UNF: 160개
EM|CONT: 160개
LA: 320개

=== 조정 후 분포 ===
NONE: 1152개 (×0.9) ⭐
AO: 967개 (×1.5)
EM|UNF: 245개 (×1.5)
EM|CONT: 160개 (유지)
LA: 320개 (유지)
총: 2844개

=== 비율 ===
NONE: 40.5%
AO: 34.0%
EM|UNF: 8.6%
EM|CONT: 5.6%
LA: 11.3%

✓ 저장 완료: none90_ao15x_unf15x


샘플 테스트

In [None]:
# 균형 조정된 데이터 샘플 확인

import json
import random

def load_jsonl(filepath):
    """Load a JSON Lines file into a list of parsed records.

    Every non-blank line is parsed with ``json.loads``. Blank or
    whitespace-only lines (for example a trailing empty line at the end of
    the file) are skipped instead of raising ``json.JSONDecodeError``.

    Args:
        filepath: Path of the UTF-8 encoded ``.jsonl`` file to read.

    Returns:
        A list with one parsed JSON value per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    data = []
    with open(filepath, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                # Nothing to parse on an empty line; json.loads('') would raise.
                continue
            data.append(json.loads(line))
    return data

# Load the balanced dataset produced by the previous cell
balanced_data = load_jsonl('/content/drive/MyDrive/Colab Notebooks/woke-odds/ambiguity_train_1110_formatted_balanced_none90_ao15x_unf15x.jsonl')

print(f"=== 데이터 로드 완료 ===")
print(f"총 샘플: {len(balanced_data)}개\n")

# Prompt markers used to cut the question and the answer out of the text.
# Offsets are derived with len() rather than hand-counted constants.
SYS_END = '<</SYS>>\n\n'
INST_END = '[/INST]'
EOS = '</s>'

# Inspect up to 3 random samples; random.sample raises ValueError when asked
# for more items than are available, so cap at the dataset size.
sample_indices = random.sample(range(len(balanced_data)), min(3, len(balanced_data)))

for i, idx in enumerate(sample_indices):
    sample = balanced_data[idx]
    text = sample['text']

    print(f"=== 샘플 {i+1} (인덱스: {idx}) ===")

    sys_end_pos = text.find(SYS_END)
    inst_end_pos = text.find(INST_END)

    # Question: the text between the end of the system prompt and [/INST].
    # str.find returns -1 for a missing marker, which would silently produce
    # a wrong slice, so every marker is checked explicitly.
    if '[INST]' in text and sys_end_pos != -1 and inst_end_pos != -1:
        question = text[sys_end_pos + len(SYS_END):inst_end_pos].strip()
    else:
        question = "추출 실패"

    # Answer: the text between [/INST] and the first </s> that follows it.
    answer = "추출 실패"
    if inst_end_pos != -1:
        answer_start = inst_end_pos + len(INST_END)
        eos_pos = text.find(EOS, answer_start)
        if eos_pos != -1:
            answer = text[answer_start:eos_pos].strip()

    print(f"질문: {question[:100]}...")
    print(f"답변: {answer}")
    print(f"\n전체 텍스트 (처음 200자):")
    print(text[:200] + "...\n")
    print("-" * 70 + "\n")

# Check the final NONE share, matching the label in the answer part only
# (rpartition falls back to the whole text when [/INST] is absent).
none_count = sum(1 for s in balanced_data if 'NONE|NONE' in s['text'].rpartition(INST_END)[2])
total_count = len(balanced_data) or 1  # avoid dividing by zero on an empty dataset
print(f"=== 최종 확인 ===")
print(f"NONE|NONE: {none_count}개 ({none_count/total_count*100:.1f}%)")
print(f"기타: {len(balanced_data) - none_count}개 ({(len(balanced_data)-none_count)/total_count*100:.1f}%)")

=== 데이터 로드 완료 ===
총 샘플: 2844개

=== 샘플 1 (인덱스: 731) ===
질문: Who did stephen curry play for in college?...
답변: NONE|NONE

전체 텍스트 (처음 200자):
<s>[INST] <<SYS>>
You are an AI system that determines if the question requires clarification and classifies the ambiguity.

Task:
1. Determine if the question requires clarification: clear(no clarifi...

----------------------------------------------------------------------

=== 샘플 2 (인덱스: 3) ===
질문: Give me a list of the greatest basketball players of all time....
답변: AO|WHOM

전체 텍스트 (처음 200자):
<s>[INST] <<SYS>>
You are an AI system that determines if the question requires clarification and classifies the ambiguity.

Task:
1. Determine if the question requires clarification: clear(no clarifi...

----------------------------------------------------------------------

=== 샘플 3 (인덱스: 2759) ===
질문: The sister-in-law built Amanda a garden after she earned a billion dollars.
Who earned a billion dol...
답변: LA|SEM

전체 텍스트 (처음 200자):
<s>[INST] <<SYS>>
You 