In [1]:
# from utils.data_build import build_kru_dataframe_and_save

# df = build_kru_dataframe_and_save("/workspace/kru_data", "/workspace/kru_data/data_tag.json", "data_source.csv")

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from collections import Counter
import simdjson as json

# Load data_tag.json (read the raw text first, then parse it).
with open('/workspace/kru_data/data_tag.json', 'r', encoding='utf-8') as f:
    raw_tag_text = f.read()
tag_data = json.loads(raw_tag_text)

# Build the whitelist per category: for every entry that carries a 'values'
# mapping, the *values* of that mapping (not its keys) are the allowed set.
allowed_values = {}
for key, value in tag_data.items():
    col_name = key.strip()
    if 'values' not in value:
        # Entries without a 'values' mapping impose no restriction.
        continue
    allowed_values[col_name] = set(value['values'].values())
    print(f"{col_name} 허용된 값: {allowed_values[col_name]}")

# Load the source table.
all_df = pd.read_csv('/workspace/kru_data/data_source.csv')

# Attach transcriptions: each wav path is mapped to its label file by swapping
# the "원천데이터" path segment for "라벨링데이터" and ".wav" for ".txt".
# Only the 'abs_path' column is needed, so iterate over it directly instead of
# iterrows() (which builds a full Series per row and yields an unused index).
transcriptions = []

for wav_abs_path in all_df['abs_path']:
    label_abs_path = wav_abs_path.replace("원천데이터", "라벨링데이터").replace(".wav", ".txt")

    with open(label_abs_path, 'r', encoding='utf-8') as f:
        # The file content is stored verbatim (no stripping).
        transcriptions.append(f.read())

all_df['transcription'] = transcriptions

print(f"\n{'='*80}")
print(f"원본 데이터 수: {len(all_df):,}개")
print(f"{'='*80}")

# Keep only rows whose values are declared under "values" in data_tag.json.
print(f"\n데이터 필터링 중...")
print(f"{'='*80}")

# Apply the whitelist one category column at a time; each step narrows the
# frame produced by the previous step.
filtered_df = all_df.copy()

for col_name, allowed_set in allowed_values.items():
    if col_name not in filtered_df.columns:
        # A tag category with no matching column cannot be filtered on.
        continue

    # Keep a handle on the frame the mask is computed from: the mask's index
    # matches this frame, not the original all_df.
    before_df = filtered_df
    before_count = len(before_df)
    mask = before_df[col_name].isin(allowed_set)
    filtered_df = before_df[mask].copy()
    after_count = len(filtered_df)
    removed_count = before_count - after_count

    print(f"\n{col_name}:")
    print(f"  필터링 전: {before_count:,}개")
    print(f"  필터링 후: {after_count:,}개")
    print(f"  제거된 데이터: {removed_count:,}개")

    # Report which values were dropped in this step.
    if removed_count > 0:
        # Index before_df (not all_df) with ~mask: once an earlier column has
        # removed rows, the mask no longer aligns with all_df and pandas
        # raises "Unalignable boolean Series provided as indexer".
        removed_values = before_df.loc[~mask, col_name].value_counts()
        print(f"  제거된 값들:")
        for val, count in removed_values.items():
            print(f"    {val}: {count}개")

# From here on all_df refers to the filtered data with a fresh 0..n-1 index.
all_df = filtered_df.reset_index(drop=True)

print(f"\n{'='*80}")
print(f"최종 데이터 수: {len(all_df):,}개")
print(f"{'='*80}")
print(f"\n컬럼: {list(all_df.columns)}")
print(f"\n데이터 샘플:")
print(all_df.head())
print(f"\n각 카테고리별 고유값 수:")
# One summary line per stratification column: unique count and sorted values.
for summary_col in ['데이터_카테고리', '성별', '연령', '지역']:
    print(f"  {summary_col}: {all_df[summary_col].nunique()}개 - {sorted(all_df[summary_col].unique())}")

데이터_항목 허용된 값: {'노년', '일반', '소아'}
데이터_카테고리 허용된 값: {'간호사', '가정방문자', '의사'}
성별 허용된 값: {'여성', '남성', '비공개/혼합'}
연령 허용된 값: {'40–49세', '60–69세', '30–39세', '3–6세', '50–59세', '70세 이상', '11–19세', '20–29세', '비공개/혼합', '7–10세'}
지역 허용된 값: {'기타/비공개', '서울/경기/인천', '영남', '강원/충청', '호남/제주'}


KeyboardInterrupt: 

In [None]:
# ============================================================================
# 복합 라벨 생성 및 분석
# ============================================================================

# Build the composite label: 데이터_카테고리 + 성별 + 연령 + 지역, joined by '_'.
# The parenthesised expression must evaluate to a single Series. A trailing
# comma after the last operand would turn it into a 1-tuple and make the
# column assignment fail with a length mismatch.
all_df['composite_label'] = (
    all_df['데이터_카테고리'].astype(str) + '_' +
    all_df['성별'].astype(str) + '_' +
    all_df['연령'].astype(str) + '_' +
    all_df['지역'].astype(str)
)

label_banner = "="*80
print(label_banner)
print("복합 라벨 생성 완료")
print(label_banner)
print(f"총 고유한 복합 라벨 수: {all_df['composite_label'].nunique():,}개")

# Rows per composite label, most frequent first (value_counts order).
print(f"\n복합 라벨별 데이터 수 (상위 10개):")
label_counts = all_df['composite_label'].value_counts()
print(label_counts.head(10))
print(f"\n복합 라벨별 데이터 수 (하위 10개):")
print(label_counts.tail(10))

# Summary statistics of the per-label sample counts.
smallest_group = label_counts.min()
largest_group = label_counts.max()
mean_group = label_counts.mean()
median_group = label_counts.median()
print(f"\n복합 라벨 샘플 수 통계:")
print(f"  최소: {smallest_group}개")
print(f"  최대: {largest_group}개")
print(f"  평균: {mean_group:.2f}개")
print(f"  중앙값: {median_group:.0f}개")

# Labels with a single sample versus labels with at least two samples.
singleton_labels = (label_counts == 1).sum()
multi_sample_labels = (label_counts >= 2).sum()
print(f"\n샘플 수가 1개인 라벨: {singleton_labels}개")
print(f"샘플 수가 2개 이상인 라벨: {multi_sample_labels}개")


In [None]:
# ============================================================================
# Stratified Split (8:2) - 복합 라벨 기반
# ============================================================================

def stratified_split_by_composite_label(df, test_size=0.2, random_state=42):
    """
    Split ``df`` into train/test frames, stratified on its 'composite_label' column.

    Each label group is handled independently:
      * 1 sample   -> the sample goes to train.
      * 2 samples  -> the group is shuffled; one sample goes to train, one to test.
      * 3+ samples -> the group is shuffled; ``max(1, int(n * test_size))``
                      samples go to test and the rest to train.

    Groups are visited in order of first appearance of their label, and every
    group is shuffled with the same ``random_state`` seed, so the result is
    deterministic for a given input.

    Rows are selected by position rather than by index label, so a DataFrame
    with a non-unique index is split without duplicating rows. Rows whose label
    is missing (NaN/None) end up in neither output, because groupby drops
    missing keys by default.

    Args:
        df: DataFrame with a 'composite_label' column.
        test_size: Fraction of each label group assigned to test (default 0.2 = 20%).
        random_state: Seed used when shuffling each label group.

    Returns:
        (train_df, test_df): copies of the selected rows, keeping their
        original index labels.
    """
    train_positions = []
    test_positions = []

    # Group the row *positions* by label in a single pass; sort=False keeps the
    # groups in order of first appearance.
    labels = df['composite_label'].to_numpy()
    row_positions = pd.Series(np.arange(len(df)))

    for _, group_positions in row_positions.groupby(labels, sort=False):
        n_samples = len(group_positions)

        if n_samples == 1:
            # A lone sample cannot be shared; keep it in train.
            train_positions.extend(group_positions.tolist())
            continue

        shuffled = group_positions.sample(frac=1, random_state=random_state).tolist()

        if n_samples == 2:
            # Exactly one sample on each side.
            n_test = 1
        else:
            # At least one test sample per label, otherwise the requested share.
            n_test = max(1, int(n_samples * test_size))
        n_train = n_samples - n_test

        train_positions.extend(shuffled[:n_train])
        test_positions.extend(shuffled[n_train:])

    train_df = df.iloc[train_positions].copy()
    test_df = df.iloc[test_positions].copy()

    return train_df, test_df

# Run the split.
split_banner = "="*80
print(split_banner)
print("Stratified Split 수행 중...")
print(split_banner)

train_df, test_df = stratified_split_by_composite_label(all_df, test_size=0.2, random_state=42)

# Report absolute sizes and each split's share of the full dataset.
n_train_rows = len(train_df)
n_test_rows = len(test_df)
n_total_rows = len(all_df)
print(f"\n✓ Split 완료!")
print(f"  Train: {n_train_rows:,}개 ({n_train_rows/n_total_rows*100:.2f}%)")
print(f"  Test:  {n_test_rows:,}개 ({n_test_rows/n_total_rows*100:.2f}%)")
print(f"  Total: {n_total_rows:,}개")


# Persist both splits as CSV, without the index column.
for split_df, csv_path in (
    (train_df, '/workspace/kru_data/train.csv'),
    (test_df, '/workspace/kru_data/test.csv'),
):
    split_df.to_csv(csv_path, index=False)


In [None]:
# ============================================================================
# 데이터 분할 결과 시각화
# ============================================================================

import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib import rcParams
import os 
# Nature-like look: whitegrid base style with a colorblind-safe palette.
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette("colorblind")

# High-resolution output, compact font sizes, and a sans-serif font stack that
# prefers the NanumSquare family before NanumGothic / DejaVu Sans.
for rc_key, rc_value in {
    'figure.dpi': 300,
    'savefig.dpi': 300,
    'savefig.bbox': 'tight',
    'font.size': 9,
    'axes.labelsize': 10,
    'axes.titlesize': 11,
    'xtick.labelsize': 8,
    'ytick.labelsize': 8,
    'legend.fontsize': 8,
    'font.family': 'sans-serif',
    'font.sans-serif': ['NanumSquare', 'NanumSquareOTF', 'NanumSquareNeo', 'NanumGothic', 'DejaVu Sans'],
    'axes.unicode_minus': False,
}.items():
    rcParams[rc_key] = rc_value

# Probe the fonts matplotlib knows about; if a NanumSquare variant is
# registered, move it to the front of the sans-serif stack.
try:
    from matplotlib import font_manager
    available_fonts = [font_entry.name for font_entry in font_manager.fontManager.ttflist]
    nanum_fonts = [name for name in available_fonts if 'Nanum' in name and 'Square' in name]
    if not nanum_fonts:
        print("⚠ 나눔 스퀘어 폰트를 찾을 수 없습니다. 기본 폰트를 사용합니다.")
        print(f"사용 가능한 나눔 폰트: {[name for name in available_fonts if 'Nanum' in name]}")
    else:
        print(f"✓ 나눔 스퀘어 폰트 발견: {nanum_fonts[0]}")
        rcParams['font.sans-serif'] = [nanum_fonts[0]] + rcParams['font.sans-serif']
except Exception as e:
    # Font probing is best-effort; keep the configuration set above.
    print(f"폰트 확인 중 오류: {e}")
    print("기본 폰트 설정을 사용합니다.")

# Colour palette shared by the figures ('train'/'test' reuse blue/orange).
colors = dict(
    train='#2E86AB',
    test='#F24236',
    blue='#2E86AB',
    orange='#F24236',
    green='#6A994E',
    red='#BC4749',
    purple='#7209B7',
    yellow='#F77F00',
    teal='#06A77D',
)

# Figure 1: train/test split ratio and per-category distribution comparison.
# Panels B-F are all "train bar next to test bar" charts, so they share one
# drawing helper instead of five copies of the same code.

def _plot_train_test_bars(ax, tick_labels, train_values, test_values, *, title, ylabel,
                          tick_kwargs=None, value_format='{:,}', value_fontsize=7,
                          annotate_zero=False, bar_width=0.35):
    """Draw side-by-side Train/Test bars on ``ax`` and annotate each bar.

    Args:
        ax: Matplotlib axes to draw on.
        tick_labels: One x tick label per bar pair.
        train_values, test_values: Bar heights, aligned with ``tick_labels``.
        title, ylabel: Axes title and y-axis label.
        tick_kwargs: Extra keyword arguments for ``ax.set_xticklabels``
            (e.g. rotation / ha); none when omitted.
        value_format: ``str.format`` template applied to ``int(height)``.
        value_fontsize: Font size of the value annotations.
        annotate_zero: When False, only bars with height > 0 are annotated.
        bar_width: Width of a single bar.

    Returns:
        The axes that was drawn on.
    """
    positions = np.arange(len(tick_labels))
    bar_style = dict(alpha=0.8, edgecolor='black', linewidth=0.5)
    train_bars = ax.bar(positions - bar_width/2, train_values, bar_width,
                        label='Train', color=colors['train'], **bar_style)
    test_bars = ax.bar(positions + bar_width/2, test_values, bar_width,
                       label='Test', color=colors['test'], **bar_style)

    ax.set_ylabel(ylabel, fontweight='bold')
    ax.set_title(title, fontweight='bold', pad=15, fontsize=11)
    ax.set_xticks(positions)
    ax.set_xticklabels(tick_labels, **(tick_kwargs or {}))
    ax.legend(frameon=True, fancybox=False, edgecolor='black')
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    ax.grid(axis='y', alpha=0.3, linestyle='--', linewidth=0.5)

    # Write each bar's value just above it.
    for bar in [*train_bars, *test_bars]:
        height = bar.get_height()
        if not annotate_zero and not height > 0:
            continue
        ax.text(bar.get_x() + bar.get_width()/2., height,
                value_format.format(int(height)),
                ha='center', va='bottom', fontsize=value_fontsize)
    return ax


def _split_value_counts(column):
    """Return (sorted unique values in all_df, train counts, test counts) for ``column``.

    Both count Series are reindexed to the sorted value order, with 0 for
    values missing from a split.
    """
    order = sorted(all_df[column].unique())
    train_counts_by_value = train_df[column].value_counts().reindex(order, fill_value=0)
    test_counts_by_value = test_df[column].value_counts().reindex(order, fill_value=0)
    return order, train_counts_by_value, test_counts_by_value


fig = plt.figure(figsize=(14, 10))
gs = fig.add_gridspec(3, 2, hspace=0.4, wspace=0.3, left=0.1, right=0.95, top=0.95, bottom=0.08)

# A: train/test split ratio
ax1 = fig.add_subplot(gs[0, 0])
sizes = [len(train_df), len(test_df)]
labels = ['Train', 'Test']
colors_pie = [colors['train'], colors['test']]
wedges, texts, autotexts = ax1.pie(sizes, labels=labels, autopct='%1.1f%%',
                                     colors=colors_pie, startangle=90,
                                     textprops={'fontsize': 10, 'fontweight': 'bold'})
ax1.set_title('A. Train/Test Split Ratio', fontweight='bold', pad=15, fontsize=11)
for autotext in autotexts:
    autotext.set_color('white')
    autotext.set_fontweight('bold')

# B: number of unique values per category in train vs. test
ax2 = fig.add_subplot(gs[0, 1])
category_names = ['Category', 'Gender', 'Age', 'Region']
category_columns = ['데이터_카테고리', '성별', '연령', '지역']
train_counts = [train_df[column].nunique() for column in category_columns]
test_counts = [test_df[column].nunique() for column in category_columns]
width = 0.35
# Panel B annotates every bar (including zero) with a plain integer at size 8.
_plot_train_test_bars(ax2, category_names, train_counts, test_counts,
                      title='B. Unique Values per Category',
                      ylabel='Number of Unique Values',
                      value_format='{}', value_fontsize=8,
                      annotate_zero=True, bar_width=width)

# C: 데이터_카테고리 distribution
ax3 = fig.add_subplot(gs[1, 0])
cat_order, train_cat, test_cat = _split_value_counts('데이터_카테고리')
_plot_train_test_bars(ax3, cat_order, train_cat.values, test_cat.values,
                      title='C. Category Distribution',
                      ylabel='Number of Samples',
                      tick_kwargs={'rotation': 0}, bar_width=width)

# D: 성별 distribution
ax4 = fig.add_subplot(gs[1, 1])
gender_order, train_gender, test_gender = _split_value_counts('성별')
_plot_train_test_bars(ax4, gender_order, train_gender.values, test_gender.values,
                      title='D. Gender Distribution',
                      ylabel='Number of Samples',
                      tick_kwargs={'rotation': 0}, bar_width=width)

# E: 연령 distribution, restricted to the 5 groups with the most train samples
ax5 = fig.add_subplot(gs[2, 0])
age_order, train_age, test_age = _split_value_counts('연령')
top_5_indices = train_age.nlargest(5).index
train_age_top = train_age.reindex(top_5_indices)
test_age_top = test_age.reindex(top_5_indices)
_plot_train_test_bars(ax5, top_5_indices, train_age_top.values, test_age_top.values,
                      title='E. Age Group Distribution (Top 5)',
                      ylabel='Number of Samples',
                      tick_kwargs={'rotation': 45, 'ha': 'right'}, bar_width=width)

# F: 지역 distribution
ax6 = fig.add_subplot(gs[2, 1])
region_order, train_region, test_region = _split_value_counts('지역')
_plot_train_test_bars(ax6, region_order, train_region.values, test_region.values,
                      title='F. Geographic Region Distribution',
                      ylabel='Number of Samples',
                      tick_kwargs={'rotation': 45, 'ha': 'right'}, bar_width=width)

fig.suptitle(f'Data Split Visualization (Total N = {len(all_df):,})',
             fontsize=13, fontweight='bold', y=0.98)

# Save as PNG and PDF (the output directory is created if missing).
figures_dir = '/workspace/notebook/figures'
os.makedirs(figures_dir, exist_ok=True)
plt.savefig(f'{figures_dir}/Data_Split_Visualization.png',
            dpi=300, bbox_inches='tight', facecolor='white', edgecolor='none')
plt.savefig(f'{figures_dir}/Data_Split_Visualization.pdf',
            dpi=300, bbox_inches='tight', facecolor='white', edgecolor='none')
plt.show()

print("✓ Figure 1 saved: Data_Split_Visualization.png & .pdf")
