In [1]:
import warnings

import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

def split_data_and_save(csv, output_columns, save_path, flip_grade='등심3', n_splits=5,
                        train_filename="1211_train_4.csv", val_filename="1211_val_1.csv"):
    """Split ``csv`` into group-aware folds and write train/validation CSVs.

    Rows are processed one ``grade`` at a time. Within a grade the
    ``output_columns`` are standardised and averaged into a per-row score;
    rows sharing the same ``No`` form a group, groups are ordered by their
    mean score, and each group is handed (whole) to whichever fold currently
    holds the fewest rows. Fold sizes are tracked across all grades, so the
    balancing is on total fold size. The first ``n_splits - 1`` folds are
    concatenated into the training set and the last fold is the validation
    set. Flip augmentation is applied to the training set only.

    Args:
        csv: DataFrame containing at least the columns ``grade``, ``No`` and
            every name in ``output_columns``. It is not modified.
        output_columns: Numeric label columns used to build the ranking score.
        save_path: Directory the two CSV files are written into.
        flip_grade: Grade forwarded to ``add_flipped_images_to_dataset``.
        n_splits: Number of folds; must be at least 2.
        train_filename: File name of the training CSV inside ``save_path``.
        val_filename: File name of the validation CSV inside ``save_path``.

    Raises:
        ValueError: If ``n_splits`` is smaller than 2.

    Note:
        Rows whose ``No`` is NaN are not placed in any fold because pandas
        ``groupby`` drops NaN keys by default.
    """
    if n_splits < 2:
        raise ValueError(
            f"n_splits must be at least 2 to get a train and a validation fold, got {n_splits}"
        )

    # Each fold collects whole group frames; they are concatenated once at the
    # end so column dtypes survive (no dict round-trip per row).
    fold_parts = [[] for _ in range(n_splits)]
    fold_sizes = [0] * n_splits

    # Distribute the data fold by fold, one grade at a time.
    for grade in csv['grade'].unique():
        grade_data = csv[csv['grade'] == grade].copy()

        # Standardise the label columns within this grade and average them
        # into a single combined score per row.
        scaler = StandardScaler()
        normalized_labels = scaler.fit_transform(grade_data[output_columns])
        grade_data['combined_score'] = np.mean(normalized_labels, axis=1)

        # Group by 'No' and order the groups by their mean combined score.
        grouped = grade_data.groupby('No')
        group_scores = grouped['combined_score'].mean().sort_values()

        # Give each group to the currently smallest fold (ties -> lowest index).
        for no in group_scores.index:
            group_data = grouped.get_group(no)
            target_fold = min(range(n_splits), key=fold_sizes.__getitem__)
            fold_parts[target_fold].append(group_data)
            fold_sizes[target_fold] += len(group_data)

    # A fold that received nothing still needs the input's columns so that the
    # saved CSV has a header and downstream column lookups keep working.
    empty_fold = csv.iloc[0:0]
    folds = []
    for parts in fold_parts:
        fold = pd.concat(parts, ignore_index=True) if parts else empty_fold.copy()
        folds.append(fold.drop(columns=['combined_score'], errors='ignore'))

    # Folds 0 .. n_splits-2 form the training set, the last fold is validation.
    train_parts = [fold for fold in folds[:n_splits - 1] if not fold.empty]
    if train_parts:
        train_folds = pd.concat(train_parts, ignore_index=True)
    else:
        train_folds = empty_fold.copy()
    val_fold = folds[n_splits - 1]

    if val_fold.empty:
        warnings.warn(
            "Validation fold is empty: there are fewer 'No' groups than n_splits.",
            stacklevel=2,
        )

    # Apply flip augmentation to the training set only.
    train_folds = add_flipped_images_to_dataset(train_folds, grade=flip_grade)

    # Save the training and validation sets to separate files.
    train_folds.to_csv(f"{save_path}/{train_filename}", index=False)
    val_fold.to_csv(f"{save_path}/{val_filename}", index=False)

    print("Train and Validation CSV files saved.")

def add_flipped_images_to_dataset(df, grade='등심3'):
    """Duplicate every row of one grade and mark the duplicates as flipped.

    The returned frame contains all rows of ``df`` followed by one extra copy
    of each row whose ``grade`` equals ``grade``. A boolean ``is_flipped``
    column distinguishes them: ``True`` on the added copies and ``False`` on
    rows that carry no flag yet (instead of leaving them NaN, which is truthy
    in Python and cannot be used as a boolean mask). Flags already present in
    ``df`` are kept.

    Args:
        df: DataFrame with a ``grade`` column. It is not modified.
        grade: Grade value whose rows are duplicated.

    Returns:
        A new DataFrame with a fresh 0..n-1 index. When no row matches
        ``grade`` it has the same rows as ``df`` plus the ``is_flipped`` column.
    """
    grade_mask = (df['grade'] == grade).to_numpy()
    original_grade_count = int(grade_mask.sum())

    # Work on a copy so the caller's frame is never mutated.
    base = df.copy()
    if 'is_flipped' in base.columns:
        # Keep flags from an earlier pass; a missing flag means "not flipped".
        existing = base['is_flipped']
        base['is_flipped'] = existing.where(existing.notna(), False).astype(bool)
    else:
        base['is_flipped'] = False

    # Vectorised copy of the matching rows (keeps column dtypes, unlike iterrows).
    df_flipped = base[grade_mask].copy()
    df_flipped['is_flipped'] = True

    if df_flipped.empty:
        # Nothing to add; avoid concatenating with an empty frame.
        df = base.reset_index(drop=True)
    else:
        df = pd.concat([base, df_flipped], ignore_index=True)

    new_grade_count = int((df['grade'] == grade).sum())

    print(f"Added flipped images for {grade}. Original count: {original_grade_count}, New total for {grade}: {new_grade_count}")

    return df


In [2]:
# Directory that receives the split CSVs, and the annotation file to split.
save_path = '../dataset'
csv_path = '../dataset/only_new_1211.csv'

# Label columns that are standardised and averaged to rank the groups.
output_columns = ["Marbling", "Color", "Texture", "Surface_Moisture", "Total"]

csv = pd.read_csv(csv_path)

# flip_grade keeps its default; with 5 folds the last one becomes validation.
split_data_and_save(
    csv=csv,
    output_columns=output_columns,
    save_path=save_path,
    n_splits=5,
)

Added flipped images for 등심3. Original count: 29, New total for 등심3: 58
Train and Validation CSV files saved.
