<a href="https://colab.research.google.com/github/dansojo/Medical_CV/blob/main/data_%EC%A0%84%EC%B2%98%EB%A6%AC_%EC%A6%9D%EA%B0%95_%EA%B2%B0%ED%95%A9%EC%9D%B4%EB%AF%B8%EC%A7%80(Part2).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install -U albumentations

Collecting albumentations
  Downloading albumentations-1.4.23-py3-none-any.whl.metadata (36 kB)
Collecting albucore==0.0.21 (from albumentations)
  Downloading albucore-0.0.21-py3-none-any.whl.metadata (5.3 kB)
Collecting simsimd>=5.9.2 (from albucore==0.0.21->albumentations)
  Downloading simsimd-6.2.1-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (66 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m66.0/66.0 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
Downloading albumentations-1.4.23-py3-none-any.whl (269 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m269.9/269.9 kB[0m [31m21.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading albucore-0.0.21-py3-none-any.whl (12 kB)
Downloading simsimd-6.2.1-cp310-cp310-manylinux_2_28_x86_64.whl (632 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m632.7/632.7 kB[0m [31m35.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: simsimd, albucore, albumentations
  Attempting un

In [None]:
import os
import torch
import pandas as pd
import numpy as np
from PIL import Image
from torch.utils.data import Dataset
from sklearn.model_selection import train_test_split
from albumentations import Compose, Resize, HorizontalFlip, VerticalFlip, Rotate, ColorJitter, ElasticTransform, Normalize
from albumentations.pytorch import ToTensorV2

In [None]:
# Configuration values
class Config:
    """Notebook-wide settings: the seed for the splits and the input/output locations."""

    # Seed passed as random_state when the metadata is split.
    RANDOM_SEED = 42

    # Where the .pt dataset files are written.
    SAVE_DIR = "/content/drive/MyDrive/Medical_CV/피부암 분류 및 Segmentation/part2_datasets"
    # Path given to pd.read_csv as the metadata file (despite the *_DIR name).
    METADATA_DIR = "/content/drive/MyDrive/Medical_CV/피부암 분류 및 Segmentation/HAM10000_metadata"
    # Directory that is listed for, and read as, the image files.
    DATA_DIR = "/content/drive/MyDrive/Medical_CV/피부암 분류 및 Segmentation/part2 + mask이미지"

In [None]:
# 2. SkinCancerDataset class definition
class SkinCancerDataset(Dataset):
    """Dataset that yields ``(image, label)`` pairs, one per metadata row.

    Args:
        image_dir: Directory that contains the ``<image_id>.jpg`` files.
        metadata: DataFrame, already filtered and cleaned by the caller, with
            at least an ``image_id`` and a ``label`` column.
        transform: Optional callable invoked as ``transform(image=<ndarray>)``
            whose result is indexed with ``['image']``. When falsy, the RGB
            PIL image is returned unchanged.
    """

    def __init__(self, image_dir, metadata, transform=None):
        self.image_dir = image_dir
        self.metadata = metadata
        self.transform = transform

    def __len__(self):
        """Return the number of metadata rows."""
        return len(self.metadata)

    def __getitem__(self, idx):
        """Load the sample at positional index ``idx``.

        Returns:
            tuple: ``(image, label)`` where ``image`` is the transform output,
            or the RGB PIL image when no transform is set, and ``label`` is
            the row's ``label`` value.

        Raises:
            IndexError: If ``idx`` is outside the metadata (raised by ``iloc``).
        """
        # Fetch the row once; both the file name and the label come from it.
        row = self.metadata.iloc[idx]
        img_path = os.path.join(self.image_dir, f"{row['image_id']}.jpg")

        # convert() returns a new, fully loaded image, so the file handle can
        # be released as soon as the conversion is done.
        with Image.open(img_path) as img_file:
            image = img_file.convert('RGB')

        label = row['label']

        if self.transform:
            image = self.transform(image=np.array(image))['image']

        return image, label

In [None]:
# 3. Augmentation pipeline definitions
def get_data_transforms():
    """Build the training and the validation/test transform pipelines.

    Both pipelines resize to 224x224, normalize with the same per-channel
    mean/std and finish with ``ToTensorV2``. Only the training pipeline adds
    the random flip, rotate, colour-jitter and elastic steps in between.

    Returns:
        tuple: ``(train_transform, val_test_transform)``.
    """
    side = 224
    # Per-channel statistics shared by both pipelines (these are the values
    # commonly used for ImageNet-pretrained models).
    channel_mean = (0.485, 0.456, 0.406)
    channel_std = (0.229, 0.224, 0.225)

    train_steps = [Resize(side, side)]
    train_steps += [
        HorizontalFlip(p=0.5),
        VerticalFlip(p=0.2),
        Rotate(limit=20, p=0.5),
        ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, p=0.5),
        ElasticTransform(p=0.3),
    ]
    train_steps += [Normalize(mean=channel_mean, std=channel_std), ToTensorV2()]

    eval_steps = [
        Resize(side, side),
        Normalize(mean=channel_mean, std=channel_std),
        ToTensorV2(),
    ]

    return Compose(train_steps), Compose(eval_steps)


In [None]:
# 4. Metadata split function
def split_data(metadata_path, image_dir):
    """Load the metadata, keep rows that have an image file, and split 6:2:2.

    Args:
        metadata_path: Path of the CSV file; it must provide an ``image_id``
            and a ``dx`` column.
        image_dir: Directory whose file names (without extension) are matched
            against ``image_id``.

    Returns:
        tuple: ``(train_data, val_data, test_data)`` DataFrames, each carrying
        an integer ``label`` column derived from ``dx``.

    Raises:
        ValueError: If no metadata row matches a file in ``image_dir``, or if
            ``dx`` contains a value that has no class id.
    """
    metadata = pd.read_csv(metadata_path)

    # Strip only the final extension so that stems containing dots still match.
    image_stems = {os.path.splitext(file_name)[0] for file_name in os.listdir(image_dir)}

    # Copy the filtered rows: the label column is added below, and writing into
    # a slice of the original frame triggers SettingWithCopyWarning.
    metadata = metadata[metadata['image_id'].isin(image_stems)].copy()
    if metadata.empty:
        raise ValueError(
            f"No rows of {metadata_path!r} match an image file in {image_dir!r}"
        )

    # Diagnosis code -> class id
    class_map = {
        "bkl": 0,
        "nv": 1,
        "df": 2,
        "mel": 3,
        "vasc": 4,
        "bcc": 5,
        "akiec": 6
    }
    labels = metadata['dx'].map(class_map)

    # An unmapped diagnosis becomes NaN; fail here with the offending values
    # instead of letting NaN labels reach the stratified split.
    unmapped = labels.isna()
    if unmapped.any():
        unknown_codes = sorted(metadata.loc[unmapped, 'dx'].astype(str).unique())
        raise ValueError(f"Unknown 'dx' values without a class id: {unknown_codes}")
    metadata['label'] = labels.astype('int64')

    # Stratified 60/40 split, then the 40% is halved into validation and test.
    train_data, temp_data = train_test_split(metadata, test_size=0.4,
                                             random_state=Config.RANDOM_SEED, stratify=metadata['label'])
    val_data, test_data = train_test_split(temp_data, test_size=0.5,
                                           random_state=Config.RANDOM_SEED, stratify=temp_data['label'])

    return train_data, val_data, test_data

In [None]:
# 5. Dataset transform-and-save function
def process_and_save_datasets(train_data, val_data, test_data, save_dir, image_dir):
    """Run every split through its transform once and save the result as ``.pt`` files.

    Each split is wrapped in a ``SkinCancerDataset``, iterated from start to
    end, and stored with ``torch.save`` as an ``(images, labels)`` tuple: the
    stacked image tensors and a tensor built from the labels. The training
    split uses the training transform; validation and test share the other one.

    Args:
        train_data, val_data, test_data: Metadata frames for the three splits.
        save_dir: Output directory; created if it does not exist.
        image_dir: Directory the datasets read their images from.
    """
    train_transform, val_test_transform = get_data_transforms()

    # (output file, metadata, transform) for each split, in processing order.
    split_table = (
        ("train_dataset.pt", train_data, train_transform),
        ("val_dataset.pt", val_data, val_test_transform),
        ("test_dataset.pt", test_data, val_test_transform),
    )
    prepared = [
        (file_name, SkinCancerDataset(image_dir, frame, transform=pipeline))
        for file_name, frame, pipeline in split_table
    ]

    os.makedirs(save_dir, exist_ok=True)

    for file_name, dataset in prepared:
        # Iterating materializes every sample; the images are already tensors
        # because each pipeline ends with ToTensorV2.
        samples = list(dataset)
        stacked_images = torch.stack([sample[0] for sample in samples])
        label_tensor = torch.tensor([sample[1] for sample in samples])
        torch.save((stacked_images, label_tensor), os.path.join(save_dir, file_name))

    print(f"Datasets saved to {save_dir}")

In [None]:
# Execution flow: take the locations from Config, split, then transform and save.
metadata_path, image_dir, save_dir = (
    Config.METADATA_DIR,
    Config.DATA_DIR,
    Config.SAVE_DIR,
)

# Step 1: split the metadata into train / validation / test frames.
train_data, val_data, test_data = split_data(metadata_path=metadata_path, image_dir=image_dir)

# Step 2: transform each split and write it to save_dir.
process_and_save_datasets(train_data, val_data, test_data, save_dir=save_dir, image_dir=image_dir)

In [None]:
# Report how many samples of each class landed in every split.
for _split_name, _split_frame in (
    ("Train", train_data),
    ("Validation", val_data),
    ("Test", test_data),
):
    print(f"{_split_name} class distribution:\n", _split_frame['label'].value_counts())