In [None]:
import os, sys
from dotenv import load_dotenv
from lib.seed import seed_prefix

# Read the project's .env file so later cells can look up their paths from the environment.
load_dotenv()

# Project helper called with a fixed seed of 42 (presumably seeds the RNGs for reproducibility — confirm in lib.seed).
seed_prefix(seed=42)

### Stratified Sampling

In [None]:
from lib.dataset import data_split, k_fold_data_split
import pandas as pd 
import numpy as np

# Use os.environ[...] instead of os.getenv(...): a missing .env entry now fails
# immediately with a KeyError naming the variable, rather than handing None to
# read_csv / the dataset. This also matches the DATA_DIR lookup used further down.
data_df = pd.read_csv(os.environ['DATASHEET_PATH'])
data_dir = os.environ['DATA_DIR']

# First split: hold out the test set; `df` is what remains for train/valid.
df, test_df = data_split(data_df, split_num = 5)

#%% Not K-Fold 
# train_df, valid_df = data_split(train_df, split_num = 5) # 80% train, 20% valid 

#%% K-Fold 
# Each element of `folds` is a collection of positional row indices into `df`
# (it is consumed with df.iloc below); roughly 80% train / 20% valid per fold.
folds = k_fold_data_split(df, n_splits= 5)

print("Fold N : Train, Valid, Test")
print('-'*30)
for fdx, valid_idx in enumerate(folds):
    # Training rows are the union of every fold except the current validation fold.
    train_idx = [idx for i, fold in enumerate(folds) if i != fdx for idx in fold]

    train_df = df.iloc[train_idx].reset_index(drop=True)
    valid_df = df.iloc[valid_idx].reset_index(drop=True)

    print(f"Fold {fdx+1} : {len(train_df)}, {len(valid_df)}, {len(test_df)}")

# NOTE: after the loop, train_df / valid_df hold the LAST fold; later cells use them as-is.
test_df.to_csv('./data/before_datasheet_test.csv', encoding = 'utf-8-sig', index = False)

In [None]:
Fold N : Train, Valid, Test
------------------------------
Fold 1 : 1887, 489, 592
Fold 2 : 1896, 480, 592
Fold 3 : 1922, 454, 592
Fold 4 : 1936, 440, 592
Fold 5 : 1863, 513, 592


In [None]:
# Label distribution of the training split, per image and per patient.
# Per the original author's note, rows sharing a PID share a label, so keeping
# the first row of each PID gives one label per patient.
# drop_duplicates(subset='PID') selects those rows directly; the previous
# iloc[... .index] form used index labels as positions, which is only correct
# for a 0..n-1 index.
명수_count = train_df.drop_duplicates(subset='PID')['label|0:양성, 1:중간형, 2:악성'].value_counts()
장수_count = train_df['label|0:양성, 1:중간형, 2:악성'].value_counts()

# Series.sum() replaces np.sum(<generator>), which NumPy has deprecated.
print(f"장수 : 총 {장수_count.sum()}장")
for label, num in 장수_count.items():
    print(f"{label} : {num}장")

# Patient counts are printed with the person unit (명); they previously reused the image unit (장).
print(f"명수 : 총 {명수_count.sum()}명")
for label, num in 명수_count.items():
    print(f"{label} : {num}명")

In [None]:
# Label distribution of the validation split, per image and per patient.
# Per the original author's note, rows sharing a PID share a label, so keeping
# the first row of each PID gives one label per patient.
# drop_duplicates(subset='PID') selects those rows directly; the previous
# iloc[... .index] form used index labels as positions, which is only correct
# for a 0..n-1 index.
명수_count = valid_df.drop_duplicates(subset='PID')['label|0:양성, 1:중간형, 2:악성'].value_counts()
장수_count = valid_df['label|0:양성, 1:중간형, 2:악성'].value_counts()

# Series.sum() replaces np.sum(<generator>), which NumPy has deprecated.
print(f"장수 : 총 {장수_count.sum()}장")
for label, num in 장수_count.items():
    print(f"{label} : {num}장")

# Patient counts are printed with the person unit (명); they previously reused the image unit (장).
print(f"명수 : 총 {명수_count.sum()}명")
for label, num in 명수_count.items():
    print(f"{label} : {num}명")

In [None]:
# Label distribution of the held-out test split, per image and per patient.
# Per the original author's note, rows sharing a PID share a label, so keeping
# the first row of each PID gives one label per patient.
# test_df is used exactly as returned by data_split (no reset_index is applied
# in this notebook), so its index is not guaranteed to be positional. Selecting
# with drop_duplicates(subset='PID') avoids feeding index labels to iloc.
명수_count = test_df.drop_duplicates(subset='PID')['label|0:양성, 1:중간형, 2:악성'].value_counts()
장수_count = test_df['label|0:양성, 1:중간형, 2:악성'].value_counts()

# Series.sum() replaces np.sum(<generator>), which NumPy has deprecated.
print(f"장수 : 총 {장수_count.sum()}장")
for label, num in 장수_count.items():
    print(f"{label} : {num}장")

# Patient counts are printed with the person unit (명); they previously reused the image unit (장).
print(f"명수 : 총 {명수_count.sum()}명")
for label, num in 명수_count.items():
    print(f"{label} : {num}명")
    
    


#### Dataset 제작

In [None]:
from lib.dataset import PCOS_Dataset
import torchvision.transforms as T
from torchvision.transforms import v2
from lib.augmentation import SpeckleNoise

# Training dataset: built from the current train_df, with augmentation.
# Experiment notes kept from the original cell ("-> number" is the value the author recorded):
#   v2.CenterCrop(224) instead of the random crop          -> 0.7977
#   v2.RandomResizedCrop(224) (current)                    -> 0.8089
#   T.Normalize(ImageNet mean/std) appended at the end     -> 0.7119
# Disabled alternatives: RandomEqualize(p=0.5) (histogram equalization) and
# v2.RandomApply([v2.GaussianBlur(kernel_size=(5, 9), sigma=(0.1, 5.))], p=0.5).
train_dataset = PCOS_Dataset(
    data_filenames=train_df['filename'],
    data_dir_path=data_dir,
    labels=train_df['label|0:양성, 1:중간형, 2:악성'],
    transform=v2.Compose([
        # Geometric augmentation: resize to 296x296, then rotate / flip / random-resized-crop to 224.
        v2.Resize((296, 296)),
        v2.RandomRotation(degrees=15),
        v2.RandomHorizontalFlip(p=0.5),
        v2.RandomVerticalFlip(p=0.5),
        v2.RandomResizedCrop(224),

        # Grayscale with 3 output channels (keeps an RGB-shaped input), then tensor conversion.
        v2.Grayscale(num_output_channels=3),
        v2.ToTensor(),

        # Gaussian noise is wrapped in RandomApply with p=0.5.
        v2.RandomApply(
            [v2.GaussianNoise(mean=0, sigma=0.1, clip=True)],
            p=0.5,
        ),
        # SpeckleNoise is a direct member of the Compose list, i.e. NOT wrapped in RandomApply.
        # [Refer] SpeckleNoise : Automatic ovarian tumors recognition system based on
        # ensemble convolutional neural network with ultrasound imaging
        SpeckleNoise(noise_level=0.05),
    ]),
)

# Validation dataset: no augmentation, only resize straight to 224x224, grayscale and tensor conversion.
valid_dataset = PCOS_Dataset(
    data_filenames=valid_df['filename'],
    data_dir_path=data_dir,
    labels=valid_df['label|0:양성, 1:중간형, 2:악성'],
    transform=v2.Compose([
        v2.Resize((224, 224)),
        v2.Grayscale(num_output_channels=3),
        v2.ToTensor(),
    ]),
)

#### Loader 적용

In [None]:
from torch.utils.data import DataLoader

# Batches of 16; only the training loader shuffles.
train_loader = DataLoader(dataset=train_dataset, batch_size=16, shuffle=True)
valid_loader = DataLoader(dataset=valid_dataset, batch_size=16, shuffle=False)

# Pull one training batch; X and y are reused by the preview plot in the next cell.
X, y = next(iter(train_loader))

# Display the labels of that batch.
y

In [None]:
import matplotlib.pyplot as plt

# Preview the training batch (X, y) on a 4x4 grid to inspect the augmentations.
fig, axes = plt.subplots(4, 4, figsize = (12, 12))
fig.suptitle('PCOS Dataset Augmentation Example', fontsize = 16)

# Hide every axis up front so grid cells without an image stay blank.
for ax in axes.flat:
    ax.axis('off')

# axes.flat walks the grid row by row (same order as axes[i//4, i%4]).
# zip() stops at the shortest input, so a batch with fewer than 16 samples
# no longer raises IndexError.
for ax, image, label in zip(axes.flat, X, y):
    # permute(1, 2, 0) moves the first dimension last before handing the array to imshow
    # (presumably channel-first -> channel-last; confirm against PCOS_Dataset output).
    ax.imshow(image.cpu().permute(1, 2, 0).numpy())
    ax.set_title(f'Label: {label.item()}')

plt.tight_layout()
plt.show()

In [None]:
from dotenv import load_dotenv
load_dotenv()
import os, sys 
from glob import glob 

from PIL import Image

# Crop every file in DATA_DIR to its central 80% (10% trimmed from each side)
# and save it under the same path with "Dataset" replaced by "Dataset_center_crop".
# NOTE: if the path does not contain "Dataset", the replace is a no-op and the
# source file is overwritten with its cropped version.
for img_path in glob(os.environ["DATA_DIR"]+'/*'):
    save_path = img_path.replace("Dataset", "Dataset_center_crop")

    # Create the destination folder on first use instead of failing in save().
    os.makedirs(os.path.dirname(save_path) or '.', exist_ok=True)

    # The context manager closes the source file handle after each image.
    with Image.open(img_path) as img:
        width, height = img.size
        # Crop box is (left, upper, right, lower).
        cropped = img.crop((width * 0.1, height * 0.1, width * 0.9, height * 0.9))
        cropped.save(save_path)

In [1]:


import sys, os 
from glob import glob

import pandas as pd 

# Every entry in the backup dataset folder (absolute path on the author's machine).
filepaths = glob("/mnt/hdd/octc/BACKUP/Dataset/*")

# Datasheet mapping each filename to its label; displayed as the cell output.
df = pd.read_csv("./data/datasheet.csv")
df

Unnamed: 0,filename,"label|0:양성, 1:중간형, 2:악성"
0,0_R001_00001,1
1,0_R004_00001,2
2,0_R006_00001,2
3,0_R007_00001,2
4,0_R008_00001,2
...,...,...
2963,2_R999_00003,1
2964,2_R999_00004,1
2965,2_R999_00005,1
2966,2_R999_00006,1


## 크롭된 데이터셋

In [5]:
import shutil
from PIL import Image

# Destination folder and diagnosis name for labels 0 and 1; any other label
# goes to the malignant folder (same as the original if / elif / else).
label_targets = {
    0: ("/home/eiden/eiden/otion_project/kohya_ss/dataset/images/ovarian_cyst_data/1_ovarian cyst benigns/", "benign"),
    1: ("/home/eiden/eiden/otion_project/kohya_ss/dataset/images/ovarian_cyst_data/2_ovarian cyst borderlines/", "borderline"),
}
fallback_target = ("/home/eiden/eiden/otion_project/kohya_ss/dataset/images/ovarian_cyst_data/3_ovarian cyst malignants/", "malignant")

# Build the filename -> label lookup once instead of filtering the whole
# DataFrame for every file. drop_duplicates keeps the first row per filename,
# matching the previous `.values[0]` choice.
label_by_filename = (
    df.drop_duplicates(subset='filename')
      .set_index('filename')['label|0:양성, 1:중간형, 2:악성']
      .to_dict()
)

for filepath in filepaths:
    filename = os.path.basename(filepath).replace('.png', '')

    # Files that are not listed in the datasheet used to abort the loop with an
    # IndexError; report and skip them instead.
    if filename not in label_by_filename:
        print(f"[skip] {filename} : not found in datasheet")
        continue

    save_to_dir, dignosis_type = label_targets.get(label_by_filename[filename], fallback_target)
    os.makedirs(save_to_dir, exist_ok=True)

    # Trim 10% from each side (keep the central 80%); the context manager
    # closes the source file handle after each image.
    with Image.open(filepath) as img:
        width, height = img.size
        # Crop box is (left, upper, right, lower).
        cropped = img.crop((width * 0.1, height * 0.1, width * 0.9, height * 0.9))
        cropped.save(os.path.join(save_to_dir, f"{filename}.png"))

    # Optional caption file next to each image (disabled):
    # with open(f"{save_to_dir}/{filename}.txt", 'w') as f:
    #     f.write(f"An ovarian ultrasound image showing {dignosis_type} characteristic")
        

## 샘플링 데이터에서 weight를 통해 데이터를 가지고 오기 

In [1]:
import pandas as pd 

# Load the sample datasheet (filename + label columns, see the output below)
# from an absolute path on the author's machine, then display it.
df = pd.read_csv("/home/eiden/eiden/otion_project/stable-diffusion-webui/outputs/sample_datasheet.csv")
df

Unnamed: 0,filename,"label|0:양성, 1:중간형, 2:악성"
0,benign_0,0
1,benign_1,0
2,benign_10,0
3,benign_100,0
4,benign_1000,0
...,...,...
5954,malignant_995,2
5955,malignant_996,2
5956,malignant_997,2
5957,malignant_998,2


In [7]:


from typing import Sequence


def sample_weight_distribution(sample_df:pd.DataFrame, sample_weight:Sequence[float] = (1.0, 1.0, 1.0), seed:int = 42) -> pd.DataFrame:
    """
    Randomly sample rows from each of the three label classes (0, 1, 2).

    Args:
        sample_df: DataFrame containing the 'label|0:양성, 1:중간형, 2:악성' column.
        sample_weight: Three non-negative factors, in class order 0, 1, 2.
            For each class, ``int(class_size * weight)`` rows are drawn.
            A weight of 1.0 draws as many rows as the class has; a weight
            above 1.0 oversamples the class by drawing with replacement.
        seed: ``random_state`` passed to every per-class draw.

    Returns:
        The sampled rows of class 0, then class 1, then class 2, concatenated
        with their original index labels.

    Raises:
        ValueError: if ``sample_weight`` does not have exactly three entries
            or contains a negative value.
    """
    label_column = 'label|0:양성, 1:중간형, 2:악성'
    class_labels = (0, 1, 2)

    if len(sample_weight) != len(class_labels):
        raise ValueError(
            f"sample_weight must have {len(class_labels)} entries, got {len(sample_weight)}"
        )
    if any(weight < 0 for weight in sample_weight):
        raise ValueError("sample_weight entries must be non-negative")

    sampled_parts = []
    for class_label, weight in zip(class_labels, sample_weight):
        class_df = sample_df[sample_df[label_column] == class_label]
        sample_num = int(len(class_df) * weight)

        # Without replacement, DataFrame.sample cannot return more rows than
        # exist, so switch to replacement only when oversampling is requested.
        sampled_parts.append(
            class_df.sample(
                n=sample_num,
                replace=sample_num > len(class_df),
                random_state=seed,
            )
        )

    return pd.concat(sampled_parts, axis=0)
    

In [8]:
# Draw int(0.5 * class size) rows from each of the three classes, using the function's default seed (42).
sample_weight_distribution(df, sample_weight = [0.5, 0.5, 0.5])

Unnamed: 0,filename,"label|0:양성, 1:중간형, 2:악성"
1860,benign_873,0
353,benign_1315,0
1333,benign_399,0
905,benign_1812,0
1289,benign_359,0
...,...,...
4300,malignant_1287,2
3984,malignant_1001,2
4201,malignant_1198,2
5338,malignant_44,2
