In [1]:
import os
import json
import shutil
import pandas as pd
from tqdm import tqdm

In [21]:
def _extract_info(file_path):
    """Read one label JSON file and return the fields needed for the table.

    Args:
        file_path: Path to a JSON file containing a ``description`` object
            (``image``, ``height``, ``width``) and an ``annotations`` object
            (``disease``, ``risk``, ``area``, ``points``).

    Returns:
        dict with the keys ``filename``, ``height``, ``width``, ``diseaseID``,
        ``risk``, ``classID``, ``xtl``, ``ytl``, ``xbr``, ``ybr`` (in that
        order). Only the first entry of ``annotations['points']`` is used.

    Raises:
        KeyError, IndexError, TypeError: if the document does not have the
            structure described above.
        json.JSONDecodeError: if the file is not valid JSON.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform's locale
    # encoding, which is not UTF-8 everywhere and breaks non-ASCII content.
    with open(file_path, 'r', encoding='utf-8') as file:
        json_data = json.load(file)

    description = json_data['description']
    annotations = json_data['annotations']
    # Only the first entry of the points list is kept.
    first_box = annotations['points'][0]

    # Keep only the fields that are needed.
    return dict(
        filename=description['image'],
        height=description['height'],
        width=description['width'],
        diseaseID=annotations['disease'],
        risk=annotations['risk'],
        classID=annotations['area'],
        xtl=first_box['xtl'],
        ytl=first_box['ytl'],
        xbr=first_box['xbr'],
        ybr=first_box['ybr'],
    )

def make_df(folder_path):
    """Build a DataFrame with one row per JSON label file under a folder.

    The folder is searched recursively; every file whose name ends in
    ``.json`` is parsed with ``_extract_info``.

    Args:
        folder_path: Root directory to search.

    Returns:
        pandas.DataFrame with one row per JSON file, in ``os.walk`` order.
        The frame is empty when no JSON file is found.
    """
    # Walk the tree once and keep only the JSON files. Counting just these
    # keeps the progress-bar total equal to the number of files actually
    # processed, even when folders also contain other file types.
    json_paths = [
        os.path.join(root, filename)
        for root, _, files in os.walk(folder_path)
        for filename in files
        if filename.endswith('.json')
    ]

    # tqdm takes its total from the length of the list.
    data_lists = [
        _extract_info(file_path)
        for file_path in tqdm(json_paths, desc='Pricessing JSON files')
    ]

    return pd.DataFrame(data_lists)

In [22]:
# Label directories for each split/class combination.
# NOTE(review): these values look like placeholders — replace with real paths.
t_path1, t_path2 = "training_normal_label_path", "training_disease_label_path"
v_path1, v_path2 = "validation_normal_label_path", "validation_disease_label_path"

# Parse each directory into its own DataFrame. The calls are kept as separate
# statements so a failure in a later one does not discard earlier results.
t_df1 = make_df(t_path1)
t_df2 = make_df(t_path2)
v_df1 = make_df(v_path1)
v_df2 = make_df(v_path2)

Pricessing JSON files: 100%|██████████| 19328/19328 [08:51<00:00, 36.38it/s]
Pricessing JSON files: 100%|██████████| 851/851 [00:25<00:00, 32.96it/s]
Pricessing JSON files: 100%|██████████| 336/336 [00:10<00:00, 33.46it/s]
Pricessing JSON files: 100%|██████████| 106/106 [00:03<00:00, 33.69it/s]


In [29]:
# Write each label table to its CSV file, without the row index.
for frame, csv_name in (
    (t_df1, 't_정상.csv'),
    (t_df2, 't_질병.csv'),
    (v_df1, 'v_정상.csv'),
    (v_df2, 'v_질병.csv'),
):
    frame.to_csv(csv_name, index=False)

# 데이터 복사

In [2]:
# Read the CSV files back in.
t_normal, t_disease, v_normal, v_disease = (
    pd.read_csv(csv_name)
    for csv_name in ('t_정상.csv', 't_질병.csv', 'v_정상.csv', 'v_질병.csv')
)

In [4]:
# Draw a reproducible sample (fixed random_state) of 191 rows whose classID is 3,
# then renumber the rows from 0, dropping the old index.
t_normal_sample = (
    t_normal[t_normal['classID'] == 3]
    .sample(n=191, random_state=42)
    .reset_index(drop=True)
)

In [5]:
# Display the sampled rows (last expression of the cell, so it is rendered).
t_normal_sample

Unnamed: 0,filename,height,width,diseaseID,risk,classID,xtl,ytl,xbr,ybr
0,V006_77_0_00_10_03_12_0_c14_20201224_0054_S01_...,4032,1816,0,0,3,535,1681,1403,2666
1,V006_77_0_00_10_03_11_0_c14_20201229_0018_S01_...,4032,1816,0,0,3,221,1791,1607,3018
2,V006_77_0_00_10_03_12_0_c22_20201222_0080_S01_...,4032,3024,0,0,3,1077,1360,2632,3367
3,V006_77_0_00_10_03_12_0_c18_20201221_0046_S01_...,4032,3024,0,0,3,308,705,2252,2968
4,V006_77_0_00_10_03_11_0_c39_20201209_0158_S01_...,4032,3024,0,0,3,341,1441,2535,3732
...,...,...,...,...,...,...,...,...,...,...
186,V006_77_0_00_10_03_11_0_b08_20201210_0361_S01_...,4000,1800,0,0,3,71,614,1631,2386
187,V006_77_0_00_10_03_12_0_c34_20201222_0479_S01_...,4032,3024,0,0,3,623,1331,2618,3308
188,V006_77_0_00_10_03_12_0_c22_20201222_0073_S01_...,3024,4032,0,0,3,1335,831,3434,2671
189,V006_77_0_00_10_03_12_0_b09_20201222_0582_S01_...,1860,4032,0,0,3,1171,160,2720,1489


In [6]:
src_path = "source_images_path"
dst_path = "destination_forder_path"

# shutil.copy does not create missing parent directories, so make sure the
# destination folder exists before the loop starts.
os.makedirs(dst_path, exist_ok=True)

# File names of the sampled rows; these are the images to copy.
images_to_copy = t_normal_sample['filename'].tolist()

# Copy each listed file from src_path to dst_path under the same name.
for img in tqdm(images_to_copy, desc="Copying images", unit="image"):
    src_file = os.path.join(src_path, img)
    dst_file = os.path.join(dst_path, img)

    # Missing sources are reported and skipped instead of aborting the run.
    if os.path.exists(src_file):
        shutil.copy(src_file, dst_file)
    else:
        print(f"Warning: File not found - {src_file}")

print("정상 image 복사 완료")

Copying images: 100%|██████████| 191/191 [01:37<00:00,  1.96image/s]

정상 image 복사 완료





In [7]:
src_path = "source_images_path"
dst_path = "destination_forder_path"

# shutil.copy does not create missing parent directories, so make sure the
# destination folder exists before the loop starts.
os.makedirs(dst_path, exist_ok=True)

# Label file names: each sampled image file name with '.json' appended.
labels_to_copy = [filename + '.json' for filename in t_normal_sample['filename'].tolist()]

# Copy each listed label file from src_path to dst_path under the same name.
# The progress bar is labelled for labels (it previously said "images").
for lbl in tqdm(labels_to_copy, desc="Copying labels", unit="label"):
    src_file = os.path.join(src_path, lbl)
    dst_file = os.path.join(dst_path, lbl)

    # Missing sources are reported and skipped instead of aborting the run.
    if os.path.exists(src_file):
        shutil.copy(src_file, dst_file)
    else:
        print(f"Warning: File not found - {src_file}")

print("정상 label 복사 완료")

Copying images: 100%|██████████| 191/191 [00:33<00:00,  5.62image/s]

정상 label 복사 완료



