In [3]:
!pip install tqdm



In [29]:
import os
import sys
from glob import glob
import numpy as np
import pandas as pd
import cv2
from PIL import Image
from tqdm import tqdm
from time import time
import json
import matplotlib.pyplot as plt
import seaborn as sns
import multiprocessing as mp

In [54]:
# Dataset root; the image file names read from the JSON files are joined onto it.
data_dir = "/opt/ml/segmentation/input/data/"
# JSON listings for the full training set and the validation set. Their parsed
# contents are loaded further down as json_all_data / json_eval_data.
json_all_path = os.path.join(data_dir, "train_all.json")
json_eval_path = os.path.join(data_dir, "val.json")

## Image size, RGB mean, and standard deviation

In [35]:
def get_img_stats(img_dir, img_ids):
    """
    Collect the size and per-channel RGB mean / standard deviation of every image.

    Each file is decoded as 3-channel RGB before it is measured, so grayscale,
    palette and RGBA files produce the same length-3 statistics as plain RGB
    files instead of failing on shape unpacking or yielding 4-channel rows.

    Args:
        img_dir: Directory that the entries of ``img_ids`` are relative to.
        img_ids: Iterable of image file paths relative to ``img_dir``.

    Returns:
        img_info: dict of four parallel lists with one entry per image:
            'heights' and 'widths' in pixels, and 'means' and 'stds' as
            length-3 arrays (R, G, B order) on the 0-255 scale.
            The statistics are per image; nothing is aggregated here.
    """
    img_info = dict(heights=[], widths=[], means=[], stds=[])
    for img_id in tqdm(img_ids):
        # The context manager closes the file handle as soon as the pixels
        # have been copied into the array, instead of waiting for GC.
        with Image.open(os.path.join(img_dir, img_id)) as pil_img:
            img = np.array(pil_img.convert('RGB'))
        # After the RGB conversion the array is always (height, width, 3).
        h, w = img.shape[:2]
        img_info['heights'].append(h)
        img_info['widths'].append(w)
        # Reduce over the two spatial axes, leaving one value per channel.
        img_info['means'].append(img.mean(axis=(0, 1)))
        img_info['stds'].append(img.std(axis=(0, 1)))
    return img_info

In [51]:
# Train: parse the full training JSON, then collect the file name of every
# entry under its "images" key.
with open(json_all_path, 'r') as j:
    json_all_data = json.load(j)
img_all_file = [entry["file_name"] for entry in json_all_data["images"]]

# Eval: same procedure for the validation JSON.
with open(json_eval_path, 'r') as j:
    json_eval_data = json.load(j)
img_eval_file = [entry["file_name"] for entry in json_eval_data["images"]]

In [57]:
# Gather per-image size and RGB statistics for both splits. Every image is
# opened and decoded, so this cell is comparatively slow.
img_all_info = get_img_stats(img_dir=data_dir, img_ids=img_all_file)
img_eval_info = get_img_stats(img_dir=data_dir, img_ids=img_eval_file)

100%|██████████| 3272/3272 [01:45<00:00, 30.59it/s]
100%|██████████| 655/655 [00:21<00:00, 30.77it/s]


### Train

In [58]:
# Summary of the training split, emitted as one print call with one item per
# output line (empty strings produce the blank separator lines).
# The per-image means/stds are on the 0-255 scale, hence the division by 255.
# Note: the reported standard deviation is the average of the per-image
# standard deviations, not the deviation over all pixels of the split.
print(
    f'Total number of images is {len(img_all_file)}',
    '',
    f'Minimum height for dataset is {np.min(img_all_info["heights"])}',
    f'Maximum height for dataset is {np.max(img_all_info["heights"])}',
    f'Average height for dataset is {int(np.mean(img_all_info["heights"]))}',
    f'Minimum width for dataset is {np.min(img_all_info["widths"])}',
    f'Maximum width for dataset is {np.max(img_all_info["widths"])}',
    f'Average width for dataset is {int(np.mean(img_all_info["widths"]))}',
    '',
    f'RGB Mean: {np.mean(img_all_info["means"], axis=0) / 255.}',
    f'RGB Standard Deviation: {np.mean(img_all_info["stds"], axis=0) / 255.}',
    sep='\n',
)

Total number of images is 3272

Minimum height for dataset is 512
Maximum height for dataset is 512
Average height for dataset is 512
Minimum width for dataset is 512
Maximum width for dataset is 512
Average width for dataset is 512

RGB Mean: [0.46009655 0.43957878 0.41827092]
RGB Standard Deviation: [0.2108204  0.20766491 0.21656131]


### Eval

In [59]:
# Summary of the evaluation split, emitted as one print call with one item per
# output line (empty strings produce the blank separator lines).
# The per-image means/stds are on the 0-255 scale, hence the division by 255.
# Note: the reported standard deviation is the average of the per-image
# standard deviations, not the deviation over all pixels of the split.
print(
    f'Total number of images is {len(img_eval_file)}',
    '',
    f'Minimum height for dataset is {np.min(img_eval_info["heights"])}',
    f'Maximum height for dataset is {np.max(img_eval_info["heights"])}',
    f'Average height for dataset is {int(np.mean(img_eval_info["heights"]))}',
    f'Minimum width for dataset is {np.min(img_eval_info["widths"])}',
    f'Maximum width for dataset is {np.max(img_eval_info["widths"])}',
    f'Average width for dataset is {int(np.mean(img_eval_info["widths"]))}',
    '',
    f'RGB Mean: {np.mean(img_eval_info["means"], axis=0) / 255.}',
    f'RGB Standard Deviation: {np.mean(img_eval_info["stds"], axis=0) / 255.}',
    sep='\n',
)

Total number of images is 655

Minimum height for dataset is 512
Maximum height for dataset is 512
Average height for dataset is 512
Minimum width for dataset is 512
Maximum width for dataset is 512
Average width for dataset is 512

RGB Mean: [0.46034062 0.43985595 0.4168375 ]
RGB Standard Deviation: [0.21103533 0.20915556 0.21880394]


## EDA