In [1]:
# 모든 파일 리스팅
import pandas as pd
from glob import glob
from pathlib import Path
from tqdm import tqdm
import random

BASE = '/data/kts123/aihub/reid'
# ai hub 의 데이터는 train 과 valid 로 나누어져 있다.
# 동일한 사람이 두 셋에 동시에 나오지 않는다.
train_imgs = glob(f'{BASE}/train_imgs/*.png')
test_imgs = glob(f'{BASE}/val_imgs/*.png')

# 이미지 갯수를 출력해 본다.
len(train_imgs), len(test_imgs)

(149019, 119361)

In [9]:
# ai hub 의 train 이미지를 train/val로 쓰고
#           val 이미지를 test로 쓴다.
def to_label(imgs, phase):
    label = [[Path(e).stem, e] for e in imgs]
    label = [s.split('_') + [n] for s, n in label]
    df = pd.DataFrame(label, columns=['INOUT', 'PID', 'SN', 'D1', 'D2', 'NAME'])
    df['K_SN'] = df['PID'] + '_' + df['SN']
    df['K_D1'] = df['K_SN'] + '_' + df['D1']
    df['K_D2'] = df['K_D1'] + '_' + df['D2'] 
    df['CLS_NAME'] = df['K_SN']
    df['PHASE'] = phase
    return df

In [11]:
# train 과 test를 DataFrame 포멧으로 만든다. 
df_train = to_label(train_imgs, 'train')
df_test = to_label(test_imgs, 'test')

df_test
# 하나의 DataFrame 으로 통합한다.
df_total = pd.concat([df_train, df_test])

# 통합된 데이터 프레임을 출력해 본다
df_total

Unnamed: 0,INOUT,PID,SN,D1,D2,NAME,K_SN,K_D1,K_D2,CLS_NAME,PHASE
0,OUT,H00033,SN2,207044,7182,/data/kts123/aihub/reid/train_imgs/OUT_H00033_...,H00033_SN2,H00033_SN2_207044,H00033_SN2_207044_7182,H00033_SN2,train
1,OUT,H00022,SN1,207235,10256,/data/kts123/aihub/reid/train_imgs/OUT_H00022_...,H00022_SN1,H00022_SN1_207235,H00022_SN1_207235_10256,H00022_SN1,train
2,OUT,H00045,SN1,206673,41063,/data/kts123/aihub/reid/train_imgs/OUT_H00045_...,H00045_SN1,H00045_SN1_206673,H00045_SN1_206673_41063,H00045_SN1,train
3,IN,H00338,SN1,091610,39906,/data/kts123/aihub/reid/train_imgs/IN_H00338_S...,H00338_SN1,H00338_SN1_091610,H00338_SN1_091610_39906,H00338_SN1,train
4,OUT,H00496,SN3,092504,14548,/data/kts123/aihub/reid/train_imgs/OUT_H00496_...,H00496_SN3,H00496_SN3_092504,H00496_SN3_092504_14548,H00496_SN3,train
...,...,...,...,...,...,...,...,...,...,...,...
119356,OUT,H00159,SN2,081804,32021,/data/kts123/aihub/reid/val_imgs/OUT_H00159_SN...,H00159_SN2,H00159_SN2_081804,H00159_SN2_081804_32021,H00159_SN2,test
119357,IN,H00702,SN2,101910,32847,/data/kts123/aihub/reid/val_imgs/IN_H00702_SN2...,H00702_SN2,H00702_SN2_101910,H00702_SN2_101910_32847,H00702_SN2,test
119358,OUT,H00203,SN4,082003,14157,/data/kts123/aihub/reid/val_imgs/OUT_H00203_SN...,H00203_SN4,H00203_SN4_082003,H00203_SN4_082003_14157,H00203_SN4,test
119359,IN,H00752,SN1,102407,14345,/data/kts123/aihub/reid/val_imgs/IN_H00752_SN1...,H00752_SN1,H00752_SN1_102407,H00752_SN1_102407_14345,H00752_SN1,test


In [None]:
# 통계 정보를 출력해 본다
df_total[['SN','PID','K_SN', 'K_D1', 'K_D2', 'CLS_NAME']].describe()

In [None]:
#클래스를 인덱스화 한다.
kls_idx = {e:i for i, e in enumerate(sorted(list(set(df_total['CLS_NAME'].unique()))))}
df_total['KLS_IDX'] = df_total['CLS_NAME'].map(kls_idx) 
df_total


In [None]:
# 무작위로 섞은 후 클래스 인덱스로 그룹화 한다.
df_train_g = df_total.query('PHASE != "test"').sample(frac=1.0).groupby('KLS_IDX')

# 각 클래스에서 맨 마지막 30개 이미지를 val 셋으로 옮긴다.
df_trains, df_vals = [], []
for KLS_IDX, df_i in df_train_g:
    df_trains.append(df_i.iloc[:-30])
    df_vals.append(df_i.iloc[-30:])

In [None]:
# train/val/test 를 하나의 데이터 프레임으로 통합한다.
df_train = pd.concat(df_trains)
df_val = pd.concat(df_vals)
df_val['PHASE'] = 'val'
df_test = df_total.query('PHASE == "test"')
df_total = pd.concat([df_train, df_val, df_test])
df_total

In [None]:
# 생성한 데이터 프레임을 csv 형태로 저장한다.
df_total.to_csv('ai_hub_reid.csv', index=False)

In [None]:
# csv 파일에서 특정 phase만 추출하여 저장하는 유틸 함수
def to_label_txt(df, phase):
    txt = df_total.query('PHASE == @phase')
    txt = txt[['NAME', 'KLS_IDX']]
    
    file_name = f'img_list_{phase}.txt'
    txt.to_csv(file_name, index=False)
    return file_name 



In [None]:
# train/val/test 이미지 리스트를 각각 저장
txt_train = to_label_txt(df_total, 'train')
txt_val = to_label_txt(df_total, 'val')
txt_test = to_label_txt(df_total, 'test')

In [None]:
# 이미지 파일이 정상인지 체크
import cv2
from tqdm.auto import tqdm

df = pd.read_csv(txt_train)
imgs = df.to_dict('records')
for d in tqdm(imgs):
    try: 
        im = cv2.imread(d['NAME'])
    except Exception as e:
        print(err, e)
    

In [None]:
# 이미지 샘플 show
from PIL import Image
import numpy as np

item = df_total.sample(n=1).iloc[0]
path = item['NAME']
im = Image.open(path)

In [None]:
# 정사각형 이미지가 되도록 패딩
def rect_pad(pil_image):
    import numpy as np
    from PIL import Image
    
    im = np.array(pil_image) 
    h, w = im.shape[:2]
    size = max(h, w)
    pad_h, pad_w = size-h, size -w
    im = np.pad(im, 
           ((pad_h//2, pad_h-pad_h//2), (pad_w//2, pad_w - pad_w//2), (0,0)), 
           mode='constant', constant_values=0)
    return Image.fromarray(im)
rect_pad(im)