In [29]:
# 모든 파일 리스팅
import pandas as pd
from glob import glob
from pathlib import Path
from tqdm import tqdm
import random

train_imgs = glob('train_imgs/*.png')
test_imgs = glob('val_imgs/*.png')
len(train_imgs), len(test_imgs)

(149019, 119361)

In [30]:
# ai hub 의 train 이미지를 train/val로 쓰고
#           val 이미지를 test로 쓴다.
def to_label(imgs, phase):
    label = [[Path(e).stem, e] for e in imgs]
    label = [s.split('_') + [n] for s, n in label]
    df = pd.DataFrame(label, columns=['INOUT', 'PID', 'SN', 'D1', 'D2', 'NAME'])
    df['K_SN'] = df['PID'] + '_' + df['SN']
    df['K_D1'] = df['K_SN'] + '_' + df['D1']
    df['K_D2'] = df['K_D1'] + '_' + df['D2'] 
    df['CLS_NAME'] = df['K_SN']
    df['PHASE'] = phase
    return df
    
df_train = to_label(train_imgs, 'train')
df_test = to_label(test_imgs, 'test')
df_total = pd.concat([df_train, df_test])
df_total

Unnamed: 0,INOUT,PID,SN,D1,D2,NAME,K_SN,K_D1,K_D2,CLS_NAME,PHASE
0,OUT,H00033,SN2,207044,7182,train_imgs/OUT_H00033_SN2_207044_7182.png,H00033_SN2,H00033_SN2_207044,H00033_SN2_207044_7182,H00033_SN2,train
1,OUT,H00022,SN1,207235,10256,train_imgs/OUT_H00022_SN1_207235_10256.png,H00022_SN1,H00022_SN1_207235,H00022_SN1_207235_10256,H00022_SN1,train
2,OUT,H00045,SN1,206673,41063,train_imgs/OUT_H00045_SN1_206673_41063.png,H00045_SN1,H00045_SN1_206673,H00045_SN1_206673_41063,H00045_SN1,train
3,IN,H00338,SN1,091610,39906,train_imgs/IN_H00338_SN1_091610_39906.png,H00338_SN1,H00338_SN1_091610,H00338_SN1_091610_39906,H00338_SN1,train
4,OUT,H00496,SN3,092504,14548,train_imgs/OUT_H00496_SN3_092504_14548.png,H00496_SN3,H00496_SN3_092504,H00496_SN3_092504_14548,H00496_SN3,train
...,...,...,...,...,...,...,...,...,...,...,...
119356,OUT,H00159,SN2,081804,32021,val_imgs/OUT_H00159_SN2_081804_32021.png,H00159_SN2,H00159_SN2_081804,H00159_SN2_081804_32021,H00159_SN2,test
119357,IN,H00702,SN2,101910,32847,val_imgs/IN_H00702_SN2_101910_32847.png,H00702_SN2,H00702_SN2_101910,H00702_SN2_101910_32847,H00702_SN2,test
119358,OUT,H00203,SN4,082003,14157,val_imgs/OUT_H00203_SN4_082003_14157.png,H00203_SN4,H00203_SN4_082003,H00203_SN4_082003_14157,H00203_SN4,test
119359,IN,H00752,SN1,102407,14345,val_imgs/IN_H00752_SN1_102407_14345.png,H00752_SN1,H00752_SN1_102407,H00752_SN1_102407_14345,H00752_SN1,test


In [31]:
df_total[['SN','PID','K_SN', 'K_D1', 'K_D2', 'CLS_NAME']].describe()

Unnamed: 0,SN,PID,K_SN,K_D1,K_D2,CLS_NAME
count,268380,268380,268380,268380,268380,268380
unique,4,1002,1002,8977,268380,1002
top,SN3,H00281,H00281_SN1,H00013_SN3_207041,H00785_SN2_102502_13395,H00281_SN1
freq,67882,330,330,30,1,330


In [32]:
kls_idx = {e:i for i, e in enumerate(sorted(list(set(df_total['CLS_NAME'].unique()))))}
df_total['KLS_IDX'] = df_total['CLS_NAME'].map(kls_idx) 
df_total

Unnamed: 0,INOUT,PID,SN,D1,D2,NAME,K_SN,K_D1,K_D2,CLS_NAME,PHASE,KLS_IDX
0,OUT,H00033,SN2,207044,7182,train_imgs/OUT_H00033_SN2_207044_7182.png,H00033_SN2,H00033_SN2_207044,H00033_SN2_207044_7182,H00033_SN2,train,24
1,OUT,H00022,SN1,207235,10256,train_imgs/OUT_H00022_SN1_207235_10256.png,H00022_SN1,H00022_SN1_207235,H00022_SN1_207235_10256,H00022_SN1,train,15
2,OUT,H00045,SN1,206673,41063,train_imgs/OUT_H00045_SN1_206673_41063.png,H00045_SN1,H00045_SN1_206673,H00045_SN1_206673_41063,H00045_SN1,train,31
3,IN,H00338,SN1,091610,39906,train_imgs/IN_H00338_SN1_091610_39906.png,H00338_SN1,H00338_SN1_091610,H00338_SN1_091610_39906,H00338_SN1,train,318
4,OUT,H00496,SN3,092504,14548,train_imgs/OUT_H00496_SN3_092504_14548.png,H00496_SN3,H00496_SN3_092504,H00496_SN3_092504_14548,H00496_SN3,train,476
...,...,...,...,...,...,...,...,...,...,...,...,...
119356,OUT,H00159,SN2,081804,32021,val_imgs/OUT_H00159_SN2_081804_32021.png,H00159_SN2,H00159_SN2_081804,H00159_SN2_081804_32021,H00159_SN2,test,140
119357,IN,H00702,SN2,101910,32847,val_imgs/IN_H00702_SN2_101910_32847.png,H00702_SN2,H00702_SN2_101910,H00702_SN2_101910_32847,H00702_SN2,test,678
119358,OUT,H00203,SN4,082003,14157,val_imgs/OUT_H00203_SN4_082003_14157.png,H00203_SN4,H00203_SN4_082003,H00203_SN4_082003_14157,H00203_SN4,test,184
119359,IN,H00752,SN1,102407,14345,val_imgs/IN_H00752_SN1_102407_14345.png,H00752_SN1,H00752_SN1_102407,H00752_SN1_102407_14345,H00752_SN1,test,728


In [47]:
df_train_g = df_total.query('PHASE != "test"').sample(frac=1.0).groupby('KLS_IDX')
df_trains, df_vals = [], []
for KLS_IDX, df_i in df_train_g:
    df_trains.append(df_i.iloc[:-30])
    df_vals.append(df_i.iloc[-30:])
df_train = pd.concat(df_trains)
df_val = pd.concat(df_vals)
df_val['PHASE'] = 'val'
df_test = df_total.query('PHASE == "test"')
df_total = pd.concat([df_train, df_val, df_test])
df_total

Unnamed: 0,INOUT,PID,SN,D1,D2,NAME,K_SN,K_D1,K_D2,CLS_NAME,PHASE,KLS_IDX
39920,OUT,H00005,SN1,207234,11372,train_imgs/OUT_H00005_SN1_207234_11372.png,H00005_SN1,H00005_SN1_207234,H00005_SN1_207234_11372,H00005_SN1,train,3
11345,OUT,H00005,SN1,207044,6989,train_imgs/OUT_H00005_SN1_207044_6989.png,H00005_SN1,H00005_SN1_207044,H00005_SN1_207044_6989,H00005_SN1,train,3
58454,OUT,H00005,SN1,207044,7028,train_imgs/OUT_H00005_SN1_207044_7028.png,H00005_SN1,H00005_SN1_207044,H00005_SN1_207044_7028,H00005_SN1,train,3
74988,OUT,H00005,SN1,207123,25780,train_imgs/OUT_H00005_SN1_207123_25780.png,H00005_SN1,H00005_SN1_207123,H00005_SN1_207123_25780,H00005_SN1,train,3
7341,OUT,H00005,SN1,207234,11345,train_imgs/OUT_H00005_SN1_207234_11345.png,H00005_SN1,H00005_SN1_207234,H00005_SN1_207234_11345,H00005_SN1,train,3
...,...,...,...,...,...,...,...,...,...,...,...,...
119356,OUT,H00159,SN2,081804,32021,val_imgs/OUT_H00159_SN2_081804_32021.png,H00159_SN2,H00159_SN2_081804,H00159_SN2_081804_32021,H00159_SN2,test,140
119357,IN,H00702,SN2,101910,32847,val_imgs/IN_H00702_SN2_101910_32847.png,H00702_SN2,H00702_SN2_101910,H00702_SN2_101910_32847,H00702_SN2,test,678
119358,OUT,H00203,SN4,082003,14157,val_imgs/OUT_H00203_SN4_082003_14157.png,H00203_SN4,H00203_SN4_082003,H00203_SN4_082003_14157,H00203_SN4,test,184
119359,IN,H00752,SN1,102407,14345,val_imgs/IN_H00752_SN1_102407_14345.png,H00752_SN1,H00752_SN1_102407,H00752_SN1_102407_14345,H00752_SN1,test,728


In [48]:
df_total.to_csv('ai_hub_reid.csv', index=False)
def to_label_txt(df, phase):
    txt = df_total.query('PHASE == @phase')
    txt = txt[['NAME', 'KLS_IDX']]
    
    file_name = f'img_list_{phase}.txt'
    txt.to_csv(file_name, index=False)
    return file_name 
    
txt_train = to_label_txt(df_total, 'train')
txt_val = to_label_txt(df_total, 'val')
txt_test = to_label_txt(df_total, 'test')

In [49]:
import cv2
from tqdm.auto import tqdm

df = pd.read_csv(txt_train)
imgs = df.to_dict('records')
for d in tqdm(imgs):
    try: 
        im = cv2.imread(d['NAME'])
    except Exception as e:
        print(err, e)
    

  0%|          | 0/120359 [00:00<?, ?it/s]

In [None]:
import cv2
from PIL import Image
import numpy as np

item = df_total.sample(n=1).iloc[0]
path = item['NAME']
im = Image.open(path)

def rect_pad(pil_image):
    import numpy as np
    from PIL import Image
    
    im = np.array(pil_image) 
    h, w = im.shape[:2]
    size = max(h, w)
    pad_h, pad_w = size-h, size -w
    im = np.pad(im, 
           ((pad_h//2, pad_h-pad_h//2), (pad_w//2, pad_w - pad_w//2), (0,0)), 
           mode='constant', constant_values=0)
    return Image.fromarray(im)
rect_pad(im)

In [None]:
df_train = pd.read_csv('')