In [1]:
import os
import cv2
import glob
import shutil
import numpy as np
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt
import xml.etree.ElementTree as ET

def getImagesInDir(dir_path):
    """Return the image files that sit directly inside ``dir_path``.

    Only files whose suffix is one of the lower-case extensions listed in
    ``img_formats`` are returned; sub-directories are not searched.

    Args:
        dir_path: Directory to scan. A trailing slash is accepted.

    Returns:
        A sorted list of matching paths. The list is empty when the
        directory does not exist or holds no matching file.
    """
    img_formats = ['bmp', 'jpg', 'jpeg', 'png', 'tif', 'tiff', 'dng']  # acceptable image suffixes
    # Escape the directory part so that characters such as '[' , '*' or '?'
    # in a folder name are matched literally instead of acting as wildcards.
    pattern_root = glob.escape(dir_path)
    image_list = []
    for img_format in img_formats:
        image_list.extend(glob.glob(pattern_root + f'/*.{img_format}'))

    # glob gives no ordering guarantee; sort so every run sees the same sequence.
    return sorted(image_list)

In [4]:
# Object name(s) accepted by the annotation filter, and the dataset folders.
classes = ['pedestrian']
img_dir = 'pedestrian_datasets/JPEGImages/'
ann_dir = 'pedestrian_datasets/Annotations/'
out_dir = 'reid_train_all'

# Every supported image found directly under img_dir.
image_paths = getImagesInDir(img_dir)

# Start from an empty output folder: remove whatever an earlier run left
# behind, then create it again.
if os.path.exists(out_dir):
    shutil.rmtree(out_dir)
os.makedirs(out_dir)

In [5]:
# For every image, crop each non-difficult box whose name is in `classes`
# and save the crop under out_dir/<reid>/.
for img_path in tqdm(image_paths, total=len(image_paths)):
    # read image
    img = cv2.imread(img_path)

    basename = os.path.basename(img_path)
    basename_no_ext = os.path.splitext(basename)[0]

    # Give ET.parse the path so it opens and closes the annotation itself;
    # a bare open() here left one file handle unclosed per image.
    root = ET.parse(os.path.join(ann_dir, basename_no_ext + '.xml')).getroot()

    cnt = 0  # running index of the crops taken from this image (file-name suffix)
    for obj in root.iter('object'):
        difficult = obj.find('difficult').text
        cls = obj.find('name').text
        if cls not in classes or int(difficult) == 1:
            continue
        xmlbox = obj.find('bndbox')
        # b is ordered (xmin, xmax, ymin, ymax)
        b = (float(xmlbox.find('xmin').text), float(xmlbox.find('xmax').text), float(xmlbox.find('ymin').text), float(xmlbox.find('ymax').text))
        reid = obj.find('reid').text[:-2]  # tag text minus its last two characters
        # rows are indexed by y, columns by x
        crop_img = img[int(b[2]):int(b[3]), int(b[0]):int(b[1]), :]
        out_reid_dir = os.path.join(out_dir, reid)
        os.makedirs(out_reid_dir, exist_ok=True)
        cv2.imwrite(os.path.join(out_reid_dir, basename_no_ext+'_'+str(cnt))+'.jpg', crop_img)
        cnt += 1

100%|██████████| 7121/7121 [02:05<00:00, 56.52it/s]


In [6]:
# Index the files found under out_dir: one row per file, labelled with the
# name of the folder that contains it.
x_data_list = []
y_data_list = []
for roots, _, files in os.walk(out_dir):
    # os.path.basename follows the platform's path separator, whereas
    # splitting on a literal "/" yields the whole path where it is "\".
    class_name = os.path.basename(roots)
    for each in files:
        if 'checkpoint' not in each:  # skip files with "checkpoint" in their name
            x_data_list.append(os.path.join(class_name, each))
            y_data_list.append(class_name)
data = pd.DataFrame({'file':x_data_list, 'class':y_data_list})
data.head()

Unnamed: 0,file,class
0,870/c5s2_121174_3.jpg,870
1,870/c6s2_114318_3.jpg,870
2,870/c6s2_114168_3.jpg,870
3,870/c6s2_113493_5.jpg,870
4,870/c6s2_114043_2.jpg,870


In [12]:
# Files per class in ascending order; show the 100 largest classes.
data.groupby('class').count().sort_values(by='file').tail(100)

Unnamed: 0_level_0,file
class,Unnamed: 1_level_1
296,41
166,41
265,41
216,42
545,42
...,...
155,121
88,135
1,388
233,432


In [27]:
# Labels of the 101 classes with the most files (ascending by file count).
crops_per_class = data.groupby('class').count().sort_values(by='file')
reids = crops_per_class.tail(101).index

In [28]:
# Display the Index of class labels held in reids.
reids

Index(['188', '296', '166', '265', '216', '545', '659', '84', '191', '516',
       ...
       '173', '192', '193', '880', '302', '155', '88', '1', '233', '3'],
      dtype='object', name='class', length=101)

In [29]:
# Object name(s) accepted by the annotation filter, and the source folders.
classes = ['pedestrian']
img_dir = 'pedestrian_datasets/JPEGImages/'
ann_dir = 'pedestrian_datasets/Annotations/'
image_paths = getImagesInDir(img_dir)

# Wipe any previous output tree before rebuilding it.
out = 'datasets'
if os.path.exists(out):
    shutil.rmtree(out)

# Sub-folders of the output tree.
out_dir = os.path.join(out, 'reid')
out_img_dir = os.path.join(out, 'JPEGImages')
out_ann_dir = os.path.join(out, 'Annotations')
out_txt_dir = os.path.join(out, 'ImageSets/Main')
for new_dir in (out_dir, out_img_dir, out_ann_dir, out_txt_dir):
    os.makedirs(new_dir)

# Text file that will list the base name of every kept image.
out_txt_file = os.path.join(out, 'ImageSets/Main/default.txt')

In [30]:
# Build the filtered dataset: keep an image only when every retained box
# carries a reid listed in `reids`, then copy the image and its annotation,
# record its base name in out_txt_file, and save one crop per box.
if os.path.exists(out_txt_file):
    os.remove(out_txt_file)  # the list is appended to below, so start clean
for img_path in tqdm(image_paths, total=len(image_paths)):
    # read image
    img = cv2.imread(img_path)

    basename = os.path.basename(img_path)
    basename_no_ext = os.path.splitext(basename)[0]
    ann_path = os.path.join(ann_dir, basename_no_ext + '.xml')

    # Give ET.parse the path so it opens and closes the annotation itself;
    # a bare open() here left one file handle unclosed per image.
    root = ET.parse(ann_path).getroot()

    bndbox_all = []
    reid_all = []
    for obj in root.iter('object'):
        difficult = obj.find('difficult').text
        cls = obj.find('name').text
        if cls not in classes or int(difficult) == 1:
            continue
        xmlbox = obj.find('bndbox')
        # b is ordered (xmin, xmax, ymin, ymax)
        b = (float(xmlbox.find('xmin').text), float(xmlbox.find('xmax').text), float(xmlbox.find('ymin').text), float(xmlbox.find('ymax').text))
        bndbox_all.append(b)
        reid = obj.find('reid').text[:-2]  # tag text minus its last two characters
        reid_all.append(reid)

    # Skip the image as soon as one retained box has a reid outside `reids`.
    # An image with no retained box passes this check and is still copied.
    if not all(reid in reids for reid in reid_all):
        continue

    # copy the image and its annotation
    shutil.copy(img_path, os.path.join(out_img_dir, basename))
    shutil.copy(ann_path, os.path.join(out_ann_dir, basename_no_ext)+'.xml')
    # append the image's base name to the txt list
    with open(out_txt_file, 'a') as f:
        f.write(basename_no_ext+'\n')

    # cnt numbers the crops of this image and becomes the file-name suffix
    for cnt, (b, reid) in enumerate(zip(bndbox_all, reid_all)):
        # rows are indexed by y, columns by x
        crop_img = img[int(b[2]):int(b[3]), int(b[0]):int(b[1]), :]
        out_reid_dir = os.path.join(out_dir, reid)
        os.makedirs(out_reid_dir, exist_ok=True)
        cv2.imwrite(os.path.join(out_reid_dir, basename_no_ext+'_'+str(cnt))+'.jpg', crop_img)

100%|██████████| 7121/7121 [01:53<00:00, 62.66it/s]


In [31]:
# Index the files found under out_dir: one row per file, labelled with the
# name of the folder that contains it.
x_data_list = []
y_data_list = []
for roots, _, files in os.walk(out_dir):
    # os.path.basename follows the platform's path separator, whereas
    # splitting on a literal "/" yields the whole path where it is "\".
    class_name = os.path.basename(roots)
    for each in files:
        if 'checkpoint' not in each:  # skip files with "checkpoint" in their name
            x_data_list.append(os.path.join(class_name, each))
            y_data_list.append(class_name)
data_new = pd.DataFrame({'file':x_data_list, 'class':y_data_list})
data_new.head()

Unnamed: 0,file,class
0,624/c6s2_058318_4.jpg,624
1,624/c5s2_049580_3.jpg,624
2,624/c5s2_070777_4.jpg,624
3,624/c4s4_050685_1.jpg,624
4,624/c6s2_058393_1.jpg,624


In [32]:
# Files per class in data_new, smallest classes first.
data_new.groupby('class').count().sort_values(by='file')

Unnamed: 0_level_0,file
class,Unnamed: 1_level_1
285,1
714,2
653,2
618,2
656,3
...,...
175,51
124,53
173,60
155,62
