# Check Image Integrity

This piece of code checks that every image in the dataset is readable and not corrupted. It uses `skimage.io.imread` to load each image; if an image cannot be read, its path is printed and added to a list of invalid files.

In [22]:
import os
import os.path as osp
import cv2
import tqdm
from multiprocessing import Pool
from skimage import io

def verify_image(img_file):
    """Return whether ``img_file`` can be loaded by ``skimage.io.imread``.

    Args:
        img_file: Path of the image file to test.

    Returns:
        bool: True if the read completes, False if it raises an exception.
    """
    try:
        io.imread(img_file)
    except Exception:
        # Any read/decode failure marks the file as invalid.  KeyboardInterrupt
        # and SystemExit are deliberately NOT caught, so an interrupted worker
        # stops instead of reporting a healthy image as broken.
        return False
    return True


def check_dir(cur_image_dir):
    """Scan one directory and collect the entries that fail ``verify_image``.

    Entries are visited in sorted name order, and each failing path is
    printed as soon as it is found.

    Args:
        cur_image_dir: Directory whose direct entries are checked.

    Returns:
        list: Paths (``cur_image_dir`` joined with the entry name) for which
        ``verify_image`` returned False.
    """
    bad_paths = []
    for entry in sorted(os.listdir(cur_image_dir)):
        candidate = osp.join(cur_image_dir, entry)
        if verify_image(candidate):
            continue
        print(candidate)
        bad_paths.append(candidate)
    return bad_paths

# Root folder of the dataset; the sub-tree scanned below lives under it.
data_root = '/data/PanoHeadData/multi_view_hq'

# Exactly one sub-tree is scanned per run: uncomment the one to check.
# image_dir = osp.join(data_root, 'align_images')
# image_dir = osp.join(data_root, 'align_parsing')
# image_dir = osp.join(data_root, 'head_images')
image_dir = osp.join(data_root, 'head_parsing')

# One work item per entry found directly under image_dir.
image_dir_list = sorted(os.listdir(image_dir))
check_dir_list = [osp.join(image_dir, name) for name in tqdm.tqdm(image_dir_list)]

# Run check_dir on every directory in a process pool, with a progress bar.
with Pool(processes=128) as pool:
    progress = tqdm.tqdm(pool.imap(check_dir, check_dir_list), total=len(check_dir_list))
    mp_results = list(progress)

# Flatten the per-directory lists into one list of invalid file paths.
all_invalid_files = [path for res in mp_results for path in res]

print(len(all_invalid_files))
print(all_invalid_files)

100%|██████████| 9376/9376 [00:00<00:00, 890852.53it/s]
100%|██████████| 9376/9376 [00:34<00:00, 273.74it/s]


0
[]


In [4]:
# additional step: remove every invalid sample from all four sub-trees
# ('align_images' / 'head_images' as .jpg, 'align_parsing' / 'head_parsing' as .png)
for i in all_invalid_files:
    base_dir = osp.basename(osp.dirname(i))
    # splitext drops only the final extension, so a stem containing dots is kept intact.
    base_file = osp.splitext(osp.basename(i))[0]
    print(f'{base_dir} {base_file}')
    ai_path = osp.join(data_root, 'align_images', base_dir, base_file+'.jpg')
    ap_path = osp.join(data_root, 'align_parsing', base_dir, base_file+'.png')
    hi_path = osp.join(data_root, 'head_images', base_dir, base_file+'.jpg')
    hp_path = osp.join(data_root, 'head_parsing', base_dir, base_file+'.png')
    sample_paths = (ai_path, ap_path, hi_path, hp_path)
    # Check all four files BEFORE deleting any of them, so a sample is never
    # left half-removed.  An explicit raise is used instead of `assert`
    # because assert statements are skipped when Python runs with -O.
    for sample_path in sample_paths:
        if not osp.exists(sample_path):
            raise FileNotFoundError(sample_path)
    for sample_path in sample_paths:
        os.remove(sample_path)

00027 img00027278


In [5]:
# additional step: remove the entries of all invalid files from dataset.json and dataset_thresh_0.3.json
import json

# --- dataset.json: load, drop the entries of invalid files, write back ---
dataset_json_path = osp.join(data_root, 'dataset.json')
with open(dataset_json_path, 'r') as f:
    d0 = json.load(f)

for bad_path in all_invalid_files:
    # Key = path relative to image_dir, with '.jpg' replaced by '.png'.
    k = osp.relpath(bad_path, image_dir).replace('.jpg', '.png')
    if k not in d0:
        continue
    print(k)
    del d0[k]

with open(dataset_json_path, 'w') as f:
    json.dump(d0, f, indent=4)

# --- dataset_thresh_0.3.json: same clean-up on the second index file ---
thresh_json_path = osp.join(data_root, 'dataset_thresh_0.3.json')
with open(thresh_json_path, 'r') as f:
    d2 = json.load(f)

for bad_path in all_invalid_files:
    k = osp.relpath(bad_path, image_dir).replace('.jpg', '.png')
    if k not in d2:
        continue
    print(k)
    del d2[k]

with open(thresh_json_path, 'w') as f:
    json.dump(d2, f, indent=4)


00027/img00027278.png
00027/img00027278.png
