In [1]:
import os
import fnmatch
import concurrent.futures
import traceback
import random
import time
import threading

import pandas as pd
from tqdm import tqdm
from pathlib import Path
from torchvision import transforms
from PIL import Image


In [2]:
# Root of the dataset tree, relative to the notebook's working directory.
DATASET_DIR = '../../../dataset'
# Source tree that is walked for the original JPEGs.
MIMIC_JPG_DIR = DATASET_DIR + '/mimic-cxr-jpg-2.0.0'
# Destination tree; resized copies keep the same relative layout as the source.
MIMIC_JPG_RESIZE_OUTPUT_DIR = MIMIC_JPG_DIR + '-resized'

In [3]:
def resize(jpg_path, pbar=None, max_side=256):
    """Resize one JPEG so its longer side is ``max_side`` pixels and save the copy.

    The output path is ``jpg_path`` with ``MIMIC_JPG_DIR`` replaced by
    ``MIMIC_JPG_RESIZE_OUTPUT_DIR``; missing parent directories are created.
    The aspect ratio is preserved using integer (floor) division for the
    shorter side.

    Args:
        jpg_path: Path (str) of the source image; expected to contain
            ``MIMIC_JPG_DIR``.
        pbar: Optional progress bar; ``pbar.update(1)`` is called after the
            resized image has been saved.
        max_side: Target length in pixels of the longer image side.

    Raises:
        ValueError: If the computed output path equals ``jpg_path``, which
            would overwrite the source image.
    """
    out_path = jpg_path.replace(MIMIC_JPG_DIR, MIMIC_JPG_RESIZE_OUTPUT_DIR)
    if out_path == jpg_path:
        # The substitution did nothing, so saving would clobber the original.
        raise ValueError(
            f'{jpg_path!r} is not under {MIMIC_JPG_DIR!r}; refusing to overwrite the source image')

    # The context manager releases the file handle even if resizing or saving
    # fails, which matters when many images are processed by a thread pool.
    with Image.open(jpg_path) as img:
        Path(out_path).parent.mkdir(parents=True, exist_ok=True)
        w, h = img.size
        # The longer side becomes max_side; the shorter side is scaled
        # proportionally and clamped to 1 so very elongated images do not
        # end up with a zero-sized dimension.
        if w >= h:
            new_w, new_h = max_side, max(1, (max_side * h) // w)
        else:
            new_w, new_h = max(1, (max_side * w) // h), max_side
        # torchvision's Resize takes (height, width), the reverse of PIL's
        # (width, height) ordering used by img.size.
        resized = transforms.Resize((new_h, new_w))(img)
        resized.save(out_path, 'JPEG')

    if pbar is not None:
        pbar.update(1)


In [5]:
# Default number of worker threads used by resize_all_images().
MaxWorkers = 16


def resize_all_images(basedir=None, total=377024, max_workers=None):
    """Resize every ``*.jpg`` found under ``basedir`` using a thread pool.

    Each file is handed to ``resize`` together with a shared progress bar.
    Exceptions raised inside worker threads are collected from the futures and
    reported, instead of being silently discarded with the future objects.

    Args:
        basedir: Directory tree to walk. Defaults to ``MIMIC_JPG_DIR``.
        total: Expected number of images, used only to size the progress bar.
        max_workers: Thread pool size. Defaults to ``MaxWorkers``.
    """
    if basedir is None:
        basedir = MIMIC_JPG_DIR
    if max_workers is None:
        max_workers = MaxWorkers

    jpg_gen = (os.path.join(root, filename)
               for root, _dirs, files in os.walk(basedir)
               for filename in fnmatch.filter(files, '*.jpg'))

    failures = 0
    # Both context managers guarantee cleanup (bar closed, pool joined) even
    # if walking the tree or submitting work raises.
    with tqdm(total=total, ncols=120) as pbar:
        with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
            # Keep the futures so that worker exceptions can be inspected.
            future_to_path = {executor.submit(resize, path, pbar): path
                              for path in jpg_gen}
            for future in concurrent.futures.as_completed(future_to_path):
                exc = future.exception()
                if exc is not None:
                    failures += 1
                    # pbar.write prints without corrupting the progress bar.
                    pbar.write(f'resize failed for {future_to_path[future]}: {exc!r}')

    if failures:
        print(f'resize_all_images: {failures} image(s) failed')
    print('resize_all_images: DONE')


In [6]:
%%time
resize_all_images()

377024it [6:01:29, 17.38it/s]                                                                                           

resize_all_images: DONE
CPU times: user 14h 20min 39s, sys: 59min 12s, total: 15h 19min 52s
Wall time: 6h 1min 29s





In [19]:
def generate_mini_list(mini_dir=None, out_csv=None):
    """Collect the dicom ids of all JPEGs in the mini dataset and save them as CSV.

    The dicom id of an image is its file name without the extension. Only the
    final extension is stripped, so a name that happens to contain ``.jpg``
    elsewhere is left intact.

    Args:
        mini_dir: Directory tree to walk for ``*.jpg`` files. Defaults to
            ``{DATASET_DIR}/mini/mimic-cxr-jpg-2.0.0-resized``.
        out_csv: Destination CSV path. Defaults to
            ``{DATASET_DIR}/processed/mimic-cxr-2.0.0-mini-dicoms.csv``.
            Missing parent directories are created.

    Returns:
        A DataFrame with a single ``dicom_id`` column, one row per image.
    """
    if mini_dir is None:
        mini_dir = f'{DATASET_DIR}/mini/mimic-cxr-jpg-2.0.0-resized'
    if out_csv is None:
        out_csv = f'{DATASET_DIR}/processed/mimic-cxr-2.0.0-mini-dicoms.csv'

    dicom_ids = [Path(filename).stem
                 for _root, _dirs, files in os.walk(mini_dir)
                 for filename in fnmatch.filter(files, '*.jpg')]
    df = pd.DataFrame({'dicom_id': dicom_ids})

    # Make sure the output directory exists before writing.
    Path(out_csv).parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(out_csv)
    return df
