In [2]:
import os
import pandas as pd

In [3]:
data_dir = '/mnt/data-c314/asekuboyina/mimic_cxr/physionet.org/files/mimic-cxr-jpg/2.0.0'

sub_dirs = [os.path.join(data_dir + '/files', x) for x in os.listdir(data_dir + '/files')]
sub_dirs = [x for x in sub_dirs if os.path.isdir(x)]  # filter out 'index.html'

patient_dirs = []
patient_ids  = []

for sub_dir in sub_dirs:
    
    temp_dirs = os.listdir(sub_dir)
    
    for x in temp_dirs:
        
        if '.html' not in x:
            
            patient_dirs.append(os.path.join(sub_dir, x))
            patient_ids.append(x)
    
print('# patients:', len(patient_ids))

# patients: 65379


In [4]:
"""dict structure:
patient_id
|
 --- path:
|
 --- studies: {study_id: {
                          images: {
                                    dicom_id: {
                                                path:
                                                view:
                                              } 
                                    
                                  } 
                          labels:
                          report_path:  
                          }}
"""

'dict structure:\npatient_id\n|\n --- path:\n|\n --- studies: {study_id: {\n                          images: {\n                                    dicom_id: {\n                                                path:\n                                                view:\n                                              } \n                                    \n                                  } \n                          labels:\n                          report_path:  \n                          }}\n'

In [5]:
# -- metadata
metadata_file = os.path.join(data_dir, 'mimic-cxr-2.0.0-metadata.csv')
df_metadata   = pd.read_csv(metadata_file)

# -- chexpert labels
labels_file_chexpert = os.path.join(data_dir, 'mimic-cxr-2.0.0-chexpert.csv')
df_chexpert_labels   = pd.read_csv(labels_file_chexpert)
label_keys           = list(df_chexpert_labels.columns[2:])
print('LABEL_KEYS:', label_keys)

# -- report paths
reports_dir  = os.path.join(data_dir, 'mimic-cxr-reports')
report_paths = []
report_ids   = []

for root, dirs, files in os.walk(reports_dir):
    for file in files:
        if file.endswith(".txt"):
            report_paths.append(os.path.join(root, file))
            report_ids.append(os.path.splitext(file)[0])

LABEL_KEYS: ['Atelectasis', 'Cardiomegaly', 'Consolidation', 'Edema', 'Enlarged Cardiomediastinum', 'Fracture', 'Lung Lesion', 'Lung Opacity', 'No Finding', 'Pleural Effusion', 'Pleural Other', 'Pneumonia', 'Pneumothorax', 'Support Devices']


In [6]:
mimic_dict = {}

In [7]:
def get_images_dict(study_dir):
    
    im_paths = [os.path.join(study_dir, x) for x in os.listdir(study_dir) if x.endswith('.jpg')]
    
    images_dict = {}
    
    for im_path in im_paths:
        
        dicom_id = os.path.splitext(os.path.basename(im_path))[0]
        
        im_view  = df_metadata.loc[df_metadata['dicom_id'] ==dicom_id]['ViewPosition'].values[0]
                
        images_dict[dicom_id] = {
                                    'path': im_path,
                                    'view': im_view
                                }
        
    return images_dict


def get_labels(patient_id, study_id):
    df_temp_patient = df_chexpert_labels.loc[df_chexpert_labels['subject_id'] == int(patient_id[1:])] 
    labels = df_temp_patient.loc[df_temp_patient['study_id'] == int(study_id[1:])].values[0][2:] # remove 's' from study id

    return labels

def get_report_path(study_id):
    
    index = report_ids.index(study_id)
    
    return report_paths[index]

In [None]:
error_ids = []

for idx, (patient_id, patient_dir) in enumerate(zip(patient_ids, patient_dirs)):
    
    # path
    mimic_dict[patient_id] = {'path': patient_dir}
    
    try:
        # study dict
        study_ids = os.listdir(patient_dir)

        studies_dict = {}

        for study_id in study_ids:
            study_dir = os.path.join(patient_dir, study_id)
            if os.path.isdir(study_dir):
                studies_dict[study_id] = {}

                studies_dict[study_id]['images'] = get_images_dict(study_dir)
                studies_dict[study_id]['labels'] = get_labels(patient_id, study_id)
                studies_dict[study_id]['report_path'] = get_report_path(study_id) 

        mimic_dict[patient_id]['studies'] = studies_dict
        
    except Exception as e:
        print('--- {} ---'.format(idx))
        print(patient_id)
        print(e)
        error_ids.append(patient_id)

--- 5009 ---
p17603668
index 0 is out of bounds for axis 0 with size 0


In [14]:
count = 0
for key in mimic_dict.keys():
    if 'studies' not in mimic_dict[key].keys():
        count += 1
        
        
print(count)

8


In [9]:
list(mimic_dict.keys())[42]
list(mimic_dict.values())[42]

{'path': '/mnt/data-c314/asekuboyina/mimic_cxr/physionet.org/files/mimic-cxr-jpg/2.0.0/files/p17/p17801051',
 'studies': {'s59599946': {'images': {'72b904df-688f1711-e4916b95-c16ea3a7-0ff7585f': {'path': '/mnt/data-c314/asekuboyina/mimic_cxr/physionet.org/files/mimic-cxr-jpg/2.0.0/files/p17/p17801051/s59599946/72b904df-688f1711-e4916b95-c16ea3a7-0ff7585f.jpg',
     'view': 'AP'}},
   'labels': array([nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,  1., nan,
          nan]),
   'report_path': '/mnt/data-c314/asekuboyina/mimic_cxr/physionet.org/files/mimic-cxr-jpg/2.0.0/mimic-cxr-reports/p17/p17801051/s59599946.txt'},
  's57585076': {'images': {'4b34b80b-e4ee6ca3-ff7858ee-f746656e-73552c59': {'path': '/mnt/data-c314/asekuboyina/mimic_cxr/physionet.org/files/mimic-cxr-jpg/2.0.0/files/p17/p17801051/s57585076/4b34b80b-e4ee6ca3-ff7858ee-f746656e-73552c59.jpg',
     'view': 'AP'}},
   'labels': array([nan,  0., nan,  1., nan, nan, nan,  1., nan,  1., nan,  1.,  0.,
           1.]),
   

In [None]:
with open('mimic_dict.pickle', 'wb') as handle:
    pickle.dump(a, handle, protocol=pickle.HIGHEST_PROTOCOL)