In [1]:
import pandas as pd
import numpy as np
import os, sys, datetime, random, math
import pydicom as dicom
import cv2
from pathlib import Path
import pydicom as dicom
from pydicom.fileset import FileSet
import matplotlib.pyplot as plt
from PIL import Image

# Avoid duplicates of patients: shared registry of patients whose image has
# already been written. The RSNA cell further down stores
# patient id -> [patient id] here and skips any id that is already present.
patient_imgpath = {}

def save_split(save_file, train):
    """Export a dataset split to a text file, one sample per line.

    Each line holds the sample's fields separated by a single space:
    ``patientid filename label`` plus a fourth field when the sample has
    exactly four entries.

    Inputs:
      save_file: path to the file that will be created (an existing file is
        overwritten).
      train: iterable of samples. Each sample is an indexable sequence with at
        least three fields; samples whose length is not 4 are written using
        only their first three fields.

    Raises:
      IndexError: if a sample has fewer than three fields.
    """
    # The context manager guarantees the handle is closed even when a
    # malformed sample raises part-way through the export.
    with open(save_file, 'w') as split_file:
        for sample in train:
            if len(sample) == 4:
                fields = (sample[0], sample[1], sample[2], sample[3])
            else:
                fields = (sample[0], sample[1], sample[2])
            # Stringify every field (not only the patient id) so numeric
            # labels or filenames do not raise TypeError on concatenation.
            split_file.write(' '.join(str(field) for field in fields) + '\n')

# QEH Dataset

In [2]:
# Transform QEH dataset CXRs from DICOMs into pngs.
# For every patient listed in each DICOMDIR, keep only the CR/DX instance(s)
# with the earliest acquisition date+time, record their metadata, and
# (optionally) export the pixels as an 8-bit PNG named after the SOPInstanceUID.
flag_save = True
# save path
# Defined and created unconditionally: metadata.csv is written into this
# directory at the end of the cell even when flag_save is False.
path_to_save_images = Path(os.path.join(".","QEH_Earliest_CXR_per_patient"))
path_to_save_images.mkdir(parents=True, exist_ok=True)

# dicom path
directory_dicom = "D:/data/QEH_COVID19_DATASET/DICOM/"
details = {"PatientID":[], "PatientSex":[], "PatientAge":[], "SOPInstanceUID":[], "AcquisitionDate":[],
           "Modality":[], "XRayTubeCurrent":[], "ExposureTime":[], "KVP":[], "XPixelSpacing":[], "YPixelSpacing":[], "ViewPosition":[]}
# Optional DICOM tags, so not all DICOM images will have them.
# A missing tag is recorded as the string "None".
optional_tags = ("ViewPosition", "XRayTubeCurrent", "ExposureTime", "KVP")
counter = 0  # number of instances recorded in `details`
for root, dirs, files in os.walk(directory_dicom):
    for name in files:
        if name != "DICOMDIR":
            continue
        filepath = os.path.join(root, name)
        fs = FileSet(dicom.dcmread(filepath))

        # Find each patient and modality in the DICOMDIR
        patient_ids = fs.find_values("PatientID")
        modalities = fs.find_values("Modality")
        available_modalities = set(("CR", "DX")).intersection(set(modalities))
        if len(available_modalities) == 0:
            print("None of the specified modalities are available in this DICOMDIR.")
            continue

        for patient_id in patient_ids:
            # Narrow down the datasets in the DICOMDIR that fit patient_id and modality
            instances = []
            for modality in available_modalities:
                instances += fs.find(load=False, PatientID=patient_id, Modality=modality)

            # This guard must run before min() below; otherwise an empty list
            # surfaces as an unrelated "min() arg is an empty sequence" error.
            if len(instances) == 0:
                raise RuntimeError(
                    "No CR/DX instance found for a patient listed in {}".format(filepath))

            # Find the acquisition timestamp (date + time, as a float) of each instance
            acq_times = []
            for instance in instances:
                ds = instance.load()
                acq_times.append(float(ds.AcquisitionDate + ds.AcquisitionTime))
            # Select the lowest acquisition date
            min_acqDate = min(acq_times)

            for instance, acq_time in zip(instances, acq_times):
                # Only the earliest acquisition(s) are kept; the others are
                # skipped without being loaded a second time.
                if acq_time != min_acqDate:
                    continue
                ds = instance.load()

                # List to link SOPInstanceUID with patient
                details["PatientID"].append(ds.PatientID)
                details["PatientSex"].append(ds.PatientSex)
                details["PatientAge"].append(ds.PatientAge)
                details["AcquisitionDate"].append(ds.AcquisitionDate)
                details["SOPInstanceUID"].append(ds.SOPInstanceUID)

                # getattr with a default only covers a missing tag; any other
                # error while reading a tag is no longer silently swallowed.
                for tag in optional_tags:
                    details[tag].append(getattr(ds, tag, "None"))
                details["Modality"].append(ds.Modality)
                details["XPixelSpacing"].append(ds.ImagerPixelSpacing[0])
                details["YPixelSpacing"].append(ds.ImagerPixelSpacing[1])

                # Saving as png, needs to convert to uint8
                if flag_save:
                    image_filename = str(ds.SOPInstanceUID) + ".png"
                    image = ds.pixel_array.astype(np.float64)
                    lowest = image.min()
                    value_range = image.max() - lowest
                    # Rescale to 0-255 and convert to uint8. Dividing by the
                    # value range (not the maximum) maps min -> 0 and max -> 255;
                    # a constant image has no range and is written as all zeros
                    # instead of dividing by zero.
                    if value_range > 0:
                        rescaled = ((image - lowest) * (255.0 / value_range)).astype(np.uint8)
                    else:
                        rescaled = np.zeros(image.shape, dtype=np.uint8)
                    im = Image.fromarray(rescaled)
                    im.save(os.path.join(path_to_save_images, image_filename))
                counter += 1

print(counter)
if flag_save:
    print("Completed Image Saving")

# One metadata row per exported instance.
df = pd.DataFrame(details)
df.to_csv(os.path.join(path_to_save_images,"metadata.csv"))

524
Completed Image Saving


In [3]:
# Sanity check: show every metadata row whose SOPInstanceUID occurs more than
# once (keep=False flags all occurrences, not just the repeats).
is_repeated_uid = df.duplicated(subset=["SOPInstanceUID"], keep=False)
print(df[is_repeated_uid])

524
524
Empty DataFrame
Columns: [PatientID, PatientSex, PatientAge, SOPInstanceUID, AcquisitionDate, Modality, XRayTubeCurrent, ExposureTime, KVP, XPixelSpacing, YPixelSpacing, ViewPosition]
Index: []


In [4]:
# Debug aid: dump the header of whichever DICOM dataset `ds` is currently bound to.
print(ds)
# `STOP` is not defined anywhere in this notebook, so evaluating it raises
# NameError (visible in this cell's output). Presumably a deliberate way to
# halt "Run All" before the dataset-building cells below -- TODO confirm.
STOP

Dataset.file_meta -------------------------------
(0002, 0000) File Meta Information Group Length  UL: 234
(0002, 0001) File Meta Information Version       OB: b'\x00\x01'
(0002, 0002) Media Storage SOP Class UID         UI: Digital X-Ray Image Storage - For Presentation
(0002, 0003) Media Storage SOP Instance UID      UI: 1.2.392.200036.9125.4.0.1679814657.2751608786.1061514505
(0002, 0010) Transfer Syntax UID                 UI: JPEG Lossless, Non-Hierarchical, First-Order Prediction (Process 14 [Selection Value 1])
(0002, 0012) Implementation Class UID            UI: 1.2.276.0.7238010.5.0.3.5.4
(0002, 0013) Implementation Version Name         SH: 'OSIRIX'
(0002, 0016) Source Application Entity Title     AE: 'QEHWFM01'
(0002, 0100) Private Information Creator UID     UI: DRI-Mac-mini-04
-------------------------------------------------
(0008, 0005) Specific Character Set              CS: 'ISO_IR 100'
(0008, 0008) Image Type                          CS: ['ORIGINAL', 'PRIMARY', '', 'RT

NameError: name 'STOP' is not defined

## RSNA (Normal and Pneumonia)

In [None]:
# path to ricord covid-19 images created by create_ricord_dataset/create_ricord_dataset.ipynb
# run create_ricord_dataset.ipynb before this notebook
ricord_imgpath = 'create_ricord_dataset/ricord_images'
ricord_txt = 'create_ricord_dataset/ricord_data_set.txt'

# RSNA numbers 8851 unique normal CXRs and 6012 unique pneumonia CXRs
# path to https://www.kaggle.com/c/rsna-pneumonia-detection-challenge
rsna_datapath = 'D:/data/RSNA_Pnemonia_Challenge'
# get all the normal from here
rsna_csvname = 'stage_2_detailed_class_info.csv'
# get all the 1s from here since 1 indicates pneumonia
# found that images that aren't pneumonia and also not normal are classified as 0s
rsna_csvname2 = 'stage_2_train_labels.csv'
rsna_imgpath = 'stage_2_train_images'

# add normal and rest of pneumonia cases from https://www.kaggle.com/c/rsna-pneumonia-detection-challenge
savepath = 'rsna_data'
num_test = 100 # num of each class
num_val = 100 # num of each class

# In-memory parameters: rows are [patient, image filename, label, source]
train = []
validation = []
test = []
test_count = {'normal': 0, 'pneumonia': 0, 'COVID-19': 0}
train_count = {'normal': 0, 'pneumonia': 0, 'COVID-19': 0}
validation_count = {'normal': 0, 'pneumonia': 0, 'COVID-19': 0}

csv_normal = pd.read_csv(os.path.join(rsna_datapath, rsna_csvname), nrows=None)
csv_pneu = pd.read_csv(os.path.join(rsna_datapath, rsna_csvname2), nrows=None)
# One entry per matching CSV row, so an id can appear more than once here;
# duplicates are removed per class before sampling below.
patients = {
    'normal': csv_normal.loc[csv_normal['class'] == 'Normal', 'patientId'].tolist(),
    'pneumonia': csv_pneu.loc[csv_pneu['Target'].astype(int) == 1, 'patientId'].tolist(),
}

# Create the output tree up front: the image writes below target
# <savepath>/<class>/<split>/ and nothing else in this cell creates it.
for key in patients:
    for split_name in ('train', 'validation', 'test'):
        os.makedirs(os.path.join(savepath, key, split_name), exist_ok=True)

for key in patients.keys(): # For "normal" and "pneumonia"
    # De-duplicate while keeping CSV order, so a patient cannot be drawn twice
    # and the sampling population has a stable order.
    arr = np.array(list(dict.fromkeys(patients[key]))) # numpy array of strings
    if arr.size == 0:
        continue

    # find a random set of patients
    # NOTE(review): no random seed is set in this cell, so the split differs
    # between runs; the chosen ids are persisted to .npy files below.
    test_patients = random.sample(list(arr), num_test)
    test_set = set(test_patients)
    # remove this set from consideration (ordered, unlike a set difference)
    train_val_arr = np.array([patient for patient in arr if patient not in test_set])
    val_patients = random.sample(list(train_val_arr), num_val)
    val_set = set(val_patients)

    checkNoIntersection = val_set.intersection(test_set)
    if len(checkNoIntersection) > 0:
        raise RuntimeError("Validation and Test Patients are overlapping.")

    np.save('rsna_test_patients_{}.npy'.format(key), np.array(test_patients))
    np.save('rsna_val_patients_{}.npy'.format(key), np.array(val_patients))
    # test_patients = np.load('rsna_test_patients_{}.npy'.format(key))

    for patient in arr:
        # Check for duplicate patients
        if patient in patient_imgpath:
            continue  # skip since image has already been written
        patient_imgpath[patient] = [patient]

        ds = dicom.dcmread(os.path.join(rsna_datapath, rsna_imgpath, patient + '.dcm'))
        pixel_array_numpy = ds.pixel_array
        imgname = patient + '.png'
        if patient in test_set:
            split_name, split_rows, split_count = 'test', test, test_count
        elif patient in val_set:
            split_name, split_rows, split_count = 'validation', validation, validation_count
        else:
            split_name, split_rows, split_count = 'train', train, train_count

        out_path = os.path.join(savepath, key, split_name, imgname)
        # cv2.imwrite signals failure through its boolean return value, so
        # check it rather than listing an image that was never written.
        if not cv2.imwrite(out_path, pixel_array_numpy):
            raise RuntimeError("Failed to write image: {}".format(out_path))
        split_rows.append([patient, imgname, key, 'rsna'])
        split_count[key] += 1

print('test count: ', test_count)
print('validation count: ', validation_count)
print('train count: ', train_count)

# Export
save_split(os.path.join(savepath,"train_split.txt"), train)
save_split(os.path.join(savepath,"test_split.txt"), test)
save_split(os.path.join(savepath,"val_split.txt"), validation)

# COVID Datasets

In [None]:
# RICORD patient details
# RICORD Text File generated by Wang et al.
