In [2]:
import pandas as pd
import numpy as np
import pydicom
import glob

In [3]:
## Gather the path of every DICOM file (*.dcm) in the working directory into a list
mydicoms = list(glob.iglob("*.dcm"))

In [4]:
# Show the file names matched by the glob pattern
mydicoms

['dicom_00013118_008.dcm',
 'dicom_00001688_000.dcm',
 'dicom_00010172_001.dcm',
 'dicom_00023075_033.dcm',
 'dicom_00013659_019.dcm',
 'dicom_00005066_030.dcm',
 'dicom_00029579_005.dcm']

### Let's look at the contents of the first DICOM:

In [5]:
# Parse the first file in the list into a pydicom dataset, then display it
first_dicom_path = mydicoms[0]
dcm1 = pydicom.dcmread(first_dicom_path)
dcm1

(0008, 0016) SOP Class UID                       UI: Secondary Capture Image Storage
(0008, 0018) SOP Instance UID                    UI: 1.3.6.1.4.1.11129.5.5.139539879914217162512411239901306132962191
(0008, 0060) Modality                            CS: 'DX'
(0008, 1030) Study Description                   LO: 'Atelectasis'
(0010, 0020) Patient ID                          LO: '13118'
(0010, 0040) Patient's Sex                       CS: 'M'
(0010, 1010) Patient's Age                       AS: '69'
(0020, 000d) Study Instance UID                  UI: 1.3.6.1.4.1.11129.5.5.120992059193772113283592409393507044871674
(0020, 000e) Series Instance UID                 UI: 1.3.6.1.4.1.11129.5.5.110922964580080663514009950443538578354984
(0028, 0002) Samples per Pixel                   US: 1
(0028, 0004) Photometric Interpretation          CS: 'MONOCHROME2'
(0028, 0010) Rows                                US: 1024
(0028, 0011) Columns                             US: 1024
(0028, 0100) Bits Allo

In [16]:
# List every attribute and method name exposed by the dataset object
attribute_names = dir(dcm1)
print(attribute_names)

['BitsAllocated', 'BitsStored', 'Columns', 'HighBit', 'Modality', 'PatientAge', 'PatientID', 'PatientSex', 'PhotometricInterpretation', 'PixelData', 'PixelRepresentation', 'Rows', 'SOPClassUID', 'SOPInstanceUID', 'SamplesPerPixel', 'SeriesInstanceUID', 'StudyDescription', 'StudyInstanceUID', '__contains__', '__delattr__', '__delitem__', '__dir__', '__enter__', '__eq__', '__exit__', '__format__', '__ge__', '__getattr__', '__getattribute__', '__getitem__', '__gt__', '__init__', '__init_subclass__', '__iter__', '__le__', '__len__', '__lt__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__setitem__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_character_set', '_convert_pixel_data_using_handler', '_convert_pixel_data_without_handler', '_dataset_slice', '_do_pixel_data_conversion', '_pretty_str', '_slice_dataset', 'add', 'add_new', 'clear', 'convert_pixel_data', 'copy', 'data_element', 'decode', 'decompress', 'dir', 'elements', 'ensure_fil

In [28]:
## Exploratory check: read individual header attributes off the dataset by keyword

for keyword in ("PatientID", "PatientSex", "PatientAge",
                "StudyDescription", "Modality", "PhotometricInterpretation"):
    print(getattr(dcm1, keyword))

# Image dimensions are taken from the decoded pixel array here, not from the header
pixel_shape = dcm1.pixel_array.shape
print("Rows : ", pixel_shape[0])
print("Columns : ", pixel_shape[1])

13118
M
69
Atelectasis
DX
MONOCHROME2
Rows :  1024
Columns :  1024


## Now, let's create the dataframe that we want, and populate it in a loop with all of our DICOMS:

To complete this exercise, create a single dataframe that has the following columns:
- Patient ID
- Patient Age (as an integer)
- Patient Sex (M/F)
- Imaging Modality
- Type of finding in the image
- Number of rows in the image
- Number of columns in the image

Save this dataframe as a .CSV file.

In [30]:
# Collect one value per DICOM file for each column of the summary table.
patientID = []
patientAge = []
patientSex = []
imagingModality = []
finding = []
imageRows = []
imageCols = []

for imagedcm in mydicoms:
    dcm_i = pydicom.dcmread(imagedcm)
    patientID.append(dcm_i.PatientID)
    patientSex.append(dcm_i.PatientSex)
    # The exercise asks for the age as an integer, but the header holds it as
    # text (e.g. '69'), so convert it here.
    # NOTE(review): int() assumes plain digits with no unit suffix -- confirm
    # before reusing this on other DICOM sources.
    patientAge.append(int(dcm_i.PatientAge))
    # The study description carries the finding label(s) for the image.
    finding.append(dcm_i.StudyDescription)
    imagingModality.append(dcm_i.Modality)
    # Read the shape once per file; rows are axis 0 and columns are axis 1.
    pixel_shape = dcm_i.pixel_array.shape
    imageRows.append(pixel_shape[0])
    imageCols.append(pixel_shape[1])

# Column name -> list of per-file values; all lists have one entry per file.
data = {'patientID':patientID, 'patientAge':patientAge, 'patientSex':patientSex,
       'imagingModality':imagingModality, 'finding':finding, 'imageRows':imageRows, 'imageCols':imageCols}

df = pd.DataFrame(data=data)
df.head()

Unnamed: 0,patientID,patientAge,patientSex,imagingModality,finding,imageRows,imageCols
0,13118,69,M,DX,Atelectasis,1024,1024
1,1688,59,F,DX,Infiltration|Nodule,1024,1024
2,10172,59,F,DX,Atelectasis|Effusion,1024,1024
3,23075,31,M,DX,Mass,1024,1024
4,13659,62,F,DX,Consolidation|Mass|Pneumonia|Pneumothorax,1024,1024


In [None]:
# Alternatively: build one row (a list of fields) per file and create the
# dataframe in a single call. Here the image size is read from the header
# attributes (Rows / Columns) rather than from the decoded pixel array.

all_data = []

for i in mydicoms: 
    dcm = pydicom.dcmread(i)
    # Field order must match the `columns` list passed to pd.DataFrame below.
    fields = [dcm.PatientID, int(dcm.PatientAge), dcm.PatientSex, dcm.Modality, dcm.StudyDescription,
             dcm.Rows, dcm.Columns]
    all_data.append(fields)
    
mydata = pd.DataFrame(all_data, 
                      columns = ['PatientID','PatientAge','PatientSex','Modality','Findings','Rows','Columns'])

In [None]:
# Save the dataframe as a CSV file in the working directory. With pandas'
# default settings the row index is written as the leading, unnamed column.
df.to_csv(path_or_buf="DICOM_pre.csv")