In [1]:
import pandas as pd
import numpy as np
import pydicom
import glob

In [2]:
## First, read all of my DICOM files into a list.
## glob returns matches in a filesystem-dependent order, so sort them to keep
## `mydicoms[0]` (and the row order of anything built from this list) reproducible.
mydicoms = sorted(glob.glob("*.dcm"))

### Let's look at the contents of the first DICOM:

In [3]:
# Parse the first path in the list and echo the resulting dataset so that
# its header elements can be inspected in the cell output.
first_dicom_path = mydicoms[0]
dcm1 = pydicom.dcmread(first_dicom_path)
dcm1

(0008, 0016) SOP Class UID                       UI: Secondary Capture Image Storage
(0008, 0018) SOP Instance UID                    UI: 1.3.6.1.4.1.11129.5.5.162426174634548301003630270411628292460952
(0008, 0060) Modality                            CS: 'DX'
(0008, 1030) Study Description                   LO: 'Effusion|Nodule|Pleural_Thickening|Mass'
(0010, 0020) Patient ID                          LO: '29579'
(0010, 0040) Patient's Sex                       CS: 'F'
(0010, 1010) Patient's Age                       AS: '24'
(0020, 000d) Study Instance UID                  UI: 1.3.6.1.4.1.11129.5.5.113025392650823751977671880960497589856674
(0020, 000e) Series Instance UID                 UI: 1.3.6.1.4.1.11129.5.5.168055162156043936178718006100964727334210
(0028, 0002) Samples per Pixel                   US: 1
(0028, 0004) Photometric Interpretation          CS: 'MONOCHROME2'
(0028, 0010) Rows                                US: 1024
(0028, 0011) Columns                             US: 1024

In [13]:
## Exploratory step: check how each attribute we need is read through pydicom
## before building the full table. One line is printed per attribute.
print(
    f'ID: {dcm1.PatientID}',
    f'Sex: {dcm1.PatientSex}',
    f'Age: {dcm1.PatientAge}',
    f'Modality: {dcm1.Modality}',
    f'Study Description: {dcm1.StudyDescription}',
    f'Rows: {dcm1.Rows}',
    f'Columns: {dcm1.Columns}',
    sep='\n',
)

ID: 29579
Sex: F
Age: 24
Modality: DX
Study Description: Effusion|Nodule|Pleural_Thickening|Mass
Rows: 1024
Columns: 1024


## Now, let's create the dataframe that we want, and populate it in a loop with all of our DICOMS:

To complete this exercise, create a single dataframe that has the following columns:
- Patient ID
- Patient Age (as an integer)
- Patient Sex (M/F)
- Imaging Modality
- Type of finding in the image
- Number of rows in the image
- Number of columns in the image

Save this dataframe as a .CSV file.

In [16]:
# Accumulate one value per DICOM file for every output column. The
# dict-of-lists layout maps one-to-one onto the DataFrame columns below.
data = {
    'ID': [],
    'Sex': [],
    'Age': [],
    'Modality': [],
    'Description': [],
    'Rows': [],
    'Columns': [],
}
for f in mydicoms:
    # Use a loop-local name: rebinding `dcm1` here would silently change what
    # the earlier exploration cells show when they are re-run.
    dcm = pydicom.dcmread(f)
    data['ID'].append(dcm.PatientID)
    data['Sex'].append(dcm.PatientSex)
    # The exercise asks for the age as an integer, but pydicom returns it as
    # a string (e.g. '24'). int() raises ValueError on a non-numeric value
    # instead of letting text slip into the column.
    data['Age'].append(int(dcm.PatientAge))
    data['Modality'].append(dcm.Modality)
    # The study description carries the finding label(s) for the image.
    data['Description'].append(dcm.StudyDescription)
    data['Rows'].append(dcm.Rows)
    data['Columns'].append(dcm.Columns)

df = pd.DataFrame.from_dict(data)

In [17]:
df

Unnamed: 0,ID,Sex,Age,Modality,Description,Rows,Columns
0,29579,F,24,DX,Effusion|Nodule|Pleural_Thickening|Mass,1024,1024
1,1688,F,59,DX,Infiltration|Nodule,1024,1024
2,13659,F,62,DX,Consolidation|Mass|Pneumonia|Pneumothorax,1024,1024
3,13118,M,69,DX,Atelectasis,1024,1024
4,10172,F,59,DX,Atelectasis|Effusion,1024,1024
5,5066,M,52,DX,Cardiomegaly|Effusion|Infiltration,1024,1024
6,23075,M,31,DX,Mass,1024,1024
