In [2]:
import numpy as np
import pandas as pd
import pydicom
import glob

In [3]:
train_dcm_fps = glob.glob('dicom_dir/*.dcm')

In [4]:
train_dcms = [pydicom.read_file(x, stop_before_pixels=True) for x in train_dcm_fps]

In [5]:
train_dcms

[(0008, 0005) Specific Character Set              CS: 'ISO_IR 100'
 (0008, 0008) Image Type                          CS: ['ORIGINAL', 'PRIMARY', 'AXIAL']
 (0008, 0012) Instance Creation Date              DA: '19950131'
 (0008, 0013) Instance Creation Time              TM: '091428'
 (0008, 0016) SOP Class UID                       UI: CT Image Storage
 (0008, 0018) SOP Instance UID                    UI: 1.3.6.1.4.1.14519.5.2.1.6450.4012.266402896965029418188277260373
 (0008, 0020) Study Date                          DA: '19950131'
 (0008, 0021) Series Date                         DA: '19950131'
 (0008, 0022) Acquisition Date                    DA: '19950131'
 (0008, 0023) Content Date                        DA: '19950131'
 (0008, 0030) Study Time                          TM: '083043'
 (0008, 0031) Series Time                         TM: '091346'
 (0008, 0032) Acquisition Time                    TM: '091400'
 (0008, 0033) Content Time                        TM: '091428'
 (0008, 0050) Ac

In [6]:
def parse_dcm_metadata(dcm):
    unpacked_data = {}
    group_elem_to_keywords = {}
    # iterating here to force conversion from lazy RawDataElement to DataElement
    for d in dcm:
        pass
    # keys are pydicom.tag.BaseTag, values are pydicom.dataelem.DataElement
    for tag, elem in dcm.items():
        tag_group = tag.group
        tag_elem = tag.elem
        keyword = elem.keyword
        group_elem_to_keywords[(tag_group, tag_elem)] = keyword
        value = elem.value
        unpacked_data[keyword] = value
    return unpacked_data, group_elem_to_keywords

train_meta_dicts, tag_to_keyword_train = zip(*[parse_dcm_metadata(x) for x in train_dcms])

In [7]:
unified_tag_to_key_train = {k:v for dict_ in tag_to_keyword_train for k,v in dict_.items()}

In [34]:
train_df = pd.DataFrame.from_records(data=train_meta_dicts)

In [35]:
train_df.head(1)

Unnamed: 0,Unnamed: 1,AccessionNumber,AcquisitionDate,AcquisitionNumber,AcquisitionTime,AcquisitionsInSeries,AdditionalPatientHistory,AdmittingDiagnosesDescription,Allergies,BitsAllocated,...,StudyTime,TableFeedPerRotation,TableHeight,TableSpeed,TimeOfLastCalibration,TotalCollimationWidth,WindowCenter,WindowCenterWidthExplanation,WindowWidth,XRayTubeCurrent
0,STENTOR,9688812014390983,19950131,1,91400,,CT FNA RT + LT LUNG,,,16,...,83043,55.0,173.0,55.0,,40.0,40,,400,160


In [37]:
train_df['PatientAge'] = train_df['PatientAge'].str[1:3]
train_df.to_csv('test.csv', index=False)

In [38]:
import matplotlib.pyplot as plt # for basic plots
import seaborn as sns # for nicer plots

In [57]:
print("Modality: {}\nManufacturer: {}\nPatient Age: {}\nPatient Sex: {}\nPatient Name: {}\nPatient ID: {}".format(
    train_df.Modality, 
    train_df.Manufacturer,
    train_df.PatientAge,
    train_df.PatientSex,
    train_df.PatientName,
    train_df.PatientID))

Modality: 0     CT
1     CT
2     CT
3     CT
4     CT
5     CT
6     CT
7     CT
8     CT
9     CT
10    CT
11    CT
12    CT
13    CT
14    CT
15    CT
16    CT
17    CT
18    CT
19    CT
20    CT
21    CT
22    CT
23    CT
24    CT
25    CT
26    CT
27    CT
28    CT
29    CT
      ..
70    CT
71    CT
72    CT
73    CT
74    CT
75    CT
76    CT
77    CT
78    CT
79    CT
80    CT
81    CT
82    CT
83    CT
84    CT
85    CT
86    CT
87    CT
88    CT
89    CT
90    CT
91    CT
92    CT
93    CT
94    CT
95    CT
96    CT
97    CT
98    CT
99    CT
Name: Modality, Length: 100, dtype: object
Manufacturer: 0     GE MEDICAL SYSTEMS
1                SIEMENS
2                SIEMENS
3     GE MEDICAL SYSTEMS
4     GE MEDICAL SYSTEMS
5                SIEMENS
6     GE MEDICAL SYSTEMS
7                SIEMENS
8                SIEMENS
9                SIEMENS
10               SIEMENS
11               SIEMENS
12               SIEMENS
13               SIEMENS
14    GE MEDICAL SYSTEMS
15       