## Imports and logging

In [2]:
import os
import re
import json
import jsonschema
import pandas as pd
import numpy as np
import logging
import pytz
import pydicom
import string
import tzlocal
import logging
import zipfile
import datetime
import argparse
import nibabel
from fnmatch import fnmatch
from pprint import pprint

logging.basicConfig()
log = logging.getLogger('U2787g')

## Functions

In [3]:
def get_session_label(dcm):
    """
    Switch on manufacturer and either pull out the StudyID or the StudyInstanceUID
    """
    session_label = ''
    if ( dcm.get('Manufacturer') and (dcm.get('Manufacturer').find('GE') != -1 or dcm.get('Manufacturer').find('Philips') != -1 ) and dcm.get('StudyID')):
        session_label = dcm.get('StudyID')
    else:
        session_label = dcm.get('StudyInstanceUID')

    return session_label


def validate_timezone(zone):
    # pylint: disable=missing-docstring
    if zone is None:
        zone = tzlocal.get_localzone()
    else:
        try:
            zone = pytz.timezone(zone.zone)
        except pytz.UnknownTimeZoneError:
            zone = None
    return zone


def parse_patient_age(age):
    """
    Parse patient age from string.
    convert from 70d, 10w, 2m, 1y to datetime.timedelta object.
    Returns age as duration in seconds.
    """
    if age == 'None' or not age:
        return None

    conversion = {  # conversion to days
        'Y': 365.25,
        'M': 30,
        'W': 7,
        'D': 1,
    }
    scale = age[-1:]
    value = age[:-1]
    if scale not in conversion.keys():
        # Assume years
        scale = 'Y'
        value = age

    age_in_seconds = datetime.timedelta(int(value) * conversion.get(scale)).total_seconds()

    # Make sure that the age is reasonable
    if not age_in_seconds or age_in_seconds <= 0:
        age_in_seconds = None

    return age_in_seconds


def timestamp(date, time, timezone):
    """
    Return datetime formatted string
    """
    if date and time and timezone:
        # return datetime.datetime.strptime(date + time[:6], '%Y%m%d%H%M%S')
        try:
            return timezone.localize(datetime.datetime.strptime(date + time[:6], '%Y%m%d%H%M%S'), timezone)
        except:
            log.warning('Failed to create timestamp!')
            log.info(date)
            log.info(time)
            log.info(timezone)
            return None
    return None


def get_timestamp(dcm, timezone):
    """
    Parse Study Date and Time, return acquisition and session timestamps
    """
    if hasattr(dcm, 'StudyDate') and hasattr(dcm, 'StudyTime'):
        study_date = dcm.StudyDate
        study_time = dcm.StudyTime
    elif hasattr(dcm, 'StudyDateTime'):
        study_date = dcm.StudyDateTime[0:8]
        study_time = dcm.StudyDateTime[8:]
    else:
        study_date = None
        study_time = None

    if hasattr(dcm, 'AcquisitionDate') and hasattr(dcm, 'AcquisitionTime'):
        acquitision_date = dcm.AcquisitionDate
        acquisition_time = dcm.AcquisitionTime
    elif hasattr(dcm, 'AcquisitionDateTime'):
        acquitision_date = dcm.AcquisitionDateTime[0:8]
        acquisition_time = dcm.AcquisitionDateTime[8:]
    # The following allows the timestamps to be set for ScreenSaves
    elif hasattr(dcm, 'ContentDate') and hasattr(dcm, 'ContentTime'):
        acquitision_date = dcm.ContentDate
        acquisition_time = dcm.ContentTime
    else:
        acquitision_date = None
        acquisition_time = None

    session_timestamp = timestamp(dcm.StudyDate, dcm.StudyTime, timezone)
    acquisition_timestamp = timestamp(acquitision_date, acquisition_time, timezone)

    if session_timestamp:
        if session_timestamp.tzinfo is None:
            log.info('no tzinfo found, using UTC...')
            session_timestamp = pytz.timezone('UTC').localize(session_timestamp)
        session_timestamp = session_timestamp.isoformat()
    else:
        session_timestamp = ''
    if acquisition_timestamp:
        if acquisition_timestamp.tzinfo is None:
            log.info('no tzinfo found, using UTC')
            acquisition_timestamp = pytz.timezone('UTC').localize(acquisition_timestamp)
        acquisition_timestamp = acquisition_timestamp.isoformat()
    else:
        acquisition_timestamp = ''
    return session_timestamp, acquisition_timestamp


def get_sex_string(sex_str):
    """
    Return male or female string.
    """
    if sex_str == 'M':
        sex = 'male'
    elif sex_str == 'F':
        sex = 'female'
    else:
        sex = ''
    return sex


def assign_type(s):
    """
    Sets the type of a given input.
    """
    if type(s) == pydicom.valuerep.PersonName or type(s) == pydicom.valuerep.PersonName3 or type(s) == pydicom.valuerep.PersonNameBase:
        return format_string(s)
    if type(s) == list or type(s) == pydicom.multival.MultiValue:
        try:
            return [ int(x) for x in s ]
        except ValueError:
            try:
                return [ float(x) for x in s ]
            except ValueError:
                return [ format_string(x) for x in s if len(x) > 0 ]
    else:
        s = str(s)
        try:
            return int(s)
        except ValueError:
            try:
                return float(s)
            except ValueError:
                return format_string(s)


def format_string(in_string):
    formatted = re.sub(r'[^\x00-\x7f]',r'', str(in_string)) # Remove non-ascii characters
    formatted = ''.join(filter(lambda x: x in string.printable, formatted))
    if len(formatted) == 1 and formatted == '?':
        formatted = None
    return formatted#.encode('utf-8').strip()


def get_seq_data(sequence, ignore_keys):
    seq_dict = {}
    for seq in sequence:
        for s_key in seq.dir():
            s_val = getattr(seq, s_key, '')
            if type(s_val) is pydicom.UID.UID or s_key in ignore_keys:
                continue

            if type(s_val) == pydicom.sequence.Sequence:
                _seq = get_seq_data(s_val, ignore_keys)
                seq_dict[s_key] = _seq
                continue

            if type(s_val) == str:
                s_val = format_string(s_val)
            else:
                s_val = assign_type(s_val)

            if s_val:
                seq_dict[s_key] = s_val

    return seq_dict


def get_pydicom_header(dcm):
    # Extract the header values
    header = {}
    exclude_tags = ['[Unknown]', 
                    'PixelData', 
                    'Pixel Data',  
                    '[User defined data]', 
                    '[Protocol Data Block (compressed)]', 
                    '[Histogram tables]', 
                    '[Unique image iden]']
    tags = dcm.dir()
    for tag in tags:
        try:
            if (tag not in exclude_tags) and ( type(dcm.get(tag)) != pydicom.sequence.Sequence ):
                value = dcm.get(tag)
                if value or value == 0: # Some values are zero
                    # Put the value in the header
                    if type(value) == str and len(value) < 10240: # Max pydicom field length
                        header[tag] = format_string(value)
                    else:
                        header[tag] = assign_type(value)
                else:
                    log.debug('No value found for tag: ' + tag)

            if type(dcm.get(tag)) == pydicom.sequence.Sequence:
                seq_data = get_seq_data(dcm.get(tag), exclude_tags)
                # Check that the sequence is not empty
                if seq_data:
                    header[tag] = seq_data
        except:
            log.debug('Failed to get ' + tag)
            pass
    return header


def get_csa_header(dcm):
    exclude_tags = ['PhoenixZIP', 'SrMsgBuffer']
    header = {}
    try:
        raw_csa_header = nibabel.nicom.dicomwrappers.SiemensWrapper(dcm).csa_header
        tags = raw_csa_header['tags']
    except:
        log.warning('Failed to parse csa header!')
        return header

    for tag in tags:
        if not raw_csa_header['tags'][tag]['items'] or tag in exclude_tags:
            log.debug('Skipping : %s' % tag)
            pass
        else:
            value = raw_csa_header['tags'][tag]['items']
            if len(value) == 1:
                value = value[0]
                if type(value) == str and ( len(value) > 0 and len(value) < 1024 ):
                    header[format_string(tag)] = format_string(value)
                else:
                    header[format_string(tag)] = assign_type(value)
            else:
                header[format_string(tag)] = assign_type(value)

    return header


def get_classification_from_string(value):
    result = {}

    parts = re.split(r'\s*,\s*', value)
    last_key = None
    for part in parts:
        key_value = re.split(r'\s*:\s*', part)

        if len(key_value) == 2:
            last_key = key = key_value[0]
            value = key_value[1]
        else:
            if last_key:
                key = last_key
            else:
                log.warning('Unknown classification format: {0}'.format(part))
                key = 'Custom'
            value = part

        if key not in result:
            result[key] = []

        result[key].append(value)

    return result


def validate_against_template(input_dict, template, error_log_path):
    """
    This is a function for validating a dictionary against a template. Given
    an input_dict and a template object, it will create a JSON schema validator
    and construct an object that is a list of error dictionaries. It will write a
    JSON file to the specified error_log_path and return the validation_errors object as
    well as log each error.message to log.errors

    :param input_dict: a dictionary of DICOM header data to be validated
    :param template: a template dictionary to validate against
    :param error_log_path: the path to which to write error log JSON
    :return: validation_errors, an object containing information on validation errors
    """
    # Initialize json schema validator
    validator = jsonschema.Draft7Validator(template)
    # Initialize list object for storing validation errors
    validation_errors = []
    for error in sorted(validator.iter_errors(input_dict), key=str):
        # Create a temporary dictionary for the individual error
        tmp_dict = {}
        # Get error type
        tmp_dict['error_type'] = error.validator
        # Get error message and log it
        tmp_dict['error_message'] = error.message
        log.error(error.message)
        # Required field errors are a little special and need to be handled
        # separately to get the field. We don't get the schema because it
        # will print the entire template schema
        if error.validator == "required":
            # Get the item failing validation from the error message
            tmp_dict['item'] = 'info.' + error.message.split("'")[1]
        # Get additional information for pattern and type errors
        elif error.validator in ("pattern", "type"):
            # Get the value of the field that failed validation
            tmp_dict['error_value'] = error.instance
            # Get the field that failed validation
            tmp_dict['item'] = 'info.' + str(error.path.pop())
            # Get the schema object used to validate in failed validation
            tmp_dict['schema'] = error.schema
        elif error.validator == "anyOf":
            tmp_dict['schema'] = {"anyOf": error.schema['anyOf']}
        else:
            pass
        # Append individual error object to the return validation_errors object
        validation_errors.append(tmp_dict)

    with open(error_log_path, 'w') as outfile:
        json.dump(validation_errors, outfile, separators=(', ', ': '), sort_keys=True, indent=4)
    return validation_errors


def dicom_to_json(zip_file_path, outbase, timezone):
    # Check for input file path
    if not os.path.exists(zip_file_path):
        log.debug('could not find %s' % zip_file_path)
        log.debug('checking input directory ...')
        if os.path.exists(os.path.join('/input', zip_file_path)):
            zip_file_path = os.path.join('/input', zip_file_path)
            log.debug('found %s' % zip_file_path)

    if not outbase:
        outbase = '/flywheel/v0/output'
        log.info('setting outbase to %s' % outbase)

    # Extract the last file in the zip to /tmp/ and read it
    dcm = []
    if zipfile.is_zipfile(zip_file_path):
        zip = zipfile.ZipFile(zip_file_path)
        num_files = len(zip.namelist())
        for n in range((num_files - 1), -1, -1):
            dcm_path = zip.extract(zip.namelist()[n], '/tmp')
            if os.path.isfile(dcm_path):
                try:
                    log.info('reading %s' % dcm_path)
                    dcm = pydicom.read_file(dcm_path)
                    # Here we check for the Raw Data Storage SOP Class, if there
                    # are other pydicom files in the zip then we read the next one,
                    # if this is the only class of pydicom in the file, we accept
                    # our fate and move on.
                    if dcm.get('SOPClassUID') == 'Raw Data Storage' and n != range((num_files - 1), -1, -1)[-1]:
                        continue
                    else:
                        break
                except:
                    pass
            else:
                log.warning('%s does not exist!' % dcm_path)
    else:
        log.info('Not a zip. Attempting to read %s directly' % os.path.basename(zip_file_path))
        dcm = pydicom.read_file(zip_file_path)

    if not dcm:
        log.warning('dcm is empty!!!')
        os.sys.exit(1)

    # Build metadata
    metadata = {}

    # Session metadata
    metadata['session'] = {}
    session_timestamp, acquisition_timestamp = get_timestamp(dcm, timezone);
    if session_timestamp:
        metadata['session']['timestamp'] = session_timestamp
    if hasattr(dcm, 'OperatorsName') and dcm.get('OperatorsName'):
        metadata['session']['operator'] = format_string(dcm.get('OperatorsName'))
    session_label = get_session_label(dcm)
    if session_label:
        metadata['session']['label'] = session_label

    # Subject Metadata
    metadata['session']['subject'] = {}
    if hasattr(dcm, 'PatientSex') and get_sex_string(dcm.get('PatientSex')):
        metadata['session']['subject']['sex'] = get_sex_string(dcm.get('PatientSex'))
    if hasattr(dcm, 'PatientAge') and dcm.get('PatientAge'):
        try:
            age = parse_patient_age(dcm.get('PatientAge'))
            if age:
                metadata['session']['subject']['age'] = int(age)
        except:
            pass
    if hasattr(dcm, 'PatientName') and dcm.get('PatientName').given_name:
        # If the first name or last name field has a space-separated string, and one or the other field is not
        # present, then we assume that the operator put both first and last names in that one field. We then
        # parse that field to populate first and last name.
        metadata['session']['subject']['firstname'] = str(format_string(dcm.get('PatientName').given_name))
        if not dcm.get('PatientName').family_name:
            name = format_string(dcm.get('PatientName').given_name.split(' '))
            if len(name) == 2:
                first = name[0]
                last = name[1]
                metadata['session']['subject']['lastname'] = str(last)
                metadata['session']['subject']['firstname'] = str(first)
    if hasattr(dcm, 'PatientName') and dcm.get('PatientName').family_name:
        metadata['session']['subject']['lastname'] = str(format_string(dcm.get('PatientName').family_name))
        if not dcm.get('PatientName').given_name:
            name = format_string(dcm.get('PatientName').family_name.split(' '))
            if len(name) == 2:
                first = name[0]
                last = name[1]
                metadata['session']['subject']['lastname'] = str(last)
                metadata['session']['subject']['firstname'] = str(first)

    # File classification
    pydicom_file = {}
    pydicom_file['name'] = os.path.basename(zip_file_path)
    pydicom_file['modality'] = format_string(dcm.get('Modality', 'MR'))

    # Acquisition metadata
    metadata['acquisition'] = {}
    if hasattr(dcm, 'Modality') and dcm.get('Modality'):
        metadata['acquisition']['instrument'] = format_string(dcm.get('Modality'))

    series_desc = format_string(dcm.get('SeriesDescription', ''))
    if series_desc:
        metadata['acquisition']['label'] = series_desc

    if acquisition_timestamp:
        metadata['acquisition']['timestamp'] = acquisition_timestamp

    # Acquisition metadata from pydicom header
    pydicom_file['info'] = get_pydicom_header(dcm)

    # Validate header data
    error_filepath = os.path.join(output_folder, 'error.log.json')
    validation_errors = validate_against_template(pydicom_file['info'], json_template, error_filepath)
    if validation_errors:
        metadata['acquisition']['tags'] = ['error']

    # Append the pydicom_file to the files array
    metadata['acquisition']['files'] = [pydicom_file]

    # Acquisition metadata from pydicom header
    metadata['acquisition']['metadata'] = get_pydicom_header(dcm)
    if dcm.get('Manufacturer') == 'SIEMENS':
        csa_header = get_csa_header(dcm)
        if csa_header:
            metadata['acquisition']['metadata']['CSAHeader'] = csa_header

    # Write out the metadata to file (.metadata.json)
    metafile_outname = os.path.join(os.path.dirname(outbase), '.metadata.json')
    with open(metafile_outname, 'w') as metafile:
        json.dump(metadata, metafile, separators=(', ', ': '), sort_keys=True, indent=4)

    # Show the metadata
    pprint(metadata)

    return metafile_outname

## Load in JSON Schema template

In [4]:


with open(template_filepath) as template_data:
    template_json = json.load(template_data)

NameError: name 'template_filepath' is not defined

In [None]:
test_dictionary = {
    "Modality": "CT",
    "ImageType": "SCREEN SAVE",
    "PatientID": "55555",
}

In [None]:
validate_against_template(test_dictionary, template_json, "error.log.json")

In [None]:
# Initialize json schema validator
validator = jsonschema.Draft7Validator(template_json)

In [5]:
for error in sorted(validator.iter_errors(test_dictionary), key=str):
    print(error.validator)
    if error.validator == "anyOf":
        print(error.schema)

NameError: name 'validator' is not defined

In [6]:
template_filepath = "U2787g_template"
# Configure timezone and dicom filepath
timezone = validate_timezone(tzlocal.get_localzone())
zip_file_path = "A.zip"
outbase = os.getcwd()
output_folder = os.getcwd()
# Import JSON template
with open(template_filepath) as template_data:
    json_template = json.load(template_data)


In [8]:
# Check for input file path
if not os.path.exists(zip_file_path):
    log.debug('could not find %s' % zip_file_path)
    log.debug('checking input directory ...')
    if os.path.exists(os.path.join('/input', zip_file_path)):
        zip_file_path = os.path.join('/input', zip_file_path)
        log.debug('found %s' % zip_file_path)

if not outbase:
    outbase = '/flywheel/v0/output'
    log.info('setting outbase to %s' % outbase)

# Extract the last file in the zip to /tmp/ and read it
dcm = []
if zipfile.is_zipfile(zip_file_path):
    zip = zipfile.ZipFile(zip_file_path)
    num_files = len(zip.namelist())
    for n in range((num_files - 1), -1, -1):
        dcm_path = zip.extract(zip.namelist()[n], '/tmp')
        if os.path.isfile(dcm_path):
            try:
                log.info('reading %s' % dcm_path)
                dcm = pydicom.read_file(dcm_path)
                # Here we check for the Raw Data Storage SOP Class, if there
                # are other pydicom files in the zip then we read the next one,
                # if this is the only class of pydicom in the file, we accept
                # our fate and move on.
                if dcm.get('SOPClassUID') == 'Raw Data Storage' and n != range((num_files - 1), -1, -1)[-1]:
                    continue
                else:
                    break
            except:
                pass
        else:
            log.warning('%s does not exist!' % dcm_path)
else:
    log.info('Not a zip. Attempting to read %s directly' % os.path.basename(zip_file_path))
    dcm = pydicom.read_file(zip_file_path)

if not dcm:
    log.warning('dcm is empty!!!')
    os.sys.exit(1)

# Build metadata
metadata = {}

# Session metadata
metadata['session'] = {}
session_timestamp, acquisition_timestamp = get_timestamp(dcm, timezone);
if session_timestamp:
    metadata['session']['timestamp'] = session_timestamp
if hasattr(dcm, 'OperatorsName') and dcm.get('OperatorsName'):
    metadata['session']['operator'] = format_string(dcm.get('OperatorsName'))
session_label = get_session_label(dcm)
if session_label:
    metadata['session']['label'] = session_label

# Subject Metadata
metadata['session']['subject'] = {}
if hasattr(dcm, 'PatientSex') and get_sex_string(dcm.get('PatientSex')):
    metadata['session']['subject']['sex'] = get_sex_string(dcm.get('PatientSex'))
if hasattr(dcm, 'PatientAge') and dcm.get('PatientAge'):
    try:
        age = parse_patient_age(dcm.get('PatientAge'))
        if age:
            metadata['session']['subject']['age'] = int(age)
    except:
        pass
if hasattr(dcm, 'PatientName') and dcm.get('PatientName').given_name:
    # If the first name or last name field has a space-separated string, and one or the other field is not
    # present, then we assume that the operator put both first and last names in that one field. We then
    # parse that field to populate first and last name.
    metadata['session']['subject']['firstname'] = str(format_string(dcm.get('PatientName').given_name))
    if not dcm.get('PatientName').family_name:
        name = format_string(dcm.get('PatientName').given_name.split(' '))
        if len(name) == 2:
            first = name[0]
            last = name[1]
            metadata['session']['subject']['lastname'] = str(last)
            metadata['session']['subject']['firstname'] = str(first)
if hasattr(dcm, 'PatientName') and dcm.get('PatientName').family_name:
    metadata['session']['subject']['lastname'] = str(format_string(dcm.get('PatientName').family_name))
    if not dcm.get('PatientName').given_name:
        name = format_string(dcm.get('PatientName').family_name.split(' '))
        if len(name) == 2:
            first = name[0]
            last = name[1]
            metadata['session']['subject']['lastname'] = str(last)
            metadata['session']['subject']['firstname'] = str(first)

# File classification
pydicom_file = {}
pydicom_file['name'] = os.path.basename(zip_file_path)
pydicom_file['modality'] = format_string(dcm.get('Modality', 'MR'))

# Acquisition metadata
metadata['acquisition'] = {}
if hasattr(dcm, 'Modality') and dcm.get('Modality'):
    metadata['acquisition']['instrument'] = format_string(dcm.get('Modality'))

series_desc = format_string(dcm.get('SeriesDescription', ''))
if series_desc:
    metadata['acquisition']['label'] = series_desc

if acquisition_timestamp:
    metadata['acquisition']['timestamp'] = acquisition_timestamp

# Acquisition metadata from pydicom header
pydicom_file['info'] = get_pydicom_header(dcm)

# Validate header data
error_filepath = os.path.join(output_folder, 'error.log.json')
validation_errors = validate_against_template(pydicom_file['info'], json_template, error_filepath)
if validation_errors:
    metadata['acquisition']['tags'] = ['error']

# Append the pydicom_file to the files array
metadata['acquisition']['files'] = [pydicom_file]

# Acquisition metadata from pydicom header
metadata['acquisition']['metadata'] = get_pydicom_header(dcm)
if dcm.get('Manufacturer') == 'SIEMENS':
    csa_header = get_csa_header(dcm)
    if csa_header:
        metadata['acquisition']['metadata']['CSAHeader'] = csa_header

# Write out the metadata to file (.metadata.json)
metafile_outname = os.path.join(os.path.dirname(outbase), '.metadata.json')
with open(metafile_outname, 'w') as metafile:
    json.dump(metadata, metafile, separators=(', ', ': '), sort_keys=True, indent=4)

# Show the metadata
pprint(metadata)

ERROR:U2787g:'737178' does not match '^[0-9]{5}$'
ERROR:U2787g:['ORIGINAL', 'PRIMARY', 'OTHER'] is not of type 'string'


{'acquisition': {'files': [{'info': {'AccessionNumber': '039208195',
                                     'AcquisitionDate': '20050623',
                                     'AcquisitionMatrix': [0, 256, 256, 0],
                                     'AcquisitionNumber': 1,
                                     'AcquisitionTime': '083051',
                                     'AdditionalPatientHistory': 'RESEARCH PT',
                                     'AngioFlag': 'N',
                                     'BitsAllocated': 16,
                                     'BitsStored': 12,
                                     'CardiacNumberOfImages': 0,
                                     'Columns': 256,
                                     'ContentDate': '20050623',
                                     'ContentTime': '083051',
                                     'DeviceSerialNumber': '000000209526SMMR',
                                     'EchoNumbers': 1,
                                  

In [12]:
import pandas as pd
pd.DataFrame.from_dict(metadata['acquisition']['files'][0]['info'])

ValueError: arrays must all be same length

## Import all dcm file headers

I'd like a way to import all headers as a dataframe in order to do validation across files. Pandas does not accept lists as dictionary values, so just non-sequence for now.


In [37]:
def assign_type_pd(s):
    """
    Sets the type of a given input.
    """
    if type(s) == pydicom.valuerep.PersonName or type(s) == pydicom.valuerep.PersonName3 or type(s) == pydicom.valuerep.PersonNameBase:
        return format_string(s)
    else:
        s = str(s)
        try:
            return int(s)
        except ValueError:
            try:
                return float(s)
            except ValueError:
                return format_string(s)
def dicom_header_to_pd(dcm):
    # Extract the header values
    header = {}
    exclude_tags = ['[Unknown]', 
                    'PixelData', 
                    'Pixel Data',  
                    '[User defined data]', 
                    '[Protocol Data Block (compressed)]', 
                    '[Histogram tables]', 
                    '[Unique image iden]']
    tags = dcm.dir()
    for tag in tags:
        try:
            if (tag not in exclude_tags) and ( type(dcm.get(tag)) != pydicom.sequence.Sequence ):
                value = dcm.get(tag)
                if value or value == 0: # Some values are zero
                    # Put the value in the header
                    if type(value) == str and len(value) < 10240: # Max pydicom field length
                        header[tag] = [format_string(value)]
                    else:
                        header[tag] = [assign_type_pd(value)]
                        
                else:
                    log.debug('No value found for tag: ' + tag)
        except:
            log.debug('Failed to get ' + tag)
            pass
    dataframe = pd.DataFrame.from_dict(header)
    return dataframe
    
    #return header


In [38]:
header = dicom_header_to_pd(dcm)
header

Unnamed: 0,AccessionNumber,AcquisitionDate,AcquisitionMatrix,AcquisitionNumber,AcquisitionTime,AdditionalPatientHistory,AngioFlag,BitsAllocated,BitsStored,CardiacNumberOfImages,...,StationName,StudyDate,StudyDescription,StudyID,StudyInstanceUID,StudyTime,TriggerWindow,VariableFlipAngleFlag,WindowCenter,WindowWidth
0,39208195,20050623,"[0, 256, 256, 0]",1,83051,RESEARCH PT,N,16,12,0,...,GEMSOW,20050623,BRAIN,21671,1.2.840.113619.2.176.3596.6688930.8424.1119539...,82053,0,N,546,1093


In [46]:
df_list = []
dcm = []
if zipfile.is_zipfile(zip_file_path):
    zip = zipfile.ZipFile(zip_file_path)
    num_files = len(zip.namelist())
    for n in range((num_files - 1), -1, -1):
        dcm_path = zip.extract(zip.namelist()[n], '/tmp')
        print(dcm_path)
        if os.path.isfile(dcm_path):
            try:
                log.info('reading %s' % dcm_path)
                dcm = pydicom.read_file(dcm_path)
                # Here we check for the Raw Data Storage SOP Class, if there
                # are other pydicom files in the zip then we read the next one,
                # if this is the only class of pydicom in the file, we accept
                # our fate and move on.
                if dcm.get('SOPClassUID') == 'Raw Data Storage' and n != range((num_files - 1), -1, -1)[-1]:
                    continue
                else:
                    header = None
                    header = dicom_header_to_pd(dcm)
                    df_list.append(header)
            except:
                pass
        else:
            log.warning('%s does not exist!' % dcm_path)

/tmp/A/A:Z07
/tmp/A/A:Z09
/tmp/A/A:Z08
/tmp/A/A:Z06
/tmp/A/A:Z01
/tmp/A/A:Z23
/tmp/A/A:Z24
/tmp/A/A:Z12
/tmp/A/A:Z15
/tmp/A/A:Z14
/tmp/A/A:Z13
/tmp/A/A:Z25
/tmp/A/A:Z22
/tmp/A/A:Z04
/tmp/A/A:Z03
/tmp/A/A:Z02




/tmp/A/A:Z05
/tmp/A/A:Z18
/tmp/A/A:Z20
/tmp/A/A:Z16
/tmp/A/A:Z11
/tmp/A/A:Z10
/tmp/A/A:Z17
/tmp/A/A:Z21
/tmp/A/A:Z19
/tmp/A


In [53]:
df_list = []
dcm = []
for zip_file in zipfile.ZipFile(zip_file_path).namelist():
    dcm_path = zip.extract(zip_file, '/tmp')
    if os.path.isfile(dcm_path):
        try:
            log.info('reading %s' % dcm_path)
            dcm = pydicom.read_file(dcm_path)
            # Here we check for the Raw Data Storage SOP Class, if there
            # are other pydicom files in the zip then we read the next one,
            # if this is the only class of pydicom in the file, we accept
            # our fate and move on.
            if dcm.get('SOPClassUID') == 'Raw Data Storage' and n != range((num_files - 1), -1, -1)[-1]:
                continue
            else:
                header = None
                header = dicom_header_to_pd(dcm)
                df_list.append(header)
        except:
            pass

In [56]:
df = pd.concat(df_list,)

In [70]:
#print(df.nunique())
for col in df.columns:
    if len(df[col].unique()) == 2:
        print(df[col])

0    ['1', '-0', '0', '-0', '0.999998', '0.00219732']
0    ['1', '-0', '0', '-0', '0.999998', '0.00219732']
0    ['1', '-0', '0', '-0', '0.999998', '0.00219733']
0    ['1', '-0', '0', '-0', '0.999998', '0.00219732']
0    ['1', '-0', '0', '-0', '0.999998', '0.00219732']
0    ['1', '-0', '0', '-0', '0.999998', '0.00219733']
0    ['1', '-0', '0', '-0', '0.999998', '0.00219732']
0    ['1', '-0', '0', '-0', '0.999998', '0.00219733']
0    ['1', '-0', '0', '-0', '0.999998', '0.00219732']
0    ['1', '-0', '0', '-0', '0.999998', '0.00219732']
0    ['1', '-0', '0', '-0', '0.999998', '0.00219732']
0    ['1', '-0', '0', '-0', '0.999998', '0.00219732']
0    ['1', '-0', '0', '-0', '0.999998', '0.00219732']
0    ['1', '-0', '0', '-0', '0.999998', '0.00219732']
0    ['1', '-0', '0', '-0', '0.999998', '0.00219732']
0    ['1', '-0', '0', '-0', '0.999998', '0.00219733']
0    ['1', '-0', '0', '-0', '0.999998', '0.00219733']
0    ['1', '-0', '0', '-0', '0.999998', '0.00219732']
0    ['1', '-0', '0', '-0', 

In [73]:
df.loc[0,'ImageOrientationPatient']

0    ['1', '-0', '0', '-0', '0.999998', '0.00219732']
0    ['1', '-0', '0', '-0', '0.999998', '0.00219732']
0    ['1', '-0', '0', '-0', '0.999998', '0.00219733']
0    ['1', '-0', '0', '-0', '0.999998', '0.00219732']
0    ['1', '-0', '0', '-0', '0.999998', '0.00219732']
0    ['1', '-0', '0', '-0', '0.999998', '0.00219733']
0    ['1', '-0', '0', '-0', '0.999998', '0.00219732']
0    ['1', '-0', '0', '-0', '0.999998', '0.00219733']
0    ['1', '-0', '0', '-0', '0.999998', '0.00219732']
0    ['1', '-0', '0', '-0', '0.999998', '0.00219732']
0    ['1', '-0', '0', '-0', '0.999998', '0.00219732']
0    ['1', '-0', '0', '-0', '0.999998', '0.00219732']
0    ['1', '-0', '0', '-0', '0.999998', '0.00219732']
0    ['1', '-0', '0', '-0', '0.999998', '0.00219732']
0    ['1', '-0', '0', '-0', '0.999998', '0.00219732']
0    ['1', '-0', '0', '-0', '0.999998', '0.00219733']
0    ['1', '-0', '0', '-0', '0.999998', '0.00219733']
0    ['1', '-0', '0', '-0', '0.999998', '0.00219732']
0    ['1', '-0', '0', '-0', 