## Imports

In [33]:
import os
import re
import json
import jsonschema
import pandas as pd
import numpy as np
import logging
logging.basicConfig()
log = logging.getLogger('import_dicom_metadata')


## Functions

In [2]:
def import_file(filepath):
    """
    Read a metadata file into a pandas DataFrame.

    The reader is picked from the file extension, compared
    case-insensitively: ``.xls``/``.xlsx`` go through ``pd.read_excel``
    and ``.csv`` goes through ``pd.read_csv``.

    For Excel input this also partially works around date parsing in
    ``pd.read_excel`` not being switchable off: every datetime64 column
    is converted back to strings and ``NaT`` entries become ``NaN``.

    :param filepath: path to the .xls, .xlsx or .csv file
    :type filepath: str or os.PathLike
    :returns: the file contents as a DataFrame, or None (after printing
        a message) when the extension is not supported
    :rtype: pandas.DataFrame or None
    """
    # os.fspath lets pathlib.Path objects through; lower() makes
    # ".XLSX" / ".CSV" behave like their lower-case spellings.
    lowered_path = os.fspath(filepath).lower()

    if lowered_path.endswith(('.xls', '.xlsx')):
        dataframe = pd.read_excel(filepath)

        # Columns that read_excel parsed as dates
        datetime_part = dataframe.select_dtypes('datetime64')
        if not datetime_part.empty:
            print('fixing datetime')
            for column in datetime_part.columns:
                # Back to text; astype('str') renders missing dates as
                # the literal 'NaT', which is then turned into NaN.
                dataframe[column] = (
                    dataframe[column].astype('str').replace('NaT', np.nan)
                )
        return dataframe

    if lowered_path.endswith('.csv'):
        return pd.read_csv(filepath)

    print('File type is not supported')
    return None


def convert_value(value):
    """
    Return ``value`` as a string, without a trailing ``.0``.

    Strings (including str subclasses) are returned unchanged. Anything
    else is passed through ``str()`` and a trailing ``.0`` / ``.00`` ...
    is removed, since pandas tends to hand whole numbers back as floats
    (``80.0`` -> ``'80'``).

    :param value: any object
    :returns: the string form of ``value``
    :rtype: str
    """
    if isinstance(value, str):
        return value
    # Raw string: "\." is not a valid escape in a plain string literal.
    return re.sub(r'\.0+$', '', str(value))


def export_to_dict(dataframe):
    """
    Export a pandas DataFrame to a dictionary.

    The first column is assumed (for now) to hold the subject. The shape
    of the result depends on how many non-null subjects it contains:

    * exactly one subject -> ``{column: value}``; a column with more
      than one non-null entry is exported as a list of those entries,
      otherwise its single non-null entry is used (or the first row's
      value when the column is entirely null)
    * more than one subject -> ``{subject: {column: value}}`` with every
      value converted to ``str``
    * no subjects (or no columns) -> a message is printed and None is
      returned

    :param dataframe: a pandas DataFrame object
    :type dataframe: DataFrame
    :returns: output_dict (dict) - the output object to be converted to
        json, or None when no dictionary could be created
    """
    # Without any column there is no subject column to inspect
    if len(dataframe.columns) == 0:
        print("No subjects in DataFrame. Dictionary not created.")
        return None

    # for now, assume subject is the first column
    subject_column = dataframe.columns[0]
    # number of non-null subjects
    subject_count = dataframe[subject_column].count()

    # count() yields something else (e.g. a Series) when the column label
    # is duplicated; accept any integer flavour rather than only np.int64
    if not isinstance(subject_count, (int, np.integer)):
        print("subject_count is not a valid integer. Dictionary not created.")
        return None

    if subject_count < 1:
        print("No subjects in DataFrame. Dictionary not created.")
        return None

    if subject_count == 1:
        # A single subject may still span several rows (list columns)
        print("Processing single subject...")
        output_dict = {}
        for column in dataframe:
            series = dataframe[column]
            present = series.dropna()
            if len(present) > 1:
                output_dict[column] = present.tolist()
            elif len(present) == 1:
                # Positional lookup of the one real value: it need not
                # sit in the first row, nor under index label 0.
                output_dict[column] = present.iloc[0]
            else:
                # Entirely null column: keep the first row's null marker
                output_dict[column] = series.iloc[0]
        return output_dict

    print("Processing multiple subjects...")
    output_dict = {}
    # Rows sharing a subject overwrite each other; the last one wins
    for _, row in dataframe.astype("str").iterrows():
        output_dict[row[subject_column]] = row.to_dict()
    return output_dict


In [21]:
# Read the Excel and CSV metadata files, then convert each DataFrame to
# a dictionary. import_file and export_to_dict print progress messages,
# so the statement order below determines the order of the cell output.
xls_filepath = "input.metadata.xlsx"
csv_filepath = "input.metadata.csv"
xls_dataframe = import_file(xls_filepath)
csv_dataframe = import_file(csv_filepath)
xls_dict = export_to_dict(xls_dataframe)
csv_dict = export_to_dict(csv_dataframe)



fixing datetime
Processing single subject...
Processing single subject...


In [25]:
template_filepath = "mr.metadata.template.json"

# The template file holds only the per-field definitions; load them and
# wrap them as the "properties" of an object-type schema.
with open(template_filepath) as template_data:
    template_json = json.load(template_data)
JSON_template = {'type': 'object', 'properties': template_json}

In [22]:
def match_template_strings(input_dict, template):
    """
    Convert non-string values to strings where the template asks for it.

    For every key of ``input_dict`` that also appears in
    ``template['properties']`` with a ``"type"`` of ``"string"`` or
    ``"str"``, a value that is not already a string is replaced by
    ``convert_value(value)`` (which also removes a trailing ".0" from
    floats). Keys without a template entry, and template entries without
    a ``"type"``, are left untouched.

    :param input_dict: metadata dictionary; modified in place
    :type input_dict: dict
    :param template: schema dictionary with a 'properties' mapping
    :type template: dict
    :returns: the same ``input_dict`` object, after conversion
    :rtype: dict
    """
    properties = template.get('properties', {})
    # Iterate over a snapshot because values are reassigned in the loop
    for key, value in list(input_dict.items()):
        field_schema = properties.get(key)
        if not isinstance(field_schema, dict):
            continue
        # Membership test: `== ("string" or "str")` only ever compared
        # against "string", so the "str" spelling was never honoured.
        if field_schema.get("type") in ("string", "str") and not isinstance(value, str):
            input_dict[key] = convert_value(value)
    return input_dict

In [23]:
# Coerce the values of xls_dict that the template types as strings.
# xls_dict is modified in place; the returned dict is shown as the cell output.
match_template_strings(xls_dict,JSON_template)

DeviceSerialNumber
PatientWeight
ImageType
SeriesNumber
AcquisitionTime
AcquisitionNumber
SpacingBetweenSlices
SAR
FlipAngle
PartialFourier
BaseResolution
ShimSetting
TxRefAmp
PhaseResolution
MultibandAccelerationFactor
PercentPhaseFOV
EchoTrainLength
PhaseEncodingSteps
AcquisitionMatrixPE
ReconMatrixPE
BandwidthPerPixelPhaseEncode
EffectiveEchoSpacing
DerivedVendorReportedEchoSpacing
TotalReadoutTime
PixelBandwidth
DwellTime
SliceTiming
ImageOrientationPatientDICOM


{'Modality': 'MR',
 'MagneticFieldStrength': 3.0,
 'ImagingFrequency': 123.262,
 'Manufacturer': 'Siemens',
 'ManufacturersModelName': 'Prisma_fit',
 'InstitutionName': 'BCBL',
 'InstitutionalDepartmentName': 'Department',
 'InstitutionAddress': 'Mikeletegi_69_San_Sebastian_District_ES_20009',
 'DeviceSerialNumber': '167004',
 'StationName': 'MRC35414',
 'SeriesInstanceUID': '1.3.12.2.1107.5.2.43.167004.2019030318311045850440175.0.0.0',
 'StudyInstanceUID': '1.3.12.2.1107.5.2.43.167004.30000019022812230075700000061',
 'ReferringPhysicianName': 'Pedro',
 'StudyID': 1.0,
 'PatientName': '14_MAGNO_7806',
 'PatientID': 'MAGNO',
 'PatientBirthDate': '1995-10-10',
 'PatientSex': 'M',
 'PatientWeight': '80',
 'BodyPartExamined': 'BRAIN',
 'PatientPosition': 'HFS',
 'ProcedureStepDescription': 'CRANEO_FUNCIONAL',
 'SoftwareVersions': 'syngo_MR_E11',
 'MRAcquisitionType': '2D',
 'SeriesDescription': 'RetinotopicAtlas_Run1',
 'ProtocolName': 'RetinotopicAtlas_Run1',
 'ScanningSequence': 'EP',
 '

In [41]:
# Spot check: show the Python type currently held for StudyInstanceUID
type(xls_dict['StudyInstanceUID'])

str

In [26]:
# The template marks mandatory fields with a boolean "required" flag on
# each property. Gather those field names into one list and strip the flag
# from the property schemas (in JSON Schema "required" is a list of names).
#
# Seeding the list from any "required" list already on JSON_template keeps
# this cell idempotent: on a re-run the flags are already gone, and the
# previously collected names must not be replaced by an empty list.
required_array = list(JSON_template.get('required', []))
for key, property_schema in JSON_template['properties'].items():
    # pop() removes the flag whatever its value, so a "required": false
    # entry does not linger inside the property schema either.
    is_required = property_schema.pop('required', None) is True
    if is_required and key not in required_array:
        print(key)
        required_array.append(key)

Modality
MagneticFieldStrength
SeriesInstanceUID
StudyInstanceUID
StudyID
PatientID
BodyPartExamined
MRAcquisitionType
SeriesDescription
ScanningSequence
SliceThickness
EchoTime
RepetitionTime
ClinicalTrialTimePointID
ClinicalTrialTimePointDescription
PixelSpacing
InversionTime


In [27]:
# Attach the collected field names as the schema's top-level "required" list
JSON_template['required'] = required_array

In [31]:
def validate_against_template(input_dict, template, error_log_path):
    """
    Validate a metadata dictionary against a JSON schema template.

    Every validation error is logged through ``log.error`` and collected
    as a dictionary with the keys:

    * ``error_type``    - name of the schema keyword that failed
    * ``error_message`` - the validator's message
    * ``item``          - ``'info.<field>'`` for the offending field
      (plain ``'info'`` when the error concerns the top-level object)
    * ``error_value``   - the value that failed (not set for
      "required" errors)
    * ``schema``        - the sub-schema that was violated (not set for
      "required" errors, where it would be the entire template)

    The collected list is also written to ``error_log_path`` as JSON.

    :param input_dict: the metadata to validate
    :type input_dict: dict
    :param template: JSON schema used to build a Draft 7 validator
    :type template: dict
    :param error_log_path: path of the JSON file to write the errors to
    :type error_log_path: str
    :returns: validation_errors, one dictionary per validation error
    :rtype: list
    """
    validator = jsonschema.Draft7Validator(template)
    validation_errors = []
    # Sorting on the string form gives a stable, reproducible order
    for error in sorted(validator.iter_errors(input_dict), key=str):
        tmp_dict = {
            'error_type': error.validator,
            'error_message': error.message,
        }
        log.error(error.message)
        if error.validator == "required":
            # A "required" error is raised against the parent object, so
            # the missing field name is only available from the message
            # ("'<field>' is a required property").
            tmp_dict['item'] = 'info.' + error.message.split("'")[1]
        else:
            tmp_dict['error_value'] = error.instance
            # error.path is empty when the top-level object itself fails;
            # read its last element without popping so the error object
            # is left intact and an empty path cannot raise IndexError.
            if error.path:
                tmp_dict['item'] = 'info.' + str(error.path[-1])
            else:
                tmp_dict['item'] = 'info'
            tmp_dict['schema'] = error.schema
        validation_errors.append(tmp_dict)

    with open(error_log_path, 'w') as outfile:
        # default=str: values coming from pandas may be numpy scalars that
        # the json module cannot serialize; write them as text instead.
        json.dump(validation_errors, outfile, separators=(', ', ': '),
                  sort_keys=True, indent=4, default=str)
    return validation_errors

In [14]:
# Display the collected validation errors.
# NOTE(review): no visible cell assigns `validation_errors` at notebook
# level (it only exists as a local inside validate_against_template), so
# this presumably fails with NameError on a fresh kernel - confirm.
validation_errors

[{'error_type': 'pattern',
  'error_message': "'1.3.12.2.1107.5.2.43.167004.2019030318311045850440175.0.0.0' does not match '/^\\\\d{1}\\\\.\\\\d{1}(\\\\.\\\\d{1,55})?$/'",
  'error_value': '1.3.12.2.1107.5.2.43.167004.2019030318311045850440175.0.0.0',
  'item': 'info.SeriesInstanceUID',
  'schema': {'type': 'string',
   'pattern': '/^\\d{1}\\.\\d{1}(\\.\\d{1,55})?$/',
   'required': True,
   'description': 'SeriesInstanceUID must match above pattern. Example. 1.3.12.2.1107.5.2.43.167004.2019030318311045850440175.0.0.0'}},
 {'error_type': 'pattern',
  'error_message': "'1.3.12.2.1107.5.2.43.167004.30000019022812230075700000061' does not match '/^\\\\d{1}\\\\.\\\\d{1}(\\\\.\\\\d{1,55})?$/'",
  'error_value': '1.3.12.2.1107.5.2.43.167004.30000019022812230075700000061',
  'item': 'info.StudyInstanceUID',
  'schema': {'type': 'string',
   'pattern': '/^\\d{1}\\.\\d{1}(\\.\\d{1,55})?$/',
   'required': True,
   'description': 'StudyInstanceUID is an optional field. Example. 1.3.12.2.1107.5.

In [28]:
# Display the assembled schema
JSON_template

{'type': 'object',
 'properties': {'Modality': {'type': 'string',
   'pattern': '/MR/',
   'description': "Modality must match 'MR'"},
  'MagneticFieldStrength': {'type': 'number',
   'pattern': '/^\\d{0,2}(\\.\\d{1})?$/',
   'description': 'MagneticFieldStrength must match /^\\d{0,2}(\\.\\d{1})?$/. Example: 3, 3.0 or 11.7'},
  'ImagingFrequency': {'type': 'number',
   'description': 'ImagingFrequency is an optional field. If proivded it must be a number. Example. 123.262'},
  'Manufacturer': {'type': 'string',
   'description': 'Manufacturer is an optional field. Example. Siemens'},
  'ManufacturersModelName': {'type': 'string',
   'description': 'ManufacturersModelName is an optional field. Example. Prisma_fit'},
  'InstitutionName': {'type': 'string',
   'description': 'InstitutionName is an optional field. Example. Stanford University'},
  'InstitutionalDepartmentName': {'type': 'string',
   'description': 'InstitutionalDepartmentName is an optional field. Example. Psychiatary'},
 

In [29]:
# Save the assembled schema as the revised template file
with open("revised.mr.metadata.template.json", 'w') as outfile:
    json.dump(JSON_template, outfile, separators=(', ', ': '), sort_keys=True, indent=4)
# Point template_filepath at the original template again once the write is done
template_filepath = "mr.metadata.template.json"



In [30]:
# Load the revised template from disk as template_json2
template_filepath = 'revised.mr.metadata.template.json'
with open(template_filepath) as template_data:
    template_json2 = json.load(template_data)

In [34]:
# Validate the Excel-derived metadata against the reloaded template; the
# errors are logged, written to "test.error.log.json" and returned.
# NOTE(review): the logged messages report values such as 'MR' failing the
# pattern '/MR/' - the surrounding slashes look like they are being matched
# as literal characters; confirm whether the template patterns should drop them.
validate_against_template(xls_dict,template_json2,"test.error.log.json")

ERROR:import_dicom_metadata:'1.3.12.2.1107.5.2.43.167004.2019030318311045850440175.0.0.0' does not match '/^\\d{1}\\.\\d{1}(\\.\\d{1,55})?$/'
ERROR:import_dicom_metadata:'1.3.12.2.1107.5.2.43.167004.30000019022812230075700000061' does not match '/^\\d{1}\\.\\d{1}(\\.\\d{1,55})?$/'
ERROR:import_dicom_metadata:'2D' does not match '/[A-Za-z0-9-_.,!"\'/$]*/'
ERROR:import_dicom_metadata:'BRAIN' does not match '/^[A-Za-z]{1,}/'
ERROR:import_dicom_metadata:'ClinicalTrialTimePointDescription' is a required property
ERROR:import_dicom_metadata:'ClinicalTrialTimePointID' is a required property
ERROR:import_dicom_metadata:'EP' does not match '/[A-Za-z0-9-_.,!"\'/$]*/'
ERROR:import_dicom_metadata:'InversionTime' is a required property
ERROR:import_dicom_metadata:'MAGNO' does not match '/^[A-Za-z0-9]{1+}/'
ERROR:import_dicom_metadata:'MR' does not match '/MR/'
ERROR:import_dicom_metadata:'PixelSpacing' is a required property
ERROR:import_dicom_metadata:'RetinotopicAtlas_Run1' does not match '/^[A-Z

[{'error_type': 'pattern',
  'error_message': "'1.3.12.2.1107.5.2.43.167004.2019030318311045850440175.0.0.0' does not match '/^\\\\d{1}\\\\.\\\\d{1}(\\\\.\\\\d{1,55})?$/'",
  'error_value': '1.3.12.2.1107.5.2.43.167004.2019030318311045850440175.0.0.0',
  'item': 'info.SeriesInstanceUID',
  'schema': {'description': 'SeriesInstanceUID must match above pattern. Example. 1.3.12.2.1107.5.2.43.167004.2019030318311045850440175.0.0.0',
   'pattern': '/^\\d{1}\\.\\d{1}(\\.\\d{1,55})?$/',
   'type': 'string'}},
 {'error_type': 'pattern',
  'error_message': "'1.3.12.2.1107.5.2.43.167004.30000019022812230075700000061' does not match '/^\\\\d{1}\\\\.\\\\d{1}(\\\\.\\\\d{1,55})?$/'",
  'error_value': '1.3.12.2.1107.5.2.43.167004.30000019022812230075700000061',
  'item': 'info.StudyInstanceUID',
  'schema': {'description': 'StudyInstanceUID is an optional field. Example. 1.3.12.2.1107.5.2.43.167004.30000019022812230075700000061',
   'pattern': '/^\\d{1}\\.\\d{1}(\\.\\d{1,55})?$/',
   'type': 'string'