## Imports

In [13]:
import os
import re
import json
import jsonschema
import pandas as pd
import numpy as np

## Functions

In [83]:
def import_file(filepath):
    """
    This function determines the file data type and appropriately
    imports the file as a pandas object
    It also partially accounts for the bug where date
    parsing in pandas cannot be set to False for pd.read_excel()

    :param filepath: path to the excel file
    :type filepath: str
    :returns:  data_frame, a pandas dataframe without date
    """
    if filepath.endswith(('.xls', '.xlsx')):
        # read in dataframe from xls/xlsx
        dataframe = pd.read_excel(filepath)

        # if there are no datetimes series found, return  the
        if dataframe.select_dtypes('datetime64').empty:
            return dataframe
        # otherwise, convert to string and replace any NaT with NaN
        else:
            print('fixing datetime')
            # select the offending columns
            dtcolumns = dataframe.select_dtypes('datetime64').columns
            # loop over the columns
            for column in dtcolumns:
                # convert to string type
                dataframe[column] = dataframe[column].astype('str')
                # handle NaTs
                dataframe[column] = dataframe[column].replace('NaT', np.nan)
            return dataframe
    elif filepath.endswith('.csv'):
        dataframe = pd.read_csv(filepath)
        return dataframe
    else:
        print('File type is not supported')


def convert_value(value):
    """
    converts all objects to strings and handles trailing .0 on ints
    since pandas likes to spit out floats
    """
    if type(value) == str:
        return value
    else:
        # convert to string
        value = str(value)
        # clip trailing .0
        value = re.sub('\.0+$', '',value)
        return value


def export_to_dict(dataframe):
    """
    This function exports a pandas dataframe object
    to a dictionary

    :param dataframe: a pandas DataFrame object
    :type filepath: DataFrame
    :returns:  output_dic (dict) - the output object to be converted to json

    """
    # for now, assume subject is the first column
    subject_column = dataframe.columns[0]
    # get the count of the non-null subjects
    subject_count = dataframe[subject_column].count()
    # check that the above returned a value
    if type(subject_count) != np.int64:
        print("subject_count is not a valid integer. Dictionary not created.")
    # if there's only one subject, account for possibility of list columnns
    elif subject_count == 1:
        print("Processing single subject...")
        output_dict = {}
        for column in dataframe:
            if dataframe[column].count() > 1:
                output_dict[column] = dataframe[column].dropna().tolist()
            else:
                value = dataframe[column][0]
                #value = convert_value(value)
                output_dict[column] = value
        # output_json = json.dumps(output_dict)
        return output_dict
    elif subject_count < 1:
        print("No subjects in DataFrame. Dictionary not created.")
    else:
        print("Processing multiple subjects...")
        output_dict = {}
        dataframe = dataframe.astype("str")
        for index, row in dataframe.iterrows():
            key = row[subject_column]
            value = row.to_dict()
            output_dict[key] = value
        # output_json = json.dumps(output_dict)
        return output_dict


In [84]:
xls_filepath = "input.metadata.xlsx"
csv_filepath = "input.metadata.csv"
xls_dataframe = import_file(xls_filepath)
csv_dataframe = import_file(csv_filepath)
xls_dict = export_to_dict(xls_dataframe)
csv_dict = export_to_dict(csv_dataframe)



fixing datetime
Processing single subject...
Processing single subject...


In [88]:
template_filepath = "mr.metadata.template.json"

with open(template_filepath) as template_data:
    template_json = json.load(template_data)
JSON_template = {}
JSON_template['type'] = 'object'
JSON_template['properties'] = template_json

In [28]:
jsonschema.validate(xls_dict,template_json)

In [41]:
type(xls_dict['StudyInstanceUID'])

str

In [89]:
required_array = []
for key in JSON_template['properties']:
    try:
        if JSON_template['properties'][key]['required'] is True:
            print(key)
            required_array.append(key)
            JSON_template['properties'][key].pop('required', None)
    except KeyError:
        continue

Modality
MagneticFieldStrength
SeriesInstanceUID
StudyInstanceUID
StudyID
PatientID
BodyPartExamined
MRAcquisitionType
SeriesDescription
ScanningSequence
SliceThickness
EchoTime
RepetitionTime
ClinicalTrialTimePointID
ClinicalTrialTimePointDescription
PixelSpacing
InversionTime


In [71]:
JSON_template['required'] = required_array

In [97]:
jsonschema.validate(csv_dict,JSON_template,fail_fast=False)

TypeError: __init__() got an unexpected keyword argument 'fail_fast'

In [99]:
dir(jsonschema)

['Draft3Validator',
 'Draft4Validator',
 'ErrorTree',
 'FormatChecker',
 'FormatError',
 'RefResolutionError',
 'RefResolver',
 'SchemaError',
 'ValidationError',
 '__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__path__',
 '__spec__',
 '__version__',
 '_format',
 '_utils',
 '_validators',
 '_version',
 'compat',
 'draft3_format_checker',
 'draft4_format_checker',
 'exceptions',
 'validate',
 'validators']

In [78]:
JSON_template['properties']['Modality']

{'type': 'string',
 'pattern': '/MR/',
 'description': "Modality must match 'MR'"}

In [96]:

v = jsonschema.Draft7Validator(JSON_template)
errors = sorted(v.iter_errors(csv_dict), key=lambda e: e.path)

AttributeError: module 'jsonschema' has no attribute 'Draft7Validator'