In [3]:
from collections.abc import MutableMapping
from copy import deepcopy
import json

In [4]:
# Load the sample payload. JSON text is UTF-8 by specification, so the encoding
# is pinned explicitly instead of relying on the platform's locale default.
with open('test_data/lambda_project_testdata_long_allyears.json', 'r', encoding='utf-8') as f:
    members = json.load(f)

In [30]:
class JsonHash:
    '''
    JsonHash is a Class enabling the processing and querying of JSON data.
    ROUGH example of how we could work with the hash table.

    The JSON document is flattened into ``hash_table``: a dict whose keys are
    dot-separated paths (list positions appear as numeric path components) and
    whose values are the leaf values, e.g.
        'contents.0.claim.3.claim_line.1.procedure_code' -> '99214'
    '''
    def __init__(self, json_data, p_sparse=False):
        '''
        json_data: the parsed JSON document (dict like object)
        p_sparse: when True, leaf values that are None are left out of the
                  hash table
        '''
        # store json
        self.json_data = json_data
        # attribute to store the p_sparse parameter
        self.make_sparse = p_sparse
        # placeholder for query results; from_keys() replaces it with the dict
        # returned by find_or(), so it starts out as an empty dict as well
        self.results = {}
        # hash the json object
        self.hash_table = self.hash_json(json_data)
        # reference for parsing the hash key: for each data element, the index
        # of the last path component of the record that owns it
        # (1 -> 'contents.<n>', 3 -> '...claim.<n>', 5 -> '...claim_line.<n>')
        # TODO: rename to json_map
        self.levels = {
            "member_id": 1,
            "member_age": 1,
            "member_sex": 1,
            "claim": 1,
            "claim_id": 3,
            "claim_type": 3,
            "type_of_bill": 3,
            "admission_date": 3,
            "discharge_date": 3,
            "taxonomy_code": 3,
            "place_of_service": 3,
            "principle_diagnosis": 3,
            "diagnosis_codes": 3,
            "drg": 3,
            "drg_severity": 3,
            "drg_type": 3,
            "claim_line": 3,
            "line_number": 5,
            "from_date": 5,
            "thru_date": 5,
            "revenue_code": 5,
            "procedure_code": 5,
            "ndc_code": 5,
            "quantity": 5,
            "allowed_amount": 5
        }

    def hash_json(self, p_dictionary, p_parent_key=False, p_sparse=False):
        '''
        Gary's awesome JSON hashing function

        Recursively flattens a nested dict/list structure into a single dict
        keyed by dot-separated paths.

        p_dictionary: dict like object to flatten
        p_parent_key: path prefix for the keys of p_dictionary (falsy = none)
        p_sparse: True forces None leaves to be dropped for this call; with
                  the default the instance setting (make_sparse) applies
        returns: dict of {path: leaf value}. Empty dicts and empty lists are
                 recorded with a None value, also in sparse mode.
        '''
        _sparse = p_sparse is True or self.make_sparse is True
        _items = []
        for _key, _value in p_dictionary.items():
            _new_key = str(p_parent_key) + '.' + _key if p_parent_key else _key
            if isinstance(_value, MutableMapping):
                # it's a dictionary
                if not _value:
                    # empty dictionary
                    _items.append((_new_key, None))
                else:
                    # not empty, recurse!
                    _items.extend(self.hash_json(_value, _new_key, _sparse).items())
            elif isinstance(_value, list):
                if _value:
                    # wrap every item as {index: item} so the list position
                    # becomes one more component of the path
                    for _k, _v in enumerate(_value):
                        _items.extend(self.hash_json({str(_k): _v}, _new_key, _sparse).items())
                else:
                    # empty list
                    _items.append((_new_key, None))
            else:
                # not dict or list, so append key, value
                if _sparse and _value is None:
                    continue
                _items.append((_new_key, _value))
        return dict(_items)

    def describe_json(self):
        '''
        TODO: Pretty print the structure of the JSON object (json_map)
        Not implemented yet; returns None.
        '''
        return

    def trimLastElement(self, key):
        '''
        Removes the last element from the hash key
        '''
        return '.'.join(key.split('.')[:-1])

    def mapStr(self, obj):
        '''
        Coerce list items to str
        '''
        return list(map(str, obj))

    def getLastElement(self, key):
        '''
        Returns the last element of a key
        '''
        return key.split('.')[-1]

    def _getElementName(self, key):
        '''
        Returns the data element a hash key belongs to: the last element of
        the key, or the one before it when the last element is an array index
        (ex 'contents.0.claim.0.diagnosis_codes.2' -> 'diagnosis_codes')
        '''
        _parts = key.split('.')
        if len(_parts) > 1 and _parts[-1].isnumeric():
            return _parts[-2]
        return _parts[-1]

    def getKeyDepth(self, key):
        '''
        returns depth of a given key ('member', 'claim', 'claim_line', ...) or
        None when the key is too short to sit inside a record (ex 'batch_id')
        This is messy and highly specific to this work. We would need labels for the
        various levels of the JSON to do this dynamically.
        '''
        _key = key.split('.')

        # if the last element is numeric, it's an array index, so go further back in the key
        # ex ['contents', '0', 'claim', '4', 'claim_line', '3', 'diagnosis_codes', '5']
        _offset = 4 if _key[-1].isnumeric() else 3
        if len(_key) < _offset:
            return None
        return _key[-_offset].replace('contents', 'member')

    def getMemberComponent(self, key):
        '''
        get member portion of key (its first two elements, ex 'contents.0')
        '''
        # slicing never raises; shorter keys are simply returned whole
        return '.'.join(key.split('.')[:2])

    def getClaimComponent(self, key):
        '''
        get claim portion of key (its first four elements, ex 'contents.0.claim.3')
        '''
        return '.'.join(key.split('.')[:4])

    def getClaimLineComponent(self, key):
        '''
        get claim line portion of key (its first six elements,
        ex 'contents.0.claim.3.claim_line.3')
        '''
        return '.'.join(key.split('.')[:6])

    def getKeyInfo(self, key):
        '''
        key: a key from the hash table
        returns: a dictionary with information about the key of the form:
            {
                key: 'contents.0.claim.3.claim_line.3.procedure_code',
                depth: 'claim_line',
                member: 'contents.0',
                claim: 'contents.0.claim.3',
                claim_line: 'contents.0.claim.3.claim_line.3'
            }
        depth is None for keys getKeyDepth cannot place (ex 'batch_id').
        TODO: Once we have a schema generator, let's generalize this using the schema.
        '''
        _depth = self.getKeyDepth(key)
        # both 'claim' and 'claim_line' keys live inside a claim
        _in_claim = _depth is not None and _depth.startswith('claim')
        return {
            'key': key,
            'depth': _depth,
            'member': self.getMemberComponent(key),
            'claim': self.getClaimComponent(key) if _in_claim else None,
            'claim_line': self.getClaimLineComponent(key) if _depth == 'claim_line' else None
            }

    def find_or(self, search_values):
        '''
        search_values: dictionary of the form {data_element: [list of associated values]}
                       A single value may be given without the list. Values are
                       compared as strings, so 11 and '11' are equivalent.
        Return a dictionary of the form:
                {
                    data_element: [
                        {
                            key: 'contents.0.claim.5.claim_id',
                            depth: 'claim',
                            member: 'contents.0',
                            claim: 'contents.0.claim.5',
                            claim_line: None
                        }
                    ]
                }
        Raises TypeError when search_values is not a dict.
        TODO: Can we chain this with another call to find?
                EX: find professional claims with 99214 and a specific taxonomy code
                    find(['P']).find(['99214']).find(['207QA0000X'])
        '''
        if not isinstance(search_values, dict):
            raise TypeError('Malformed parameter: search_values must be dict like object')

        _results_dict = {}
        for _element, _values in search_values.items():
            # a bare string would otherwise be searched character by character
            if isinstance(_values, str) or not hasattr(_values, '__iter__'):
                _values = [_values]
            # stringify the wanted values once, not once per table entry
            _wanted = set(self.mapStr(_values))
            _results_dict[_element] = [
                self.getKeyInfo(_k)
                for _k, _v in self.hash_table.items()
                if str(_v) in _wanted and self._getElementName(_k) == _element
            ]

        return _results_dict

    def find_and(self):
        '''
        Find the intersection of keys in the results object
        Ex:
        Matches the professional claim query:
            {'contents.0.claim.35.claim_type': 'P'}
        Matches the Office Visit query:
            {'contents.0.claim.35.claim_line.0.procedure_code': '99214'}
        Intersecting portion of the key is this claim for this member. It can be used
        to find the corresponding service date or provider taxonomy, etc.
            {'contents.0.claim.35'}
        TODO: not implemented yet; returns None.
        '''

    def from_keys(self, values):
        '''
        values: dictionary of the form {data_element: [list of associated values]},
                passed straight to find_or()
        Stores the find_or() result in self.results and returns self so the
        call can be chained with get_element().
        TODO: Can we chain this with another call to from_values?
                EX: find professional claims with 99214 and a specific taxonomy code
                    find(['P']).find(['99214']).find(['207QA0000X'])
        '''

        self.results = self.find_or(values)
        return self

    def get_element(self, element):
        '''
        Return the desired data element; allows chaining with from_keys()
        This could accept a list.

        element: name of a data element listed in self.levels
        returns: list of {hash key: value}, one entry per distinct record
                 (member / claim / claim line) that owns a match stored in
                 self.results. Matches for which the element is not present
                 in the hash table are skipped.
        Raises ValueError when element is not listed in self.levels.
        '''
        if element not in self.levels:
            raise ValueError('Unknown data element: ' + str(element))
        _ix = self.levels[element]

        # self.results maps each searched data element to its key-info dicts
        _matches = self.results.values() if isinstance(self.results, dict) else []

        _results = []
        _keys = set()
        for _infos in _matches:
            for _info in _infos:
                # keep the part of the matched key that identifies the record
                # owning `element`, then point at that element
                _parts = _info['key'].split('.')[:_ix + 1]
                _new_key = '.'.join(_parts + [element])
                # prevent duplicates in the output
                if _new_key in _keys:
                    continue
                _keys.add(_new_key)
                if _new_key in self.hash_table:
                    _results.append({_new_key: self.hash_table[_new_key]})
        return _results
            

In [31]:
# Flatten the loaded JSON into a JsonHash. p_sparse=True is stored as
# make_sparse, which makes hash_json leave out leaf values that are None.
table = JsonHash(members, p_sparse=True)

In [5]:
# Working code to get a set of all possible keys

def _key_template(hash_key):
    '''
    Reduce a hash key to its structural template: every array index becomes
    '0' and a trailing array index is dropped, ex
        'contents.3.claim.12.diagnosis_codes.4' -> 'contents.0.claim.0.diagnosis_codes'
    '''
    parts = hash_key.split('.')
    if parts[-1].isnumeric():
        # a trailing index only says "this element is a list"
        parts = parts[:-1]
    return '.'.join('0' if part.isnumeric() else part for part in parts)


# Only the keys are read, so no copy of the table is needed. Sorting the
# distinct templates gives the same result on every run.
test = [template.split('.') for template in sorted({_key_template(k) for k in table.hash_table})]

In [24]:
# Show the flattened table: dot-separated path -> leaf value
table.hash_table

{'batch_id': 'long_allyears',
 'sequence': 1,
 'contents.0.member_id': 'mbr_002313701',
 'contents.0.member_age': 48,
 'contents.0.member_sex': 'F',
 'contents.0.claim.0.claim_id': 'clm_701224700',
 'contents.0.claim.0.claim_type': 'P',
 'contents.0.claim.0.admission_date': '2019-02-12',
 'contents.0.claim.0.discharge_date': '2019-02-12',
 'contents.0.claim.0.taxonomy_code': '363A00000X',
 'contents.0.claim.0.place_of_service': 11,
 'contents.0.claim.0.principle_diagnosis': 'F909',
 'contents.0.claim.0.diagnosis_codes.0': 'F909',
 'contents.0.claim.0.claim_line.0.line_number': 1,
 'contents.0.claim.0.claim_line.0.from_date': '2019-02-12',
 'contents.0.claim.0.claim_line.0.thru_date': '2019-02-12',
 'contents.0.claim.0.claim_line.0.procedure_code': '99215',
 'contents.0.claim.0.claim_line.0.quantity': 1,
 'contents.0.claim.0.claim_line.0.allowed_amount': 174.25,
 'contents.0.claim.0.claim_line.1.line_number': 2,
 'contents.0.claim.0.claim_line.1.from_date': '2019-02-12',
 'contents.0.cl

In [35]:
# Query three data elements in one call: office-visit procedure codes, one
# principal diagnosis ('principle_diagnosis' is the spelling used in the data)
# and one member id. find_or() returns one list of key-info dicts per element.
results = table.find_or(
    {
        'procedure_code': ['99212', '99213', '99214', '99215'],
        'principle_diagnosis': ['F909'],
        'member_id': ['mbr_002313701'],
    }
)
results.keys()
# The keys() call above is not echoed (only the last expression of a cell is);
# the result has one key for each data element queried.
results['member_id']

[{'key': 'contents.0.member_id',
  'depth': 'member',
  'member': 'contents.0',
  'claim': None,
  'claim_line': None},
 {'key': 'contents.1.member_id',
  'depth': 'member',
  'member': 'contents.1',
  'claim': None,
  'claim_line': None}]

In [16]:
# finding values with high frequency for testing
from collections import Counter

# JsonHash has no values() method; the flattened leaf values live in hash_table
values = Counter(table.hash_table.values())
{k:v for k,v in values.items() if v>50}

{1: 10494,
 'P': 2724,
 '363A00000X': 124,
 11: 2043,
 'F909': 96,
 2: 1796,
 '96127': 56,
 0: 782,
 3: 854,
 'R': 450,
 '2019-04-09': 128,
 90: 127,
 '2019-04-02': 60,
 60: 85,
 '90837': 258,
 91.15: 84,
 '99214': 429,
 129.2: 54,
 '2019-05-08': 84,
 '207Q00000X': 219,
 '99213': 332,
 '2019-05-07': 204,
 '97110': 68,
 'Z0000': 127,
 '80061': 74,
 4: 643,
 5: 403,
 6: 289,
 '36415': 263,
 '2019-06-17': 142,
 '2019-05-13': 64,
 30: 129,
 '2019-06-14': 78,
 20: 90,
 '2019-03-04': 56,
 '2018-02-06': 86,
 'E039': 199,
 '83036': 68,
 '84443': 124,
 'K219': 66,
 'Z1231': 120,
 '2018-11-30': 68,
 'Z23': 95,
 '363LF0000X': 156,
 'I': 504,
 '2019-06-23': 236,
 '2019-06-27': 64,
 '282N00000X': 522,
 '0250': 67,
 19: 54,
 '0320': 64,
 7: 167,
 8: 141,
 9: 122,
 '0636': 166,
 10: 126,
 23: 60,
 '2019-07-29': 60,
 '207X00000X': 54,
 'S6722XD': 108,
 '97140': 98,
 '97112': 65,
 '2019-07-08': 84,
 '2019-07-05': 76,
 '2019-10-29': 88,
 '2019-09-12': 540,
 '2019-07-01': 112,
 'F419': 82,
 '2019-09-19':

In [16]:
# Path into the nested JSON, outermost first: strings are dict keys, ints are list indices
location_array = ['contents', 0, 'claim', 0, 'claim_line', 0, 'procedure_code']

In [23]:
# get a value from nested JSON data by following a location array
def getJsonValue(d, location_array):
    '''
    Follow location_array through the nested structure d and return what it
    points at.

    d: nested dicts / lists (parsed JSON)
    location_array: sequence of dict keys and list indices, outermost first;
                    an empty sequence returns d itself
    Raises KeyError / IndexError / TypeError (from the failing lookup) when
    the path does not exist in d.
    '''
    for loc in location_array:
        d = d[loc]
    return d

In [22]:
# Same lookup as members['contents'][0]['claim'][0]['claim_line'][0]['procedure_code']
getJsonValue(members, location_array)

'99215'