# Investigating the shape of unstructured JSON data
+ Looking through nested JSON objects for hidden keys prior to modeling

In [1]:
import json

In [2]:
def _loadJsonLines(path: str) -> list:
    """Parse a file holding one JSON document per line.

    Args:
        path: Location of the file to read.

    Returns:
        The parsed documents, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Pin the encoding so decoding does not depend on the platform locale
    with open(path, encoding='utf-8') as f:
        # Blank lines (e.g. a trailing newline at end of file) are not JSON; skip them
        return [json.loads(line) for line in f if line.strip()]


brands = _loadJsonLines('../data/brands.json')
receipts = _loadJsonLines('../data/receipts.json')
users = _loadJsonLines('../data/users.json')

In [3]:
def getKeys(jsonList: list) -> set:
    """Return every top-level key that appears in at least one row.

    The resulting set is also printed as ``keys=...`` before being returned.

    Args:
        jsonList: Parsed JSON records, one ``dict`` per row.

    Returns:
        The union of all rows' keys; an empty set when there are no rows.
    """
    keys = set()
    for row in jsonList:
        # Fold this row's keys into the running union
        keys |= set(row.keys())

    print(f"{keys=}")
    return keys

def getNestedKeysTypes(jsonList: list) -> dict:
    """Find the top-level keys whose values are nested containers.

    Every key reported by ``getKeys`` is inspected across all rows. ``None``
    values (an explicit null or a key missing from a row) are ignored when
    working out a key's type. The result is also printed as
    ``nestedKeys=...`` before being returned.

    Args:
        jsonList: Parsed JSON records, one ``dict`` per row.

    Returns:
        A mapping of key name to ``dict`` or ``list`` for each key whose
        non-null values are all of that container type.

    Raises:
        AssertionError: If a key holds more than one non-null type.
    """
    keys = getKeys(jsonList)
    nestedKeys = dict()
    for i in keys:
        # Exclude NoneType as a type
        keysTypes = {type(row.get(i)) for row in jsonList} - {type(None)}

        # A key that is null in every row has no type to report; this is not
        # a "multiple types" error, so skip it instead of failing.
        if not keysTypes:
            continue

        # Raised explicitly rather than via ``assert`` so the check still
        # runs when Python is started with -O.
        if len(keysTypes) > 1:
            raise AssertionError(f"Multiple types for column {i}")

        (keyType,) = keysTypes
        if keyType in (dict, list):
            nestedKeys[i] = keyType

    print(f"{nestedKeys=}")
    return nestedKeys

def getNestedKeysKeys(jsonList: list) -> dict:
    """Collect the keys found inside each nested top-level field.

    The nested fields are those reported by ``getNestedKeysTypes``. For a
    dict-valued field the keys of the dict itself are collected; for a
    list-valued field the keys of every dict element are collected. Null or
    missing values, and list elements that are not dicts, contribute nothing.
    One ``[ field ] keys: {...}`` line is printed per nested field.

    Args:
        jsonList: Parsed JSON records, one ``dict`` per row.

    Returns:
        A mapping of nested field name to the set of keys seen inside it
        across all rows.

    Raises:
        AssertionError: Propagated from ``getNestedKeysTypes`` when a field
            holds more than one non-null type.
    """
    out = dict()
    nestedKeys = getNestedKeysTypes(jsonList)
    for key in nestedKeys:
        uniqueNestedKeysKeys = set()

        for row in jsonList:
            value = row.get(key)

            # Nested keys are either type list or dict: a list contributes
            # each of its elements, a dict contributes itself.
            candidates = value if isinstance(value, list) else [value]

            for item in candidates:
                # Nulls and scalar elements have no keys to collect
                if isinstance(item, dict):
                    uniqueNestedKeysKeys.update(item.keys())

        print(f"[ {key} ] keys: {uniqueNestedKeysKeys}")
        out[key] = uniqueNestedKeysKeys
    return out

In [4]:
# Survey each dataset in turn. getNestedKeysKeys prints what it finds as it
# runs and returns a mapping of nested field name -> keys seen inside it.
print("BRANDS")
brandNestedKeysKeys = getNestedKeysKeys(jsonList=brands)

# The leading newline separates this report from the previous dataset's output
print("\nUSERS")
userNestedKeysKeys = getNestedKeysKeys(jsonList=users)

print("\nRECEIPTS")
receiptNestedKeysKeys = getNestedKeysKeys(jsonList=receipts)

BRANDS
keys={'category', 'topBrand', 'categoryCode', 'barcode', '_id', 'brandCode', 'cpg', 'name'}
nestedKeys={'_id': <class 'dict'>, 'cpg': <class 'dict'>}
[ _id ] keys: {'$oid'}
[ cpg ] keys: {'$id', '$ref'}

USERS
keys={'lastLogin', 'active', '_id', 'createdDate', 'signUpSource', 'role', 'state'}
nestedKeys={'lastLogin': <class 'dict'>, '_id': <class 'dict'>, 'createdDate': <class 'dict'>}
[ lastLogin ] keys: {'$date'}
[ _id ] keys: {'$oid'}
[ createdDate ] keys: {'$date'}

RECEIPTS
keys={'finishedDate', 'purchasedItemCount', 'modifyDate', '_id', 'pointsAwardedDate', 'bonusPointsEarned', 'purchaseDate', 'rewardsReceiptStatus', 'totalSpent', 'userId', 'createDate', 'bonusPointsEarnedReason', 'rewardsReceiptItemList', 'pointsEarned', 'dateScanned'}
nestedKeys={'finishedDate': <class 'dict'>, 'modifyDate': <class 'dict'>, '_id': <class 'dict'>, 'pointsAwardedDate': <class 'dict'>, 'purchaseDate': <class 'dict'>, 'createDate': <class 'dict'>, 'rewardsReceiptItemList': <class 'list'>, 'd