The following steps explore the generated Synthea dataset in order to understand what it contains. The intent is particularly to understand whether there are records within it which might form part of an asthma cohort.

In [9]:
import json
from collections import Counter
import pandas as pd

# Load every Patient resource from the newline-delimited JSON export.
# The encoding is pinned so the result does not depend on the platform default.
patients = []
with open('../output/synthea/filtered/fhir/Patient.ndjson', encoding='utf-8') as file1:
    for line in file1:
        line = line.strip()
        # Skip blank lines (e.g. a trailing empty line at the end of the file);
        # json.loads('') would raise a JSONDecodeError.
        if line:
            patients.append(json.loads(line))

# Tally of resource types seen across the whole file.
rTypes = Counter()

for p in patients:
    rTypes[p['resourceType']] += 1
    # Per-patient tally of extension URLs, covering the patient-level
    # extensions and the extensions on the first address.
    extCounter = Counter()
    # 'extension' and 'address' are read with .get so a patient lacking either
    # is reported with fewer entries instead of aborting the whole loop.
    for e in p.get('extension', []):
        extCounter[e['url']] += 1
    # Check address extensions
    addresses = p.get('address', [])
    if len(addresses) > 1:
        print(f'multiple addresses for patient {p["id"]}')
    if addresses:
        # Only the first address is inspected; additional ones are flagged above.
        for e in addresses[0].get('extension', []):
            extCounter[e['url']] += 1
    print(json.dumps(extCounter, indent=3))


print(json.dumps(rTypes, indent=3))

{
   "http://hl7.org/fhir/us/core/StructureDefinition/us-core-race": 1,
   "http://hl7.org/fhir/us/core/StructureDefinition/us-core-ethnicity": 1,
   "http://hl7.org/fhir/StructureDefinition/patient-mothersMaidenName": 1,
   "http://hl7.org/fhir/us/core/StructureDefinition/us-core-birthsex": 1,
   "http://hl7.org/fhir/StructureDefinition/patient-birthPlace": 1,
   "http://synthetichealth.github.io/synthea/disability-adjusted-life-years": 1,
   "http://synthetichealth.github.io/synthea/quality-adjusted-life-years": 1,
   "http://hl7.org/fhir/StructureDefinition/geolocation": 1
}
{
   "http://hl7.org/fhir/us/core/StructureDefinition/us-core-race": 1,
   "http://hl7.org/fhir/us/core/StructureDefinition/us-core-ethnicity": 1,
   "http://hl7.org/fhir/StructureDefinition/patient-mothersMaidenName": 1,
   "http://hl7.org/fhir/us/core/StructureDefinition/us-core-birthsex": 1,
   "http://hl7.org/fhir/StructureDefinition/patient-birthPlace": 1,
   "http://synthetichealth.github.io/synthea/disabi

In [7]:
# Display the first loaded Patient resource in full to inspect its structure.
patients[0]

{'resourceType': 'Patient',
 'id': '02dade42-9887-12c3-979e-5df8f35319f7',
 'meta': {'profile': ['http://hl7.org/fhir/us/core/StructureDefinition/us-core-patient']},
 'text': {'status': 'generated',
  'div': '<div xmlns="http://www.w3.org/1999/xhtml">Generated by <a href="https://github.com/synthetichealth/synthea">Synthea</a>.Version identifier: v2.7.0-180-gba768c62\n .   Person seed: 230200947382813706  Population seed: 1650380336059</div>'},
 'extension': [{'url': 'http://hl7.org/fhir/us/core/StructureDefinition/us-core-race',
   'extension': [{'url': 'ombCategory',
     'valueCoding': {'system': 'urn:oid:2.16.840.1.113883.6.238',
      'code': '2106-3',
      'display': 'White'}},
    {'url': 'text', 'valueString': 'White'}]},
  {'url': 'http://hl7.org/fhir/us/core/StructureDefinition/us-core-ethnicity',
   'extension': [{'url': 'ombCategory',
     'valueCoding': {'system': 'urn:oid:2.16.840.1.113883.6.238',
      'code': '2186-5',
      'display': 'Not Hispanic or Latino'}},
    {

We can see the data in more compact form via a DataFrame. This shows that the same set of twelve attributes is available for all ten patients.

In [23]:
# Build a presence matrix: one row per patient, one column per attribute,
# each cell holding how many times that attribute occurs on the patient.
patient_list = []
patient_ids = []
# Top-level Patient fields to look for; only the keys are used here.
std_attributes = {'gender':'','birthDate':'','maritalStatus':'','multipleBirthBoolean':''}
for p in patients:
    # rTypes is intentionally not incremented here: it is tallied once when the
    # file is loaded, and bumping it again made this cell non-idempotent.
    extCounter = Counter()
    # .get keeps a patient without extensions/addresses in the table (with
    # empty cells) instead of raising KeyError/IndexError.
    for e in p.get('extension', []):
        # Use the last path segment of the extension URL as the column name.
        ext = e['url'].split('/')[-1]
        extCounter[ext] += 1
    addresses = p.get('address', [])
    if addresses:
        # Only the first address is inspected.
        for e in addresses[0].get('extension', []):
            ext = e['url'].split('/')[-1]
            extCounter[ext] += 1
    for att in std_attributes:
        if att in p:
            extCounter[att] += 1
    patient_list.append(extCounter)
    patient_ids.append(p['id'])

# Lift the display limits so every row and column of the table is shown.
pd.set_option("display.max_rows", None, "display.max_columns", None)
pdf = pd.DataFrame(patient_list)
pdf

Unnamed: 0,us-core-race,us-core-ethnicity,patient-mothersMaidenName,us-core-birthsex,patient-birthPlace,disability-adjusted-life-years,quality-adjusted-life-years,geolocation,gender,birthDate,maritalStatus,multipleBirthBoolean
0,1,1,1,1,1,1,1,1,1,1,1,1.0
1,1,1,1,1,1,1,1,1,1,1,1,1.0
2,1,1,1,1,1,1,1,1,1,1,1,1.0
3,1,1,1,1,1,1,1,1,1,1,1,1.0
4,1,1,1,1,1,1,1,1,1,1,1,1.0
5,1,1,1,1,1,1,1,1,1,1,1,1.0
6,1,1,1,1,1,1,1,1,1,1,1,1.0
7,1,1,1,1,1,1,1,1,1,1,1,1.0
8,1,1,1,1,1,1,1,1,1,1,1,1.0
9,1,1,1,1,1,1,1,1,1,1,1,1.0


In [55]:
# Flatten each Patient resource into one table row and collect any coding
# discrepancies found along the way.
patient_list = []
patient_ids = []
discrepancies = []
# Top-level Patient fields to copy into the table. 'type' selects how the
# value is read: 'string'/'boolean' values are copied as-is, while 1 marks a
# coded field whose first coding supplies the code and whose system is checked
# against 'codingSystem'.
std_attributes = {'gender':{'type':'string'},
                  'birthDate':{'type':'string'},
                  'maritalStatus':{'type':1,
                                   'codingSystem':'http://terminology.hl7.org/CodeSystem/v3-MaritalStatus'},
                  'multipleBirthBoolean':{'type':'boolean'}
                 }
# Patient extensions to copy, keyed by the last path segment of the extension
# URL. 'type' is either the key holding the value directly on the extension
# ('valueDecimal', 'valueCode') or 'ombCategory', in which case 'corv' names
# the key of the sub-extension's valueCoding to keep ('code' or 'display').
ext_attributes = {
                  'disability-adjusted-life-years':{'type':'valueDecimal'},
                  'quality-adjusted-life-years':{'type':'valueDecimal'},
                  'us-core-birthsex':{'type':'valueCode'},
                  'us-core-race':{'type':'ombCategory','corv':'display'},
                  'us-core-ethnicity':{'type':'ombCategory','corv':'code'}
                 }
for p in patients:
    # rTypes is intentionally not incremented here: it is tallied once when the
    # file is loaded, and bumping it again made this cell non-idempotent.
    # A Counter is used as the row so that counted keys (address extensions,
    # unknown attribute types) can be incremented without initialisation.
    extCounter = Counter()
    extCounter['id'] = p['id']
    for e in p.get('extension', []):
        ext = e['url'].split('/')[-1]
        if ext not in ext_attributes:
            continue
        extType = ext_attributes[ext]['type']
        if extType in ['valueDecimal','valueCode']:
            extCounter[ext] = e[extType]
        elif extType in ['ombCategory']:
            subexts = [subext for subext in e['extension'] if subext['url'] == 'ombCategory']
            code_or_value = ext_attributes[ext]['corv']
            # Without an ombCategory sub-extension the column is left unset
            # for this patient rather than raising IndexError.
            if subexts:
                extCounter[ext] = subexts[0]['valueCoding'][code_or_value]

    addresses = p.get('address', [])
    if addresses:
        # Address extensions are only counted, and only on the first address.
        for e in addresses[0].get('extension', []):
            ext = e['url'].split('/')[-1]
            extCounter[ext] += 1
    for k, val in std_attributes.items():
        if k in p:
            if val['type'] in ['string','boolean']:
                extCounter[k] = p[k]
            elif val['type'] == 1:
                coding0 = p[k]['coding'][0]
                extCounter[k] = coding0['code']
                expectedCoding = val['codingSystem']
                if coding0['system'] != expectedCoding:
                    # 'issue' must be a string key; the bare name previously
                    # used here raised NameError on the first discrepancy.
                    discrepancies.append({p['resourceType']: p['id'],
                                          'issue': f'{k} not coded as {expectedCoding}'})
            else:
                # Unknown attribute type: just record that the field is present.
                extCounter[k] += 1
    patient_list.append(extCounter)
    patient_ids.append(p['id'])

# Lift the display limits so every row and column of the table is shown.
pd.set_option("display.max_rows", None, "display.max_columns", None)
pdf = pd.DataFrame(patient_list)
print(f'Discrepancies from expected/desired coding\n{discrepancies}')
pdf

Discrepancies from expected/desired coding
[]


Unnamed: 0,id,us-core-race,us-core-ethnicity,us-core-birthsex,disability-adjusted-life-years,quality-adjusted-life-years,geolocation,gender,birthDate,maritalStatus,multipleBirthBoolean
0,02dade42-9887-12c3-979e-5df8f35319f7,White,2186-5,F,1.307288,20.692712,1,female,1999-02-16,S,False
1,0ae5420d-3f37-fde9-3a4b-78b5115577ed,Black or African American,2186-5,M,0.257456,16.742544,1,male,2004-03-25,S,False
2,252bfba3-8213-0b45-ad39-2432fc6d5b93,White,2186-5,M,0.039343,6.960657,1,male,2014-03-25,S,False
3,53c41fa9-c9d9-8c2e-1649-7fb7915d8075,White,2186-5,F,0.137118,11.862882,1,female,2009-06-01,S,False
4,54a0dc8e-5c3c-e294-6619-486afc4f9444,White,2186-5,F,1.179539,49.820461,1,female,1970-04-08,M,False
5,5806c8d2-0ddc-4d62-d564-a34775ba6bb5,White,2186-5,M,11.278129,15.721871,1,male,1994-10-10,M,False
6,63004878-a5b8-c646-f14d-f0d087b1ff88,White,2186-5,M,0.137118,11.862882,1,male,2009-10-05,S,False
7,66fc8954-b148-59de-d6f4-24f97e44675d,White,2186-5,F,0.241671,16.758329,1,female,2004-08-25,S,False
8,7232b7b7-5a3a-bfc1-aac7-8bb7f10a531c,White,2135-2,M,4.824599,16.175401,1,male,2000-03-02,S,False
9,8afc8284-8d0f-c900-d9d0-a7541d7ed5da,White,2186-5,M,0.195617,6.804383,1,male,2014-02-09,S,False


In [56]:
# Export the flattened patient table as tab-separated text.
pdf.to_csv(path_or_buf='../output/synthea/tsv/Patient.txt', sep='\t')

In [3]:
import json
def _years_property():
    """Return a numeric property entry whose "$unit" is years."""
    return {"description": "", "type": "number", "$unit": "years"}


def _typed_property(json_type):
    """Return a plain property entry of the given JSON type."""
    return {"description": "", "type": json_type}


def _coded_property(coding_url):
    """Return a property entry carrying a "$coding" reference to coding_url."""
    return {"allOf": [{"description": ""}, {"$coding": coding_url}]}


# Data dictionary for the flattened Synthea patient table, declared against
# JSON Schema draft-07. Property order matches the order written to the file.
patient_schema = {
    "$id": "",
    "description": "Synthea patient",
    "$schema": "http://json-schema.org/draft-07/schema",
    "properties": {
        "disability-adjusted-life-years": _years_property(),
        "quality-adjusted-life-years": _years_property(),
        # gender carries only a description: no type or coding is declared.
        "gender": {"allOf": [{"description": ""}]},
        "birthDate": _typed_property("string"),
        "us-core-ethnicity": _coded_property(
            "http://hl7.org/fhir/us/core/StructureDefinition/us-core-ethnicity"),
        "maritalStatus": _coded_property(
            "http://terminology.hl7.org/CodeSystem/v3-MaritalStatus"),
        "multipleBirthBoolean": _typed_property("boolean"),
        "us-core-race": _coded_property(
            "http://hl7.org/fhir/us/core/StructureDefinition/us-core-race"),
        "us-core-birthsex": _coded_property(
            "http://hl7.org/fhir/us/core/StructureDefinition/us-core-birthsex"),
    },
}

# Serialise the data dictionary next to the other DataConnect schema files.
with open('../output/synthea/DataConnect_schema/bigquery.synthea.patient.data_dict.json', 'w') as outfile:
    outfile.write(json.dumps(patient_schema))
  