# Sample query library usage
This notebook loads data from Parquet files generated from the "big" test
database (i.e., the `openmrs-fhir-mysql-ocl-big` docker image). This dataset
has about 7,892 Patients, 396,650 Encounters, and 1,690,632 Observations (the
run recorded below logged slightly different counts: 7888, 396693, and
1695439). The timings were measured on an Intel Xeon E5-1650 CPU (6 cores and
12 threads) with 64 GB of memory.

In [1]:
from datetime import datetime
import logging
from typing import Any, List, Optional

import pandas
import pyspark.sql.functions as F

import query_lib
import indicator_lib

In [3]:
# Input location passed to `query_lib.patient_query_factory` in the cells below.
BASE_DIR = './test_files/parquet_big_db_r4'
# Code system passed to the same factory; the SNOMED value is an alternative
# that is left commented out.
# CODE_SYSTEM = 'http://snomed.info/sct'
CODE_SYSTEM = 'http://www.ampathkenya.org'

# Encounter view
Note that the first time the `patient_query` object is created, it also
starts the Spark environment, which takes some time.
The total time for this and for loading Encounters is ~25 seconds.

In [4]:
# Build the query object with the Spark runner over the data in BASE_DIR.
patient_query = query_lib.patient_query_factory(
    query_lib.Runner.SPARK, BASE_DIR, CODE_SYSTEM)

# Patient/encounter view; no encounter constraint has been set at this point.
flat_enc_df = patient_query.get_patient_encounter_view()

# NOTE(review): `_flatten_encounter` is an underscore-prefixed (internal)
# method, and `df` is only referenced by the commented-out lines below --
# confirm whether this call is still needed.
df = patient_query._flatten_encounter('Encounter/')

#len(flat_enc_df.head(100000))

#df = df.toPandas()
#sel = df['locationId'] == '7f65d926-57d6-4402-ae10-a5b3bcbf7986'
#df[(df['encPatientId'] == '00035e9e-4a62-4d65-8832-4c2437558166') & sel ]

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/01/24 22:51:07 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


                                                                                

[INDICATORS_LOG 2023-01-24 22:51:14.624816] Number of Patient resources= 7888
[INDICATORS_LOG 2023-01-24 22:51:16.535786] Number of Encounter resources= 396693


                                                                                

In [5]:
# Preview only the encounter-view rows that have a non-null location id.
flat_enc_df.loc[flat_enc_df['locationId'].notna()].head()

Unnamed: 0,encPatientId,locationId,locationDisplay,encTypeSystem,encTypeCode,num_encounters,firstDate,lastDate
0,000672e2-328d-4a9b-ad45-b6e5c8ba0268,2131aff8-2e2a-480a-b7ab-4ac53250262b,Isolation Ward,http://fhir.openmrs.org/code-system/encounter-...,5021b1a1-e7f6-44b4-ba02-da2f2bcf8718,9,1962-07-27T14:11:21+00:00,
1,000672e2-328d-4a9b-ad45-b6e5c8ba0268,6351fcf4-e311-4a19-90f9-35667d99a8af,Registration Desk,http://fhir.openmrs.org/code-system/encounter-...,5021b1a1-e7f6-44b4-ba02-da2f2bcf8718,7,1963-07-05T14:11:21+00:00,
2,000be2f6-c751-4757-b1b0-2f303e47d8c3,2131aff8-2e2a-480a-b7ab-4ac53250262b,Isolation Ward,http://fhir.openmrs.org/code-system/encounter-...,5021b1a1-e7f6-44b4-ba02-da2f2bcf8718,3,1996-10-09T16:05:49+00:00,
3,001a3083-e0f8-4b12-bebf-754e4af48b0e,58c57d25-8d39-41ab-8422-108a0c277d98,Outpatient Clinic,http://fhir.openmrs.org/code-system/encounter-...,5021b1a1-e7f6-44b4-ba02-da2f2bcf8718,4,1999-10-31T16:31:35+00:00,
4,001bbc56-067b-4dfa-a2d0-6540be5020e7,6351fcf4-e311-4a19-90f9-35667d99a8af,Registration Desk,http://fhir.openmrs.org/code-system/encounter-...,5021b1a1-e7f6-44b4-ba02-da2f2bcf8718,7,1973-11-14T09:45:51+00:00,


## Adding an encounter location constraint

In [6]:
# Add encounter location constraint
# The constraint is registered on the `patient_query` object, so the view is
# requested again afterwards; `flat_enc_df` is overwritten with the
# constrained result.
patient_query.encounter_constraints(locationId=['58c57d25-8d39-41ab-8422-108a0c277d98'])
flat_enc_df = patient_query.get_patient_encounter_view()
flat_enc_df.head()

                                                                                

Unnamed: 0,encPatientId,locationId,locationDisplay,encTypeSystem,encTypeCode,num_encounters,firstDate,lastDate
0,000672e2-328d-4a9b-ad45-b6e5c8ba0268,58c57d25-8d39-41ab-8422-108a0c277d98,Outpatient Clinic,http://fhir.openmrs.org/code-system/encounter-...,5021b1a1-e7f6-44b4-ba02-da2f2bcf8718,7,1977-09-02T14:11:21+00:00,
1,000be2f6-c751-4757-b1b0-2f303e47d8c3,58c57d25-8d39-41ab-8422-108a0c277d98,Outpatient Clinic,http://fhir.openmrs.org/code-system/encounter-...,5021b1a1-e7f6-44b4-ba02-da2f2bcf8718,4,1993-12-08T16:05:49+00:00,
2,001a3083-e0f8-4b12-bebf-754e4af48b0e,58c57d25-8d39-41ab-8422-108a0c277d98,Outpatient Clinic,http://fhir.openmrs.org/code-system/encounter-...,5021b1a1-e7f6-44b4-ba02-da2f2bcf8718,4,1999-10-31T16:31:35+00:00,
3,001bbc56-067b-4dfa-a2d0-6540be5020e7,58c57d25-8d39-41ab-8422-108a0c277d98,Outpatient Clinic,http://fhir.openmrs.org/code-system/encounter-...,5021b1a1-e7f6-44b4-ba02-da2f2bcf8718,7,1972-11-15T09:45:51+00:00,
4,001d5ca1-e363-4dd2-921d-f435a6c57504,58c57d25-8d39-41ab-8422-108a0c277d98,Outpatient Clinic,http://fhir.openmrs.org/code-system/encounter-...,5021b1a1-e7f6-44b4-ba02-da2f2bcf8718,7,1950-08-23T04:53:25+00:00,


In [7]:
# Encounter-view rows that belong to one specific patient.
flat_enc_df.loc[
    flat_enc_df['encPatientId'] == '8295eb5b-fba6-4e83-a5cb-2817b135cd27'
]

Unnamed: 0,encPatientId,locationId,locationDisplay,encTypeSystem,encTypeCode,num_encounters,firstDate,lastDate
4116,8295eb5b-fba6-4e83-a5cb-2817b135cd27,58c57d25-8d39-41ab-8422-108a0c277d98,Outpatient Clinic,http://fhir.openmrs.org/code-system/encounter-...,67a71486-1a54-468f-ac3e-7091a9a79584,10,2019-02-04T09:59:58+00:00,


In [8]:
# Show the first row of the flattened encounter data as a plain dict.
# NOTE(review): `_flatten_encounter` is an underscore-prefixed (internal)
# method; the meaning of its string argument is not visible in this notebook.
flat_enc = patient_query._flatten_encounter('')
flat_enc.head().asDict()

{'encounterId': '0c559f07-6b60-4fee-b3ff-526b534ec0b1',
 'encPatientId': '212c957f-50ba-4f47-b88e-62c58d734353',
 'first': '2002-04-08T14:52:04+00:00',
 'last': None,
 'locationId': '58c57d25-8d39-41ab-8422-108a0c277d98',
 'locationDisplay': 'Outpatient Clinic',
 'encTypeSystem': ['http://fhir.openmrs.org/code-system/encounter-type'],
 'encTypeCode': ['5021b1a1-e7f6-44b4-ba02-da2f2bcf8718']}

# Observation view
Loading all Observation data needed for the view generation takes ~50 seconds.

In [10]:
# Observation codes used throughout the rest of the notebook.
_VL_CODE = '856'  # HIV VIRAL LOAD
_ARV_PLAN = '1255'  # ANTIRETROVIRAL PLAN
# Time window (as 'YYYY-MM-DD' strings) passed to the `include_*` calls below.
end_date='2018-01-01'
start_date='1998-01-01'
# NOTE(review): `old_start_date` is not referenced in the visible part of this
# notebook -- confirm whether it is still needed.
old_start_date='1978-01-01'

# Creating a new `patient_query` to drop all previous constraints
# and recreate flat views.
patient_query = query_lib.patient_query_factory(
    query_lib.Runner.SPARK, BASE_DIR, CODE_SYSTEM)

# Register the viral-load and ARV-plan codes, then all remaining codes, each
# restricted to the [start_date, end_date] window.
patient_query.include_obs_values_in_time_range(
    _VL_CODE, min_time=start_date, max_time=end_date)
patient_query.include_obs_values_in_time_range(
    _ARV_PLAN, min_time=start_date, max_time=end_date)
patient_query.include_all_other_codes(min_time=start_date, max_time=end_date)
# NOTE(review): the original note here named `find_patient_aggregates`, but the
# call below is `get_patient_obs_view` -- presumably the method was renamed;
# confirm.
# Note the first call starts a local Spark cluster, loads input files, and
# flattens observations. These won't be done in subsequent calls of this
# function on the same instance.
# Also the same cluster will be reused for other instances of `PatientQuery`.
agg_df = patient_query.get_patient_obs_view()
agg_df.head(10)

[INDICATORS_LOG 2023-01-24 22:53:18.519210] Number of Patient resources= 7888
[INDICATORS_LOG 2023-01-24 22:53:19.185951] Number of Observation resources= 1695439


                                                                                

[INDICATORS_LOG 2023-01-24 22:53:22.808971] Number of flattened obs rows = 2523442
[INDICATORS_LOG 2023-01-24 22:53:23.145067] Number of Encounter resources= 396693


                                                                                

[INDICATORS_LOG 2023-01-24 22:53:30.332792] Number of aggregated obs= 99810


                                                                                

[INDICATORS_LOG 2023-01-24 22:53:35.311819] Number of joined patient_agg_obs= 99810


                                                                                

[INDICATORS_LOG 2023-01-24 22:53:45.759237] patient_obs_view size= 99810


Unnamed: 0,patientId,birthDate,gender,code,num_obs,min_value,max_value,min_date,max_date,first_value,last_value,first_value_code,last_value_code
0,011e41b9-6301-42eb-bd38-7a72ec6f9c3d,1972-12-20,male,844,1,,,2002-08-19T20:49:25+00:00,2002-08-19T20:49:25+00:00,,,1138.0,1138.0
1,014af46f-3114-411b-9af6-7f997e5cf0c7,1995-12-21,male,1271,36,,,2015-08-02T15:46:21+00:00,2017-07-27T15:46:21+00:00,,,1042.0,855.0
2,01846644-d9df-4138-8d63-ae1b16ca228e,1959-05-24,male,5085,4,90.0,212.0,1998-06-29T22:25:34+00:00,2000-06-07T22:25:34+00:00,108.0,90.0,,
3,01aca144-31de-4805-a4b8-ece5d3e52c7a,1989-08-02,male,844,4,,,2010-11-14T17:45:23+00:00,2012-10-23T17:45:23+00:00,,,703.0,704.0
4,01b7480a-59c3-4a28-81ad-20bebd5ec374,1983-09-13,female,6174,14,,,2013-01-07T15:47:12+00:00,2016-03-05T15:47:12+00:00,,,157.0,967.0
5,030207de-97a1-4a26-830f-71ae526e87a5,1953-07-22,female,1250,45,,,2012-08-07T21:19:30+00:00,2017-04-12T21:19:30+00:00,,,628.0,815.0
6,035f7b83-afd9-4170-a5e6-c081afa554bc,1966-07-23,male,1261,8,,,2010-05-17T06:40:04+00:00,2012-11-03T06:40:04+00:00,,,1260.0,1256.0
7,0366fa61-043e-4347-b7a7-aa54eacff851,1983-05-06,male,5085,6,89.0,203.0,2004-01-31T10:42:46+00:00,2006-12-14T10:42:46+00:00,196.0,203.0,,
8,03abfeb3-39cb-45ef-8f4e-9d44b33913c1,1963-08-04,male,1111,108,,,2003-12-19T07:00:04+00:00,2014-02-15T07:00:04+00:00,,,1107.0,745.0
9,03abfeb3-39cb-45ef-8f4e-9d44b33913c1,1963-08-04,male,5090,1,169.0,169.0,2003-12-19T07:00:04+00:00,2003-12-19T07:00:04+00:00,169.0,169.0,,


In [15]:
# Narrow the observation view to a single patient and a single code.
agg_df.loc[
    (agg_df['patientId'] == '011e41b9-6301-42eb-bd38-7a72ec6f9c3d')
    & (agg_df['code'] == '844')
]

Unnamed: 0,patientId,birthDate,gender,code,num_obs,min_value,max_value,min_date,max_date,first_value,last_value,first_value_code,last_value_code
0,011e41b9-6301-42eb-bd38-7a72ec6f9c3d,1972-12-20,male,844,1,,,2002-08-19T20:49:25+00:00,2002-08-19T20:49:25+00:00,,,1138,1138


In [16]:
# Every observation-view row of that patient, whatever the code.
agg_df.loc[agg_df['patientId'] == '011e41b9-6301-42eb-bd38-7a72ec6f9c3d']

Unnamed: 0,patientId,birthDate,gender,code,num_obs,min_value,max_value,min_date,max_date,first_value,last_value,first_value_code,last_value_code
0,011e41b9-6301-42eb-bd38-7a72ec6f9c3d,1972-12-20,male,844,1,,,2002-08-19T20:49:25+00:00,2002-08-19T20:49:25+00:00,,,1138.0,1138.0
27311,011e41b9-6301-42eb-bd38-7a72ec6f9c3d,1972-12-20,male,5088,1,35.0,35.0,2002-08-19T20:49:25+00:00,2002-08-19T20:49:25+00:00,35.0,35.0,,
28812,011e41b9-6301-42eb-bd38-7a72ec6f9c3d,1972-12-20,male,856,1,438510.0,438510.0,2002-08-19T20:49:25+00:00,2002-08-19T20:49:25+00:00,438510.0,438510.0,,
31777,011e41b9-6301-42eb-bd38-7a72ec6f9c3d,1972-12-20,male,5087,1,81.0,81.0,2002-08-19T20:49:25+00:00,2002-08-19T20:49:25+00:00,81.0,81.0,,
33358,011e41b9-6301-42eb-bd38-7a72ec6f9c3d,1972-12-20,male,1040,2,,,2002-08-19T20:49:25+00:00,2002-08-19T20:49:25+00:00,,,703.0,704.0
48829,011e41b9-6301-42eb-bd38-7a72ec6f9c3d,1972-12-20,male,5089,1,63.0,63.0,2002-08-19T20:49:25+00:00,2002-08-19T20:49:25+00:00,63.0,63.0,,
61734,011e41b9-6301-42eb-bd38-7a72ec6f9c3d,1972-12-20,male,5090,1,168.0,168.0,2002-08-19T20:49:25+00:00,2002-08-19T20:49:25+00:00,168.0,168.0,,
67310,011e41b9-6301-42eb-bd38-7a72ec6f9c3d,1972-12-20,male,1030,1,,,2002-08-19T20:49:25+00:00,2002-08-19T20:49:25+00:00,,,1138.0,1138.0
77283,011e41b9-6301-42eb-bd38-7a72ec6f9c3d,1972-12-20,male,5092,1,98.0,98.0,2002-08-19T20:49:25+00:00,2002-08-19T20:49:25+00:00,98.0,98.0,,
82263,011e41b9-6301-42eb-bd38-7a72ec6f9c3d,1972-12-20,male,1271,28,,,2002-08-19T20:49:25+00:00,2002-08-19T20:49:25+00:00,,,1042.0,855.0


In [12]:
# Code '856' rows whose earliest and latest observation dates differ,
# restricted to the date and coded-value columns.
agg_df.loc[
    (agg_df['code'] == '856') & (agg_df['min_date'] != agg_df['max_date']),
    ['patientId', 'code', 'min_date', 'max_date', 'first_value_code', 'last_value_code'],
].head()

Unnamed: 0,patientId,code,min_date,max_date,first_value_code,last_value_code
52,192b9bed-5350-4ed8-b829-e67d66a46a48,856,2001-12-11T11:29:21+00:00,2004-08-04T11:29:21+00:00,,
87,2c1de324-61a0-45c3-8e0a-768912ccf1f8,856,2001-09-10T16:28:43+00:00,2010-04-27T16:28:43+00:00,,
89,2eedef9e-0d02-421d-ad39-b9e804a789c3,856,1998-02-09T15:09:09+00:00,1999-06-14T15:09:09+00:00,,
98,32a2f898-4688-4beb-aad5-c9626b8a76b8,856,2010-09-28T23:16:47+00:00,2017-01-12T23:16:47+00:00,,
171,5b9302f6-0869-4017-bec3-9a2a5f671829,856,2009-07-25T02:32:27+00:00,2013-12-06T02:32:27+00:00,,


# Inspecting underlying Spark data-frames
The _user_ of the library does not need to deal with the underlying distributed query processing system. However, the _developer_ of the library needs an easy way to inspect the internal data of these systems. Here is how:

In [11]:
_DRUG1 = '1256'  # START DRUGS
_DRUG2 = '1260'  # STOP ALL MEDICATIONS

In [12]:
# Show the first row of the underlying Observation data-frame as a dict.
# NOTE(review): `_obs_df` is an underscore-prefixed (internal) attribute of the
# query object; it is accessed here for inspection only.
patient_query._obs_df.head().asDict()

22/06/28 14:00:52 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


{'id': '654e32da-3c1d-4191-a9af-04fd0906de5e',
 'meta': Row(id=None, versionId=None, lastUpdated=None, profile=None, security=None, tag=[Row(id=None, system='http://terminology.hl7.org/CodeSystem/v3-ObservationValue', version=None, code='SUBSETTED', display='Resource encoded in summary mode', userSelected=None)]),
 'implicitRules': None,
 'language': None,
 'text': None,
 'contained': [Row(id='#9d6ebf5c-7a40-436e-a931-dae16d8b4492', meta=Row(id=None, versionId=None, lastUpdated=None, profile=None, security=None, tag=[Row(id=None, system='http://terminology.hl7.org/CodeSystem/v3-ObservationValue', version=None, code='SUBSETTED', display='Resource encoded in summary mode', userSelected=None)]), implicitRules=None, language=None)],
 'identifier': None,
 'basedOn': None,
 'status': 'final',
 'category': None,
 'code': Row(id=None, coding=[Row(id=None, system=None, version=None, code='1111AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA', display='Patient reported current tuberculosis treatment', userSelec

In [13]:
# Explode the `code.coding` array into a new `coding` column, giving one row
# per coding of each Observation; the original `code` column is kept.
exp_obs = patient_query._obs_df.withColumn('coding', F.explode('code.coding'))
exp_obs.head().asDict()

{'id': '654e32da-3c1d-4191-a9af-04fd0906de5e',
 'meta': Row(id=None, versionId=None, lastUpdated=None, profile=None, security=None, tag=[Row(id=None, system='http://terminology.hl7.org/CodeSystem/v3-ObservationValue', version=None, code='SUBSETTED', display='Resource encoded in summary mode', userSelected=None)]),
 'implicitRules': None,
 'language': None,
 'text': None,
 'contained': [Row(id='#9d6ebf5c-7a40-436e-a931-dae16d8b4492', meta=Row(id=None, versionId=None, lastUpdated=None, profile=None, security=None, tag=[Row(id=None, system='http://terminology.hl7.org/CodeSystem/v3-ObservationValue', version=None, code='SUBSETTED', display='Resource encoded in summary mode', userSelected=None)]), implicitRules=None, language=None)],
 'identifier': None,
 'basedOn': None,
 'status': 'final',
 'category': None,
 'code': Row(id=None, coding=[Row(id=None, system=None, version=None, code='1111AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA', display='Patient reported current tuberculosis treatment', userSelec

In [14]:
# First exploded row whose `coding.code` equals this exact code string.
exp_obs.where('coding.code = "159800AAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"').head().asDict()

{'id': 'b7e69303-256e-485a-9692-1bf31d0d2036',
 'meta': Row(id=None, versionId=None, lastUpdated=None, profile=None, security=None, tag=[Row(id=None, system='http://terminology.hl7.org/CodeSystem/v3-ObservationValue', version=None, code='SUBSETTED', display='Resource encoded in summary mode', userSelected=None)]),
 'implicitRules': None,
 'language': None,
 'text': None,
 'contained': [Row(id='#4a748301-b11d-4776-b44a-8a75391c5f98', meta=Row(id=None, versionId=None, lastUpdated=None, profile=None, security=None, tag=[Row(id=None, system='http://terminology.hl7.org/CodeSystem/v3-ObservationValue', version=None, code='SUBSETTED', display='Resource encoded in summary mode', userSelected=None)]), implicitRules=None, language=None)],
 'identifier': None,
 'basedOn': None,
 'status': 'final',
 'category': None,
 'code': Row(id=None, coding=[Row(id=None, system=None, version=None, code='159800AAAAAAAAAAAAAAAAAAAAAAAAAAAAAA', display='Tuberculosis, review of screening questions', userSelected=

In [15]:
# Same lookup with the short code "1268". The filter is applied to the exploded
# `coding` column, so the first entry of `code.coding` in the printed row may
# carry a different code string.
exp_obs.where('coding.code = "1268"').head().asDict()

{'id': '607a4c73-e3ad-4209-82df-d4c3081335a8',
 'meta': Row(id=None, versionId=None, lastUpdated=None, profile=None, security=None, tag=[Row(id=None, system='http://terminology.hl7.org/CodeSystem/v3-ObservationValue', version=None, code='SUBSETTED', display='Resource encoded in summary mode', userSelected=None)]),
 'implicitRules': None,
 'language': None,
 'text': None,
 'contained': [Row(id='#f7d3ba4d-b956-4945-af0c-70b566e2e4ea', meta=Row(id=None, versionId=None, lastUpdated=None, profile=None, security=None, tag=[Row(id=None, system='http://terminology.hl7.org/CodeSystem/v3-ObservationValue', version=None, code='SUBSETTED', display='Resource encoded in summary mode', userSelected=None)]), implicitRules=None, language=None)],
 'identifier': None,
 'basedOn': None,
 'status': 'final',
 'category': None,
 'code': Row(id=None, coding=[Row(id=None, system=None, version=None, code='1268AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA', display='Tuberculosis treatment plan', userSelected=None), Row(id=No

In [16]:
# Count exploded rows per coding, for the three listed code systems, where the
# display text contains "viral".
# NOTE: the LIKE pattern is a substring match, so displays containing
# "antiretroviral" are returned as well (visible in the recorded output).
exp_obs.where(
    'coding.system IN ("http://snomed.info/sct", "http://loinc.org", "http://www.ampathkenya.org") \
    AND coding.display LIKE "%viral%" '
).groupBy(['coding']).agg(F.count('*')).head(20)

                                                                                

[Row(coding=Row(id=None, system='http://loinc.org', version=None, code='25836-8', display='HIV viral load', userSelected=None), count(1)=24150),
 Row(coding=Row(id=None, system='http://www.ampathkenya.org', version=None, code='1250', display='Antiretrovirals started', userSelected=None), count(1)=188949),
 Row(coding=Row(id=None, system='http://www.ampathkenya.org', version=None, code='2154', display='current antiretroviral treatment, patient reported', userSelected=None), count(1)=44805),
 Row(coding=Row(id=None, system='http://snomed.info/sct', version=None, code='315124004', display='HIV viral load', userSelected=None), count(1)=24150),
 Row(coding=Row(id=None, system='http://www.ampathkenya.org', version=None, code='1088', display='Current antiretroviral drugs used for treatment', userSelected=None), count(1)=250106),
 Row(coding=Row(id=None, system='http://snomed.info/sct', version=None, code='427314002', display='Current antiretroviral drugs used for treatment', userSelected=None

In [17]:
# ARV-plan rows whose last coded value is one of the two drug codes.
agg_df.loc[
    (agg_df['code'] == _ARV_PLAN)
    & agg_df['last_value_code'].isin([_DRUG1, _DRUG2])
].head()

Unnamed: 0,patientId,birthDate,gender,code,num_obs,min_value,max_value,min_date,max_date,first_value,last_value,first_value_code,last_value_code
39,199921cf-596f-4d70-9cb0-697e026d79c5,1997-01-05,female,1255,1,,,2016-09-10T16:45:56+00:00,2016-09-10T16:45:56+00:00,,,1256,1256
111,438f1ed8-7a79-4841-ab45-22bf900b6029,1979-11-09,male,1255,10,,,2001-04-14T03:48:03+00:00,2007-02-11T03:48:03+00:00,,,981,1256
248,81cc9ac3-5c80-4446-91ea-da955b7e4ae6,1956-05-13,female,1255,2,,,2000-08-23T22:11:59+00:00,2000-10-22T22:11:59+00:00,,,1258,1256
492,f3c15011-26a0-41f4-a2cd-8273c073c88a,1999-02-16,female,1255,4,,,2014-06-07T19:30:00+00:00,2017-12-01T19:30:00+00:00,,,1256,1260
577,27b0d130-a550-4f99-a1e7-1d5962bfc289,1969-01-13,male,1255,5,,,1999-10-29T14:46:35+00:00,2002-11-17T14:46:35+00:00,,,1258,1256


In [18]:
# Number of ARV-plan rows whose last coded value is one of the two drug codes.
len(
    agg_df.loc[
        (agg_df['code'] == _ARV_PLAN)
        & agg_df['last_value_code'].isin([_DRUG1, _DRUG2])
    ].index
)

1044

In [19]:
# Number of distinct patients among the matching ARV-plan rows.
agg_df.loc[
    (agg_df['code'] == _ARV_PLAN)
    & agg_df['last_value_code'].isin([_DRUG1, _DRUG2]),
    'patientId',
].nunique()

1044

In [20]:
# Compute the TX_NEW indicator table with the library implementation.
indicator_lib.calc_TX_NEW(
    agg_df,
    ARV_plan=_ARV_PLAN,
    start_drug=[_DRUG1],
    end_date_str=end_date,
)

Unnamed: 0,TX_NEW,buckets,TX_NEW_count,TX_NEW_ratio
0,False,15-19_ALL-GENDERS,252,0.05881
1,False,15-19_female,123,0.028705
2,False,15-19_male,129,0.030105
3,False,20-24_ALL-GENDERS,428,0.099883
4,False,20-24_female,221,0.051575
5,False,20-24_male,207,0.048308
6,False,25-49_ALL-GENDERS,1894,0.442007
7,False,25-49_female,976,0.227771
8,False,25-49_male,918,0.214236
9,False,50+_ALL-GENDERS,994,0.231972


In [21]:
# Compute the TX_PVLS indicator table with the library implementation.
indicator_lib.calc_TX_PVLS(
    agg_df,
    VL_code=_VL_CODE,
    failure_threshold=10000,
    end_date_str=end_date,
)

Unnamed: 0,sup_VL,buckets,sup_VL_count,sup_VL_ratio
0,False,15-19_ALL-GENDERS,428,0.080557
1,False,15-19_female,215,0.040467
2,False,15-19_male,213,0.04009
3,False,20-24_ALL-GENDERS,669,0.125918
4,False,20-24_female,346,0.065123
5,False,20-24_male,323,0.060794
6,False,25-49_ALL-GENDERS,2782,0.523621
7,False,25-49_female,1450,0.272915
8,False,25-49_male,1332,0.250706
9,False,50+_ALL-GENDERS,1381,0.259928


# Indicator library development
This example shows how the `indicator_lib.py` functions can be developed incrementally on top of the query library DataFrames.

In [22]:
# Show the first row of the flattened observation data as a dict.
# NOTE(review): `_flat_obs` is an underscore-prefixed (internal) attribute of
# the query object; it is accessed here for inspection only.
patient_query._flat_obs.head().asDict()

{'coding': Row(id=None, system='http://www.ampathkenya.org', version=None, code='1111', display='Patient reported current tuberculosis treatment', userSelected=None),
 'valueCoding': Row(id=None, system='http://www.ampathkenya.org', version=None, code='2227', display='Rifampicin isoniazid pyrazinamide and ethambutol', userSelected=None),
 'value': Row(quantity=None, codeableConcept=Row(id=None, coding=[Row(id=None, system=None, version=None, code='1131AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA', display='Rifampicin isoniazid pyrazinamide and ethambutol', userSelected=None), Row(id=None, system='http://www.ampathkenya.org', version=None, code='2227', display='Rifampicin isoniazid pyrazinamide and ethambutol', userSelected=None), Row(id=None, system='http://www.ampathkenya.org', version=None, code='2226', display='Rifampicin isoniazid pyrazinamide and ethambutol', userSelected=None), Row(id=None, system='https://openconceptlab.org/orgs/CIEL/sources/CIEL', version=None, code='1131', display='Rifamp

In [23]:
# Preview the aggregated rows for the viral-load code.
agg_df.loc[agg_df['code'] == _VL_CODE].head()

Unnamed: 0,patientId,birthDate,gender,code,num_obs,min_value,max_value,min_date,max_date,first_value,last_value,first_value_code,last_value_code
118,4553cb1b-d318-404d-86cb-595e91d39f46,1953-12-15,female,856,8,69984.0,746610.0,2006-01-07T08:31:55+00:00,2017-06-26T08:31:55+00:00,303870.0,183940.0,,
124,488343f2-d4fb-4fe3-b021-b606ea67ec00,1975-05-21,male,856,4,350710.0,848390.0,2000-05-17T00:35:54+00:00,2004-08-30T00:35:54+00:00,350710.0,397550.0,,
156,56b0b498-f808-4e44-afd9-d9af6191b0fa,1990-05-02,female,856,4,66670.0,642230.0,2005-12-27T04:18:41+00:00,2011-03-06T04:18:41+00:00,564150.0,496930.0,,
183,5fde49f6-b7d1-419b-81cf-a1325ec4bcc6,1995-03-29,male,856,4,246040.0,886980.0,2011-10-27T14:24:04+00:00,2017-03-06T14:24:04+00:00,886980.0,470210.0,,
217,70dd5a32-40b7-4d09-b08c-e63c8ffe65df,1992-04-28,female,856,3,208550.0,935890.0,2010-03-19T23:07:56+00:00,2013-07-20T23:07:56+00:00,213180.0,208550.0,,


In [24]:
def _find_age_band(birth_date: str, end_date: datetime) -> str:
  """Given the birth date, finds the age_band for PEPFAR disaggregation."""
  age = None
  try:
    # TODO handle all different formats (issues #174)
    birth = datetime.strptime(birth_date, '%Y-%m-%d')
    age = int((end_date - birth).days / 365.25)
  except Exception as e:
    common.custom_log('Invalid birth_date format: {}'.format(e))
    age = 999999

  if age == 999999:
    return 'ERROR'
  if age < 1:
    return '0-1'
  if age <= 4:
    return '1-4'
  if age <= 9:
    return '5-9'
  if age <= 14:
    return '10-14'
  if age <= 19:
    return '15-19'
  if age <= 24:
    return '20-24'
  if age <= 49:
    return '25-49'
  return '50+'


def _agg_buckets(birth_date: str, gender: str, end_date: datetime) -> List[str]:
  """Lists every PEPFAR disaggregation bucket a patient falls into.

  Args:
    birth_date: The birth date, forwarded to `_find_age_band`.
    gender: The gender label used in the bucket names.
    end_date: The date at which the age band is evaluated.

  Returns:
    Four bucket names, in this order: age band with gender, all ages with
    gender, age band with all genders, and the overall bucket.
  """
  band = _find_age_band(birth_date, end_date)
  by_band_and_gender = band + '_' + gender
  by_gender = 'ALL-AGES_' + gender
  by_band = band + '_ALL-GENDERS'
  return [by_band_and_gender, by_gender, by_band, 'ALL-AGES_ALL-GENDERS']

def calc_TX_PVLS(patient_agg_obs: pandas.DataFrame, VL_code: str,
    failure_threshold: int, end_date_str: Optional[str] = None) -> pandas.DataFrame:
  """Calculates TX_PVLS indicator with its corresponding disaggregations.

  Args:
    patient_agg_obs: The aggregated patient/observation DataFrame, e.g. the
      `agg_df` produced by `patient_query.get_patient_obs_view()` in this
      notebook. The columns 'patientId', 'birthDate', 'gender', 'code' and
      'max_value' are read.
    VL_code: The code for viral load values.
    failure_threshold: VL count threshold of failure.
    end_date_str: The string representation of the last date as 'YYYY-MM-DD';
      today's date is used when it is omitted or empty.
  Returns:
    The aggregated DataFrame with columns 'sup_VL', 'buckets', 'count' and
    'ratio'. It is empty when no row of `patient_agg_obs` has `VL_code`.
  """
  end_date = datetime.today()
  if end_date_str:
    end_date = datetime.strptime(end_date_str, '%Y-%m-%d')
  # The copy avoids setting new columns on a slice of the caller's frame.
  vl_df = patient_agg_obs[patient_agg_obs['code'] == VL_code].copy()
  if vl_df.empty:
    # Without this guard the bucket assignment below has nothing to build a
    # list column from, and the ratio would divide by zero patients.
    return pandas.DataFrame(columns=['sup_VL', 'buckets', 'count', 'ratio'])
  # A missing (NaN) max_value compares as False, i.e. counts as not suppressed.
  vl_df['sup_VL'] = (vl_df['max_value'] < failure_threshold)
  # One list of bucket names per row; built explicitly as an object Series so
  # the result does not depend on how `DataFrame.apply` infers its output.
  vl_df['buckets'] = pandas.Series(
      [_agg_buckets(birth_date, gender, end_date)
       for birth_date, gender in zip(vl_df['birthDate'], vl_df['gender'])],
      index=vl_df.index, dtype=object)
  exploded = vl_df.explode('buckets')
  result = exploded.groupby(['sup_VL', 'buckets'], as_index=False)\
      .count()[['sup_VL', 'buckets', 'patientId']]\
      .rename(columns={'patientId': 'count'})
  # The ratio is relative to all rows with `VL_code`, not to the bucket size.
  num_patients = len(vl_df.index)
  result['ratio'] = result['count']/num_patients
  return result
    
# Run the in-notebook TX_PVLS implementation on the observation view.
calc_TX_PVLS(
    agg_df, VL_code=_VL_CODE, failure_threshold=10000,
    end_date_str='2020-12-30')

Unnamed: 0,sup_VL,buckets,count,ratio
0,False,15-19_ALL-GENDERS,99,0.018634
1,False,15-19_female,50,0.009411
2,False,15-19_male,49,0.009223
3,False,20-24_ALL-GENDERS,600,0.112931
4,False,20-24_female,302,0.056842
5,False,20-24_male,298,0.056089
6,False,25-49_ALL-GENDERS,2895,0.54489
7,False,25-49_female,1507,0.283644
8,False,25-49_male,1388,0.261246
9,False,50+_ALL-GENDERS,1703,0.320535
