# Sample query library usage
This notebook loads data from Parquet files generated from the "big" test
database (i.e., the `openmrs-fhir-mysql-ocl-big` docker image). This dataset
has 7,892 Patients, 396,650 Encounters, and 1,690,632 Observations. The
timings reported below were measured on an Intel Xeon E5-1650 CPU (6 cores
and 12 threads) with 64 GB of memory.

In [1]:
from datetime import datetime
from typing import Any, List, Optional

import pandas
import pyspark.sql.functions as F

import query_lib
import indicator_lib

In [2]:
# Root directory of the Parquet files read by the query library.
BASE_DIR = './test_files/parquet_big_db'
# Code system handed to the query factory. Alternative kept for reference:
# CODE_SYSTEM = 'http://snomed.info/sct'
CODE_SYSTEM = 'http://www.ampathkenya.org'

# Encounter view
Note that the first time a `patient_query` object is created, it also
starts the Spark environment, which takes some time.
The total time for this and loading Encounters is ~25 seconds.

In [3]:
# Build a Spark-backed query object over the Parquet files in BASE_DIR.
patient_query = query_lib.patient_query_factory(
    query_lib.Runner.SPARK, BASE_DIR, CODE_SYSTEM)

# Encounter view; later cells index it with pandas-style boolean masks.
flat_enc_df = patient_query.get_patient_encounter_view()

# NOTE(review): `df` is only referenced by the commented-out scratch lines
# below; confirm whether this call to a private method is still needed.
df = patient_query._flatten_encounter('Encounter/')

# Developer scratch code kept for reference:
#len(flat_enc_df.head(100000))

#df = df.toPandas()
#sel = df['locationId'] == '7f65d926-57d6-4402-ae10-a5b3bcbf7986'
#df[(df['encPatientId'] == '00035e9e-4a62-4d65-8832-4c2437558166') & sel ]

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/06/28 13:59:53 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
                                                                                

[INDICATORS_LOG 2022-06-28 14:00:01.649000] Number of Patient resources= 7892
[INDICATORS_LOG 2022-06-28 14:00:03.525810] Number of Encounter resources= 396650


                                                                                

In [4]:
# First rows of the encounter view whose `locationId` is not null.
flat_enc_df[flat_enc_df['locationId'].notna()].head()

Unnamed: 0,encPatientId,locationId,locationDisplay,encTypeSystem,encTypeCode,num_encounters,firstDate,lastDate
0,00035e9e-4a62-4d65-8832-4c2437558166,7f65d926-57d6-4402-ae10-a5b3bcbf7986,Pharmacy,http://fhir.openmrs.org/code-system/encounter-...,5021b1a1-e7f6-44b4-ba02-da2f2bcf8718,3,2001-08-06T19:04:54+00:00,
1,00035e9e-4a62-4d65-8832-4c2437558166,7fdfa2cb-bc95-405a-88c6-32b7673c0453,Laboratory,http://fhir.openmrs.org/code-system/encounter-...,5021b1a1-e7f6-44b4-ba02-da2f2bcf8718,5,1996-01-15T19:04:54+00:00,
2,00055360-c85b-4b78-903f-fda01390cad0,58c57d25-8d39-41ab-8422-108a0c277d98,Outpatient Clinic,http://fhir.openmrs.org/code-system/encounter-...,5021b1a1-e7f6-44b4-ba02-da2f2bcf8718,8,1958-03-05T21:27:38+00:00,
3,00055360-c85b-4b78-903f-fda01390cad0,7fdfa2cb-bc95-405a-88c6-32b7673c0453,Laboratory,http://fhir.openmrs.org/code-system/encounter-...,5021b1a1-e7f6-44b4-ba02-da2f2bcf8718,7,1962-06-06T21:27:38+00:00,
4,000977ff-c65d-440f-b231-a6292e516718,2131aff8-2e2a-480a-b7ab-4ac53250262b,Isolation Ward,http://fhir.openmrs.org/code-system/encounter-...,5021b1a1-e7f6-44b4-ba02-da2f2bcf8718,5,1976-04-04T19:10:26+00:00,


## Adding an encounter location constraint

In [5]:
# Add an encounter location constraint (a single location ID), then
# regenerate the view. Note this rebinds `flat_enc_df`, replacing the view
# that was created before the constraint was added.
patient_query.encounter_constraints(locationId=['58c57d25-8d39-41ab-8422-108a0c277d98'])
flat_enc_df = patient_query.get_patient_encounter_view()
flat_enc_df.head()

                                                                                

Unnamed: 0,encPatientId,locationId,locationDisplay,encTypeSystem,encTypeCode,num_encounters,firstDate,lastDate
0,00035e9e-4a62-4d65-8832-4c2437558166,58c57d25-8d39-41ab-8422-108a0c277d98,Outpatient Clinic,http://fhir.openmrs.org/code-system/encounter-...,5021b1a1-e7f6-44b4-ba02-da2f2bcf8718,7,1993-08-09T19:04:54+00:00,
1,00055360-c85b-4b78-903f-fda01390cad0,58c57d25-8d39-41ab-8422-108a0c277d98,Outpatient Clinic,http://fhir.openmrs.org/code-system/encounter-...,5021b1a1-e7f6-44b4-ba02-da2f2bcf8718,8,1958-03-05T21:27:38+00:00,
2,000977ff-c65d-440f-b231-a6292e516718,58c57d25-8d39-41ab-8422-108a0c277d98,Outpatient Clinic,http://fhir.openmrs.org/code-system/encounter-...,5021b1a1-e7f6-44b4-ba02-da2f2bcf8718,6,1972-04-30T19:10:26+00:00,
3,000a98b9-4a36-4c72-b736-dca10a0087ab,58c57d25-8d39-41ab-8422-108a0c277d98,Outpatient Clinic,http://fhir.openmrs.org/code-system/encounter-...,5021b1a1-e7f6-44b4-ba02-da2f2bcf8718,8,1972-03-27T09:14:19+00:00,
4,001945cf-f937-4b77-8205-10686e917df3,58c57d25-8d39-41ab-8422-108a0c277d98,Outpatient Clinic,http://fhir.openmrs.org/code-system/encounter-...,5021b1a1-e7f6-44b4-ba02-da2f2bcf8718,8,1993-10-05T07:14:34+00:00,


In [6]:
# Encounter-view rows of one specific patient.
flat_enc_df[flat_enc_df['encPatientId'] == '8295eb5b-fba6-4e83-a5cb-2817b135cd27']

Unnamed: 0,encPatientId,locationId,locationDisplay,encTypeSystem,encTypeCode,num_encounters,firstDate,lastDate
4043,8295eb5b-fba6-4e83-a5cb-2817b135cd27,58c57d25-8d39-41ab-8422-108a0c277d98,Outpatient Clinic,http://fhir.openmrs.org/code-system/encounter-...,67a71486-1a54-468f-ac3e-7091a9a79584,10,2019-02-04T09:59:58+00:00,


In [7]:
# Developer-only peek: first row of the flattened encounters as a dict.
# `_flatten_encounter` has a leading underscore, i.e., it is not part of the
# public API of the query object.
flat_enc = patient_query._flatten_encounter('')
flat_enc.head().asDict()

{'encounterId': '3793316f-13be-4741-b588-ca82f09a2c0b',
 'encPatientId': 'b2cbf10c-73e4-46ea-85a7-f9f323be15d2',
 'first': '1996-01-01T22:50:16+00:00',
 'last': None,
 'locationId': '58c57d25-8d39-41ab-8422-108a0c277d98',
 'locationDisplay': 'Outpatient Clinic',
 'encTypeSystem': ['http://fhir.openmrs.org/code-system/encounter-type'],
 'encTypeCode': ['5021b1a1-e7f6-44b4-ba02-da2f2bcf8718']}

# Observation view
Loading all Observation data needed for the view generation takes ~50 seconds.

In [8]:
_VL_CODE = '856'  # HIV VIRAL LOAD
_ARV_PLAN = '1255'  # ANTIRETROVIRAL PLAN
# Time-range bounds as 'YYYY-MM-DD' strings.
end_date='2018-01-01'
start_date='1998-01-01'
# NOTE(review): `old_start_date` is not referenced by any cell visible here.
old_start_date='1978-01-01'

# Creating a new `patient_query` to drop all previous constraints
# and recreate flat views.
patient_query = query_lib.patient_query_factory(
    query_lib.Runner.SPARK, BASE_DIR, CODE_SYSTEM)

# Apply the same time range to the two codes of interest and to all other
# codes.
patient_query.include_obs_values_in_time_range(
    _VL_CODE, min_time=start_date, max_time=end_date)
patient_query.include_obs_values_in_time_range(
    _ARV_PLAN, min_time=start_date, max_time=end_date)
patient_query.include_all_other_codes(min_time=start_date, max_time=end_date)
# Note the first view-generation call on a `patient_query` instance loads the
# input files and flattens observations (see the log lines in the output).
# According to the original note here, this work is not repeated by
# subsequent calls on the same instance, and the same Spark cluster is reused
# by other instances of `PatientQuery`.
# NOTE(review): the original note named `find_patient_aggregates`, while the
# call below is `get_patient_obs_view` -- confirm the current API name.
agg_df = patient_query.get_patient_obs_view()
agg_df.head(10)

[INDICATORS_LOG 2022-06-28 14:00:17.031596] Number of Patient resources= 7892




[INDICATORS_LOG 2022-06-28 14:00:18.133764] Number of Observation resources= 1690632


                                                                                

[INDICATORS_LOG 2022-06-28 14:00:23.086001] Number of flattened obs rows = 2515992
[INDICATORS_LOG 2022-06-28 14:00:23.481129] Number of Encounter resources= 396650


                                                                                

[INDICATORS_LOG 2022-06-28 14:00:32.751177] Number of aggregated obs= 99715


                                                                                

[INDICATORS_LOG 2022-06-28 14:00:38.614760] Number of joined patient_agg_obs= 99715


                                                                                

[INDICATORS_LOG 2022-06-28 14:00:51.738396] patient_obs_view size= 99715


Unnamed: 0,patientId,birthDate,gender,code,num_obs,min_value,max_value,min_date,max_date,first_value,last_value,first_value_code,last_value_code
0,00c1426f-ca04-414a-8db7-043bb41b64d2,1994-04-18,male,844,3,,,2009-07-07T01:44:23+00:00,2012-07-02T01:44:23+00:00,,,1138.0,1138.0
1,01188340-b29f-471a-9cc8-2b2247aa9f84,1953-08-02,male,1270,126,,,1998-02-22T11:44:13+00:00,2006-09-04T11:44:13+00:00,,,1108.0,750.0
2,01729300-dd28-49c3-baf0-ae054128e0db,1975-10-10,female,5085,2,120.0,238.0,1999-09-04T16:06:20+00:00,1999-09-04T16:06:20+00:00,120.0,238.0,,
3,01b747b4-2029-4842-bf32-e3157c4f4f06,1980-10-26,female,1271,40,,,2005-06-04T05:33:39+00:00,2006-09-20T05:33:39+00:00,,,1032.0,856.0
4,037a7abe-6567-415b-8558-81cf7d1d26f0,1992-01-29,female,2154,4,,,2007-11-16T07:15:49+00:00,2009-09-05T07:15:49+00:00,,,817.0,817.0
5,03a22d18-600e-478b-b6b6-1d8709de8149,1998-07-26,female,5085,4,102.0,240.0,2016-01-13T05:56:56+00:00,2017-12-22T05:56:56+00:00,135.0,240.0,,
6,04472fb9-14d2-4510-8459-db44c9271986,1974-07-08,male,5087,3,64.0,130.0,2011-09-30T20:26:46+00:00,2014-03-05T20:26:46+00:00,101.0,64.0,,
7,045a2075-9846-4855-a6f2-f2c04bf7862d,1949-06-02,female,5087,1,175.0,175.0,2008-12-17T23:06:29+00:00,2008-12-17T23:06:29+00:00,175.0,175.0,,
8,045e681b-832e-4126-8910-7cdf39fb2438,1999-07-31,female,1268,1,,,2016-09-22T18:29:08+00:00,2016-09-22T18:29:08+00:00,,,1257.0,1257.0
9,0769b38d-8137-4f33-aef2-54c38aa12fe5,1965-07-02,female,1265,7,,,2002-05-29T21:42:44+00:00,2003-01-24T21:42:44+00:00,,,1257.0,1107.0


In [9]:
# Inspecting the aggregate row of one specific patient for code '844'.
agg_df[(agg_df['patientId'] == '00c1426f-ca04-414a-8db7-043bb41b64d2') & (agg_df['code'] == '844')] 

Unnamed: 0,patientId,birthDate,gender,code,num_obs,min_value,max_value,min_date,max_date,first_value,last_value,first_value_code,last_value_code
0,00c1426f-ca04-414a-8db7-043bb41b64d2,1994-04-18,male,844,3,,,2009-07-07T01:44:23+00:00,2012-07-02T01:44:23+00:00,,,1138,1138


In [10]:
# Rows for code '856' (the value of `_VL_CODE`) whose `min_date` differs from
# `max_date`; only the ID, code, date and value-code columns are shown.
agg_df[(agg_df['code'] == '856') & (agg_df['min_date'] != agg_df['max_date'])][
    ['patientId', 'code', 'min_date', 'max_date', 'first_value_code', 'last_value_code']].head()

Unnamed: 0,patientId,code,min_date,max_date,first_value_code,last_value_code
118,4553cb1b-d318-404d-86cb-595e91d39f46,856,2006-01-07T08:31:55+00:00,2017-06-26T08:31:55+00:00,,
124,488343f2-d4fb-4fe3-b021-b606ea67ec00,856,2000-05-17T00:35:54+00:00,2004-08-30T00:35:54+00:00,,
156,56b0b498-f808-4e44-afd9-d9af6191b0fa,856,2005-12-27T04:18:41+00:00,2011-03-06T04:18:41+00:00,,
183,5fde49f6-b7d1-419b-81cf-a1325ec4bcc6,856,2011-10-27T14:24:04+00:00,2017-03-06T14:24:04+00:00,,
217,70dd5a32-40b7-4d09-b08c-e63c8ffe65df,856,2010-03-19T23:07:56+00:00,2013-07-20T23:07:56+00:00,,


# Inspecting underlying Spark data-frames
The _user_ of the library does not need to deal with the underlying distributed query processing system. However, the _developer_ of the library needs an easy way to inspect the internal data of these systems. Here is how:

In [11]:
_DRUG1 = '1256'  # START DRUGS
_DRUG2 = '1260'  # STOP ALL MEDICATIONS

In [12]:
# Developer-only: first row of the internal `_obs_df` Spark DataFrame
# (Observation resources, per the output) as a dict.
patient_query._obs_df.head().asDict()

22/06/28 14:00:52 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


{'id': '654e32da-3c1d-4191-a9af-04fd0906de5e',
 'meta': Row(id=None, versionId=None, lastUpdated=None, profile=None, security=None, tag=[Row(id=None, system='http://terminology.hl7.org/CodeSystem/v3-ObservationValue', version=None, code='SUBSETTED', display='Resource encoded in summary mode', userSelected=None)]),
 'implicitRules': None,
 'language': None,
 'text': None,
 'contained': [Row(id='#9d6ebf5c-7a40-436e-a931-dae16d8b4492', meta=Row(id=None, versionId=None, lastUpdated=None, profile=None, security=None, tag=[Row(id=None, system='http://terminology.hl7.org/CodeSystem/v3-ObservationValue', version=None, code='SUBSETTED', display='Resource encoded in summary mode', userSelected=None)]), implicitRules=None, language=None)],
 'identifier': None,
 'basedOn': None,
 'status': 'final',
 'category': None,
 'code': Row(id=None, coding=[Row(id=None, system=None, version=None, code='1111AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA', display='Patient reported current tuberculosis treatment', userSelec

In [13]:
# Explode `code.coding` so that each coding of an observation gets its own
# row, exposed through the new `coding` column.
exp_obs = patient_query._obs_df.withColumn('coding', F.explode('code.coding'))
exp_obs.head().asDict()

{'id': '654e32da-3c1d-4191-a9af-04fd0906de5e',
 'meta': Row(id=None, versionId=None, lastUpdated=None, profile=None, security=None, tag=[Row(id=None, system='http://terminology.hl7.org/CodeSystem/v3-ObservationValue', version=None, code='SUBSETTED', display='Resource encoded in summary mode', userSelected=None)]),
 'implicitRules': None,
 'language': None,
 'text': None,
 'contained': [Row(id='#9d6ebf5c-7a40-436e-a931-dae16d8b4492', meta=Row(id=None, versionId=None, lastUpdated=None, profile=None, security=None, tag=[Row(id=None, system='http://terminology.hl7.org/CodeSystem/v3-ObservationValue', version=None, code='SUBSETTED', display='Resource encoded in summary mode', userSelected=None)]), implicitRules=None, language=None)],
 'identifier': None,
 'basedOn': None,
 'status': 'final',
 'category': None,
 'code': Row(id=None, coding=[Row(id=None, system=None, version=None, code='1111AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA', display='Patient reported current tuberculosis treatment', userSelec

In [14]:
# First exploded row whose `coding.code` equals this exact code string.
exp_obs.where('coding.code = "159800AAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"').head().asDict()

{'id': 'b7e69303-256e-485a-9692-1bf31d0d2036',
 'meta': Row(id=None, versionId=None, lastUpdated=None, profile=None, security=None, tag=[Row(id=None, system='http://terminology.hl7.org/CodeSystem/v3-ObservationValue', version=None, code='SUBSETTED', display='Resource encoded in summary mode', userSelected=None)]),
 'implicitRules': None,
 'language': None,
 'text': None,
 'contained': [Row(id='#4a748301-b11d-4776-b44a-8a75391c5f98', meta=Row(id=None, versionId=None, lastUpdated=None, profile=None, security=None, tag=[Row(id=None, system='http://terminology.hl7.org/CodeSystem/v3-ObservationValue', version=None, code='SUBSETTED', display='Resource encoded in summary mode', userSelected=None)]), implicitRules=None, language=None)],
 'identifier': None,
 'basedOn': None,
 'status': 'final',
 'category': None,
 'code': Row(id=None, coding=[Row(id=None, system=None, version=None, code='159800AAAAAAAAAAAAAAAAAAAAAAAAAAAAAA', display='Tuberculosis, review of screening questions', userSelected=

In [15]:
# Same lookup using the short code '1268'.
exp_obs.where('coding.code = "1268"').head().asDict()

{'id': '607a4c73-e3ad-4209-82df-d4c3081335a8',
 'meta': Row(id=None, versionId=None, lastUpdated=None, profile=None, security=None, tag=[Row(id=None, system='http://terminology.hl7.org/CodeSystem/v3-ObservationValue', version=None, code='SUBSETTED', display='Resource encoded in summary mode', userSelected=None)]),
 'implicitRules': None,
 'language': None,
 'text': None,
 'contained': [Row(id='#f7d3ba4d-b956-4945-af0c-70b566e2e4ea', meta=Row(id=None, versionId=None, lastUpdated=None, profile=None, security=None, tag=[Row(id=None, system='http://terminology.hl7.org/CodeSystem/v3-ObservationValue', version=None, code='SUBSETTED', display='Resource encoded in summary mode', userSelected=None)]), implicitRules=None, language=None)],
 'identifier': None,
 'basedOn': None,
 'status': 'final',
 'category': None,
 'code': Row(id=None, coding=[Row(id=None, system=None, version=None, code='1268AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA', display='Tuberculosis treatment plan', userSelected=None), Row(id=No

In [16]:
# Count exploded rows per coding, restricted to three code systems and to
# displays containing 'viral' (this also matches, e.g., 'Antiretrovirals').
# No ordering is applied before `head(20)`.
exp_obs.where(
    'coding.system IN ("http://snomed.info/sct", "http://loinc.org", "http://www.ampathkenya.org") \
    AND coding.display LIKE "%viral%" '
).groupBy(['coding']).agg(F.count('*')).head(20)

                                                                                

[Row(coding=Row(id=None, system='http://loinc.org', version=None, code='25836-8', display='HIV viral load', userSelected=None), count(1)=24150),
 Row(coding=Row(id=None, system='http://www.ampathkenya.org', version=None, code='1250', display='Antiretrovirals started', userSelected=None), count(1)=188949),
 Row(coding=Row(id=None, system='http://www.ampathkenya.org', version=None, code='2154', display='current antiretroviral treatment, patient reported', userSelected=None), count(1)=44805),
 Row(coding=Row(id=None, system='http://snomed.info/sct', version=None, code='315124004', display='HIV viral load', userSelected=None), count(1)=24150),
 Row(coding=Row(id=None, system='http://www.ampathkenya.org', version=None, code='1088', display='Current antiretroviral drugs used for treatment', userSelected=None), count(1)=250106),
 Row(coding=Row(id=None, system='http://snomed.info/sct', version=None, code='427314002', display='Current antiretroviral drugs used for treatment', userSelected=None

In [17]:
# ARV-plan rows whose last value code is START DRUGS or STOP ALL MEDICATIONS.
agg_df[(agg_df['code'] == _ARV_PLAN) & agg_df['last_value_code'].isin([_DRUG1, _DRUG2])].head()

Unnamed: 0,patientId,birthDate,gender,code,num_obs,min_value,max_value,min_date,max_date,first_value,last_value,first_value_code,last_value_code
39,199921cf-596f-4d70-9cb0-697e026d79c5,1997-01-05,female,1255,1,,,2016-09-10T16:45:56+00:00,2016-09-10T16:45:56+00:00,,,1256,1256
111,438f1ed8-7a79-4841-ab45-22bf900b6029,1979-11-09,male,1255,10,,,2001-04-14T03:48:03+00:00,2007-02-11T03:48:03+00:00,,,981,1256
248,81cc9ac3-5c80-4446-91ea-da955b7e4ae6,1956-05-13,female,1255,2,,,2000-08-23T22:11:59+00:00,2000-10-22T22:11:59+00:00,,,1258,1256
492,f3c15011-26a0-41f4-a2cd-8273c073c88a,1999-02-16,female,1255,4,,,2014-06-07T19:30:00+00:00,2017-12-01T19:30:00+00:00,,,1256,1260
577,27b0d130-a550-4f99-a1e7-1d5962bfc289,1969-01-13,male,1255,5,,,1999-10-29T14:46:35+00:00,2002-11-17T14:46:35+00:00,,,1258,1256


In [18]:
# Number of ARV-plan rows whose last value code is one of the two drug codes.
agg_df[(agg_df['code'] == _ARV_PLAN) & agg_df['last_value_code'].isin([_DRUG1, _DRUG2])].index.size

1044

In [19]:
# Number of distinct `patientId` groups among the ARV-plan rows whose last
# value code is one of the two drug codes.
agg_df[(agg_df['code'] == _ARV_PLAN) & agg_df['last_value_code'].isin([_DRUG1, _DRUG2])].groupby(
    'patientId').count().index.size

1044

In [20]:
# Library TX_NEW indicator, given the ARV-plan code, START DRUGS as the only
# start-drug value code, and `end_date`.
indicator_lib.calc_TX_NEW(agg_df, ARV_plan=_ARV_PLAN, start_drug=[_DRUG1], end_date_str=end_date)

Unnamed: 0,TX_NEW,buckets,TX_NEW_count,TX_NEW_ratio
0,False,15-19_ALL-GENDERS,252,0.05881
1,False,15-19_female,123,0.028705
2,False,15-19_male,129,0.030105
3,False,20-24_ALL-GENDERS,428,0.099883
4,False,20-24_female,221,0.051575
5,False,20-24_male,207,0.048308
6,False,25-49_ALL-GENDERS,1894,0.442007
7,False,25-49_female,976,0.227771
8,False,25-49_male,918,0.214236
9,False,50+_ALL-GENDERS,994,0.231972


In [21]:
# Library TX_PVLS indicator, given the viral-load code, a failure threshold
# of 10000, and `end_date`.
indicator_lib.calc_TX_PVLS(
    agg_df, VL_code=_VL_CODE, failure_threshold=10000,
    end_date_str=end_date)

Unnamed: 0,sup_VL,buckets,sup_VL_count,sup_VL_ratio
0,False,15-19_ALL-GENDERS,428,0.080557
1,False,15-19_female,215,0.040467
2,False,15-19_male,213,0.04009
3,False,20-24_ALL-GENDERS,669,0.125918
4,False,20-24_female,346,0.065123
5,False,20-24_male,323,0.060794
6,False,25-49_ALL-GENDERS,2782,0.523621
7,False,25-49_female,1450,0.272915
8,False,25-49_male,1332,0.250706
9,False,50+_ALL-GENDERS,1381,0.259928


# Indicator library development
This example shows how the `indicator_lib.py` functions can be developed incrementally on top of the query library DataFrames.

In [22]:
# Developer-only: first row of the internal `_flat_obs` Spark DataFrame as a
# dict.
patient_query._flat_obs.head().asDict()

{'coding': Row(id=None, system='http://www.ampathkenya.org', version=None, code='1111', display='Patient reported current tuberculosis treatment', userSelected=None),
 'valueCoding': Row(id=None, system='http://www.ampathkenya.org', version=None, code='2227', display='Rifampicin isoniazid pyrazinamide and ethambutol', userSelected=None),
 'value': Row(quantity=None, codeableConcept=Row(id=None, coding=[Row(id=None, system=None, version=None, code='1131AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA', display='Rifampicin isoniazid pyrazinamide and ethambutol', userSelected=None), Row(id=None, system='http://www.ampathkenya.org', version=None, code='2227', display='Rifampicin isoniazid pyrazinamide and ethambutol', userSelected=None), Row(id=None, system='http://www.ampathkenya.org', version=None, code='2226', display='Rifampicin isoniazid pyrazinamide and ethambutol', userSelected=None), Row(id=None, system='https://openconceptlab.org/orgs/CIEL/sources/CIEL', version=None, code='1131', display='Rifamp

In [23]:
# Aggregate rows for the viral-load code, i.e., the rows that `calc_TX_PVLS`
# below selects.
agg_df[(agg_df['code'] == _VL_CODE)].head()

Unnamed: 0,patientId,birthDate,gender,code,num_obs,min_value,max_value,min_date,max_date,first_value,last_value,first_value_code,last_value_code
118,4553cb1b-d318-404d-86cb-595e91d39f46,1953-12-15,female,856,8,69984.0,746610.0,2006-01-07T08:31:55+00:00,2017-06-26T08:31:55+00:00,303870.0,183940.0,,
124,488343f2-d4fb-4fe3-b021-b606ea67ec00,1975-05-21,male,856,4,350710.0,848390.0,2000-05-17T00:35:54+00:00,2004-08-30T00:35:54+00:00,350710.0,397550.0,,
156,56b0b498-f808-4e44-afd9-d9af6191b0fa,1990-05-02,female,856,4,66670.0,642230.0,2005-12-27T04:18:41+00:00,2011-03-06T04:18:41+00:00,564150.0,496930.0,,
183,5fde49f6-b7d1-419b-81cf-a1325ec4bcc6,1995-03-29,male,856,4,246040.0,886980.0,2011-10-27T14:24:04+00:00,2017-03-06T14:24:04+00:00,886980.0,470210.0,,
217,70dd5a32-40b7-4d09-b08c-e63c8ffe65df,1992-04-28,female,856,3,208550.0,935890.0,2010-03-19T23:07:56+00:00,2013-07-20T23:07:56+00:00,213180.0,208550.0,,


In [24]:
def _find_age_band(birth_date: str, end_date: datetime) -> str:
  """Given the birth date, finds the age_band for PEPFAR disaggregation."""
  age = None
  try:
    # TODO handle all different formats (issues #174)
    birth = datetime.strptime(birth_date, '%Y-%m-%d')
    age = int((end_date - birth).days / 365.25)
  except Exception as e:
    common.custom_log('Invalid birth_date format: {}'.format(e))
    age = 999999

  if age == 999999:
    return 'ERROR'
  if age < 1:
    return '0-1'
  if age <= 4:
    return '1-4'
  if age <= 9:
    return '5-9'
  if age <= 14:
    return '10-14'
  if age <= 19:
    return '15-19'
  if age <= 24:
    return '20-24'
  if age <= 49:
    return '25-49'
  return '50+'


def _agg_buckets(birth_date: str, gender: str, end_date: datetime) -> List[str]:
  """Builds every PEPFAR disaggregation bucket a patient belongs to.

  The result has four '<age>_<gender>' labels, combining the patient's own
  age band and gender with the 'ALL-AGES' and 'ALL-GENDERS' catch-all groups.
  """
  age_groups = (_find_age_band(birth_date, end_date), 'ALL-AGES')
  gender_groups = (gender, 'ALL-GENDERS')
  # The gender loop is the outer one so that the order of the labels is:
  # own-age/own-gender, all-ages/own-gender, own-age/all-genders, all/all.
  return [age_group + '_' + gender_group
          for gender_group in gender_groups
          for age_group in age_groups]

def calc_TX_PVLS(patient_agg_obs: pandas.DataFrame, VL_code: str,
    failure_threshold: int,
    end_date_str: Optional[str] = None) -> pandas.DataFrame:
  """Calculates TX_PVLS indicator with its corresponding disaggregations.

  Args:
    patient_agg_obs: A per-patient, per-code aggregate DataFrame, e.g., the
      `agg_df` produced by `patient_query.get_patient_obs_view()` in this
      notebook. The columns read here are `code`, `max_value`, `birthDate`,
      `gender`, and `patientId`.
    VL_code: The code for viral load values.
    failure_threshold: VL count threshold of failure; a row is marked as
      suppressed only if its `max_value` is strictly below this threshold.
    end_date_str: The string representation of the last date as 'YYYY-MM-DD';
      it is the date at which ages are calculated. Today is used when this is
      not provided.

  Returns:
    The aggregated DataFrame with columns `sup_VL`, `buckets`, `count`, and
    `ratio`, where `ratio` is `count` divided by the number of rows that have
    `VL_code`. It has no rows when no input row has that code.

  Raises:
    ValueError: If `end_date_str` is not in 'YYYY-MM-DD' format.
  """
  end_date = datetime.today()
  if end_date_str:
    end_date = datetime.strptime(end_date_str, '%Y-%m-%d')
  # The copy avoids setting new columns on a slice of the input.
  temp_df = patient_agg_obs[patient_agg_obs['code'] == VL_code].copy()
  if temp_df.empty:
    # Nothing to aggregate; return an empty result with the output columns
    # instead of relying on how pandas handles row-wise operations and
    # grouping on an empty frame.
    return pandas.DataFrame(columns=['sup_VL', 'buckets', 'count', 'ratio'])
  # A missing (NaN) `max_value` compares as False, i.e., not suppressed.
  temp_df['sup_VL'] = temp_df['max_value'] < failure_threshold
  # One list of buckets per row; `explode` then yields one row per bucket.
  temp_df['buckets'] = [
      _agg_buckets(birth_date, gender, end_date)
      for birth_date, gender in zip(temp_df['birthDate'], temp_df['gender'])]
  temp_df_exp = temp_df.explode('buckets')
  temp_df_exp = temp_df_exp.groupby(['sup_VL', 'buckets'], as_index=False)\
      .count()[['sup_VL', 'buckets', 'patientId']]\
      .rename(columns={'patientId': 'count'})
  # calculate ratio
  num_patients = len(temp_df.index)
  temp_df_exp['ratio'] = temp_df_exp['count'] / num_patients
  return temp_df_exp
    
# Try the notebook-local implementation with a later end date.
calc_TX_PVLS(
    agg_df, VL_code=_VL_CODE, failure_threshold=10000,
    end_date_str='2020-12-30')

Unnamed: 0,sup_VL,buckets,count,ratio
0,False,15-19_ALL-GENDERS,99,0.018634
1,False,15-19_female,50,0.009411
2,False,15-19_male,49,0.009223
3,False,20-24_ALL-GENDERS,600,0.112931
4,False,20-24_female,302,0.056842
5,False,20-24_male,298,0.056089
6,False,25-49_ALL-GENDERS,2895,0.54489
7,False,25-49_female,1507,0.283644
8,False,25-49_male,1388,0.261246
9,False,50+_ALL-GENDERS,1703,0.320535
