In [None]:
import pyspark.sql.functions as F
from credentials import MY_CREDENTIALS
from data_location import DELTA_LOCATION

from spark_bi.constants import ColNames, Extensions
from spark_bi.spark import FutPathlingContext

pc = FutPathlingContext.create(
    app_name="example-spark-app", hadoop_config=MY_CREDENTIALS.to_hadoop_config()
)
delta_lake = pc.read.delta(DELTA_LOCATION)

:: loading settings :: url = jar:file:/Users/mabe/Git/spark-bi/.venv/lib/python3.11/site-packages/pyspark/jars/ivy-2.5.3.jar!/org/apache/ivy/core/settings/ivysettings.xml
Ivy Default Cache set to: /Users/mabe/.ivy2.5.2/cache
The jars for the packages stored in: /Users/mabe/.ivy2.5.2/jars
au.csiro.pathling#library-runtime added as a dependency
io.delta#delta-spark_2.13 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-4c226ceb-2e11-4348-830f-1791158eda69;1.0
	confs: [default]
	found au.csiro.pathling#library-runtime;9.1.0 in local-m2-cache
	found io.delta#delta-spark_2.13;4.0.0 in local-m2-cache
	found io.delta#delta-storage;4.0.0 in local-m2-cache
	found org.antlr#antlr4-runtime;4.13.1 in local-m2-cache
:: resolution report :: resolve 89ms :: artifacts dl 5ms
	:: modules in use:
	au.csiro.pathling#library-runtime;9.1.0 from local-m2-cache in [default]
	io.delta#delta-spark_2.13;4.0.0 from local-m2-cache in [default]
	io.delta#delta-storage;4.0.0 fr

# By aktiv/inaktiv

Definitionen af, om en patient er "aktiv", kan være meget forskellig alt efter forretningsbehov.

I FHIR data har vi markeret en patient som aktiv, hvis de har mindst ét "åbent" forløb. Et "åbent" forløb defineres som:
* PLANNED
* WAITLIST
* ACTIVE
* ON_HOLD

Denne status ligger på `patient.active` og synkroniseres dagligt.

Hvis vi genbruger denne definition, kan vi hurtigt finde antal:

In [2]:
(
    delta_lake.view(resource="Patient", select=[{"column": [{"name": "active", "path": "active"}]}])
    .groupby("active")
    .count()
    .toPandas()  # Pandas dataframes are rendered nicely in Jupyter Notebooks
)

                                                                                

Unnamed: 0,active,count
0,True,3
1,False,712


# By anvenderløsning

For at finde anvenderløsning bruger vi coexistence-tags der er beskrevet i: https://ehealth-dk.atlassian.net/wiki/spaces/EDTW/pages/2355986433/Multitenancy

Patienterne selv stammer fra CPR-registret, og har derfor ikke et coexistence-tag. Alle relevante patienter har mindst ét forløb, og derfor tæller vi antal unikke patienter fra episodes of care, og finder disses creating_solution.

In [None]:
from pyspark.sql.functions import col

eoc_with_pt_id = delta_lake.view(
    resource="EpisodeOfCare",
    select=[
        {
            "column": [
                {"name": "eoc_id", "path": "getResourceKey()"},
                {"name": "eoc_patient_id", "path": "patient.getReferenceKey()"},
                {
                    "name": ColNames.CREATING_SOLUTION.value,
                    "path": f"meta.tag.where(system='{Extensions.EHEALTH_COLOCATION.value}').code.first()",
                },
            ]
        },
        {"forEach": "team", "column": [{"name": "eoc_team_id", "path": "getReferenceKey()"}]},
    ],
)
eoc_with_pt_id.filter(col("creating_solution").isNotNull()).head(5)

[Row(eoc_id='EpisodeOfCare/2000469910', eoc_patient_id='Patient/1000684720', creating_solution='xb', eoc_team_id='CareTeam/3000143203'),
 Row(eoc_id='EpisodeOfCare/2000469920', eoc_patient_id='Patient/1000684720', creating_solution='xb', eoc_team_id='CareTeam/3000143203'),
 Row(eoc_id='EpisodeOfCare/2000469925', eoc_patient_id='Patient/1000684720', creating_solution='xb', eoc_team_id='CareTeam/3000143203'),
 Row(eoc_id='EpisodeOfCare/2000469931', eoc_patient_id='Patient/1000684720', creating_solution='xb', eoc_team_id='CareTeam/3000143203'),
 Row(eoc_id='EpisodeOfCare/2000469943', eoc_patient_id='Patient/1000684720', creating_solution='xb', eoc_team_id='CareTeam/3000143203')]

In [4]:
(
    eoc_with_pt_id.groupby("creating_solution")
    .agg(F.countDistinct("eoc_patient_id").alias("citizen_count"))
    .sort(F.desc("citizen_count"))
    .toPandas()
)

Unnamed: 0,creating_solution,citizen_count
0,,3
1,xb,1


Bemærk at antallet er for TRIFORKs testmiljø.

# By careteam

Fordi hver episode of care kun kan være tilknyttet én patient, så kan vi tælle antal patienter per careteam ved at:
* Finde alle episodes of care knyttet til det careteam
* Tælle antal unikke patient-id'er blandt alle disse episodes of care

Det gør vi:

In [5]:
careteams = delta_lake.view(
    resource="CareTeam", select=[{"column": [{"name": "careteam_id", "path": "getResourceKey()"}]}]
)
careteams.head(5)

[Row(careteam_id='CareTeam/3000000001'),
 Row(careteam_id='CareTeam/3000000002'),
 Row(careteam_id='CareTeam/3000000005'),
 Row(careteam_id='CareTeam/3000000006'),
 Row(careteam_id='CareTeam/3000000007')]

In [6]:
eoc_with_pt_id = delta_lake.view(
    resource="EpisodeOfCare",
    select=[
        {
            "column": [
                {"name": "episodeofcare_id", "path": "getResourceKey()"},
                {"name": "patient_id", "path": "patient.getReferenceKey()"},
                {
                    "name": "organization_id",
                    "path": "managingOrganization.first().getReferenceKey()",
                },
            ]
        },
        {"forEach": "team", "column": [{"name": "team_id", "path": "getReferenceKey()"}]},
    ],
)
eoc_with_pt_id.head(5)

[Row(episodeofcare_id='EpisodeOfCare/2000000029', patient_id='Patient/1000264558', organization_id='Organization/3000008564', team_id='CareTeam/3000108752'),
 Row(episodeofcare_id='EpisodeOfCare/2000000035', patient_id='Patient/1000264558', organization_id='Organization/3000008564', team_id='CareTeam/3000108752'),
 Row(episodeofcare_id='EpisodeOfCare/2000000042', patient_id='Patient/1000264558', organization_id='Organization/3000008564', team_id='CareTeam/3000108752'),
 Row(episodeofcare_id='EpisodeOfCare/2000000049', patient_id='Patient/1000264558', organization_id='Organization/3000008564', team_id='CareTeam/3000108752'),
 Row(episodeofcare_id='EpisodeOfCare/2000000068', patient_id='Patient/1000264558', organization_id='Organization/3000008564', team_id='CareTeam/3000108752')]

In [16]:
(
    careteams.join(eoc_with_pt_id, careteams.careteam_id == eoc_with_pt_id.team_id, how="left")
    .select("team_id", "patient_id")
    .distinct()
    .groupBy("team_id")
    .agg(F.count("*").alias("n_patients"))
    .sort("n_patients", ascending=False)
    .toPandas()
)

Unnamed: 0,team_id,n_patients
0,CareTeam/3000143203,3
1,CareTeam/3000000002,2
2,CareTeam/3000178177,1
3,,1


# By diagnosis

For at finde patientens diagnoser/behandlingsområder skal vi lave koblingen:

`Patient <-> EpisodeOfCare <-> CarePlan.addresses <-> Condition.code`

Vi skal bruge denne beregning flere steder, og derfor har vi lavet en utility-funktion der udfører det:

In [17]:
from spark_bi.dfs import compute_patient2condition


patient2condition = compute_patient2condition(delta_lake)

(
    patient2condition.groupby("diagnosis_code")
    .agg(F.countDistinct("eoc_patient_id").alias("n_citizens"))
    .toPandas()
)

Unnamed: 0,diagnosis_code,n_citizens
0,DJ44,2


Vær opmærksom på, at disse tal er fra TRIFORKs testmiljø, og derfor ikke repræsentative.

# By kommune

Antal borgere per kommune fortolkes som:
* Antal borgere der er tilknyttet en episodeofcare, der er administreret af et careteam, der er administreret af kommunen

Derfor skal vi lave koblingen `EpisodeOfCare.team <-> CareTeam.managingOrganization <-> Org.municipalityCode`

In [9]:
from pyspark.sql.functions import col

eoc_municipality = delta_lake.view(
    resource="EpisodeOfCare",
    select=[
        {
            "column": [
                {"name": "eoc_id", "path": "getResourceKey()"},
                {"name": "eoc_patient_id", "path": "patient.getReferenceKey()"},
            ]
        },
        {"forEach": "team", "column": [{"name": "eoc_team_id", "path": "getReferenceKey()"}]},
    ],
)
eoc_municipality.head(5)

[Row(eoc_id='EpisodeOfCare/2000000029', eoc_patient_id='Patient/1000264558', eoc_team_id='CareTeam/3000108752'),
 Row(eoc_id='EpisodeOfCare/2000000035', eoc_patient_id='Patient/1000264558', eoc_team_id='CareTeam/3000108752'),
 Row(eoc_id='EpisodeOfCare/2000000042', eoc_patient_id='Patient/1000264558', eoc_team_id='CareTeam/3000108752'),
 Row(eoc_id='EpisodeOfCare/2000000049', eoc_patient_id='Patient/1000264558', eoc_team_id='CareTeam/3000108752'),
 Row(eoc_id='EpisodeOfCare/2000000068', eoc_patient_id='Patient/1000264558', eoc_team_id='CareTeam/3000108752')]

In [10]:
careteams_with_managing_org = delta_lake.view(
    resource="CareTeam",
    select=[
        {
            "column": [
                {"name": "ct_id", "path": "getResourceKey()"},
                {"name": "ct_org_id", "path": "managingOrganization.first().getReferenceKey()"},
            ]
        }
    ],
)
careteams_with_managing_org.filter(F.col("ct_org_id").isNotNull()).head(5)

[Row(ct_id='CareTeam/3000148060', ct_org_id='Organization/3000038806'),
 Row(ct_id='CareTeam/3000148061', ct_org_id='Organization/3000029719')]


Desværre er der på TRIFORK-miljøet kun 2 careteams der har en tilknyttet organisation. Vi fortsætter analysen.

In [None]:
organizations_with_municipality = delta_lake.view(
    resource="Organization",
    select=[
        {
            "column": [
                {"name": "org_id", "path": "getResourceKey()"},
                {
                    "name": ColNames.MUNICIPALITY_CODE.value,
                    "path": "extension('http://ehealth.sundhed.dk/fhir/StructureDefinition/ehealth-organization-municipalityCode').valueString",
                },
            ]
        }
    ],
)
organizations_with_municipality.head(5)

[Row(org_id='Organization/3000000064', municipality_code='0787'),
 Row(org_id='Organization/3000000069', municipality_code='0265'),
 Row(org_id='Organization/3000000072', municipality_code='0173'),
 Row(org_id='Organization/3000000088', municipality_code='0360'),
 Row(org_id='Organization/3000000090', municipality_code='0787')]

In [12]:
patients = delta_lake.view(
    resource="Patient", select=[{"column": [{"name": "patient_id", "path": "getResourceKey()"}]}]
)
patients.head(5)

[Row(patient_id='Patient/1000264558'),
 Row(patient_id='Patient/1000264559'),
 Row(patient_id='Patient/1000264560'),
 Row(patient_id='Patient/1000264604'),
 Row(patient_id='Patient/1000264605')]

In [13]:
joined = (
    eoc_municipality.join(
        careteams_with_managing_org,
        eoc_municipality.eoc_team_id == careteams_with_managing_org.ct_id,
        how="left",
    )
    .join(patients, eoc_municipality.eoc_patient_id == patients.patient_id, how="left")
    .join(
        organizations_with_municipality,
        careteams_with_managing_org.ct_org_id == organizations_with_municipality.org_id,
        how="left",
    )
    .filter(col("org_id").isNotNull())
)

joined.head(5)

[]

Det viser sig at der på TRIFORKs testmiljø ikke er nogle patienter med episodes of care for de careteams, der har `.managingOrganization`. 

## By bopælskommune

En alternativ fortolkning er antal borgere fordelt på bopælskommune:

In [14]:
from spark_bi.dfs import compute_patient2municipality


patients_with_municipality = compute_patient2municipality(delta_lake)

(
    patients_with_municipality.groupBy(ColNames.MUNICIPALITY_CODE.value)
    .agg(F.countDistinct("patient_id").alias("n_patients"))
    .sort("n_patients", ascending=False)
    .toPandas()
)

Unnamed: 0,municipality_code,n_patients
0,0575,18
1,0621,17
2,0370,16
3,0630,15
4,0756,15
...,...,...
93,0270,3
94,0849,2
95,0561,2
96,0840,2


# By organisatorisk enhed

See README.md under "Organistorisk enhed".

# By region

In [None]:
from spark_bi.dfs import compute_patient2region

patients_with_region = compute_patient2region(delta_lake)

(
    patients_with_region.groupby(ColNames.REGION_CODE.value, "region_name")
    .agg(F.countDistinct("patient_id").alias("n_patients"))
    .sort("n_patients", ascending=False)
    .toPandas()
)


Unnamed: 0,regional_subdivision_code,region_name,n_patients
0,DK-84,Capital Region of Denmark,196
1,DK-83,Region of Southern Denmark,176
2,DK-85,Region Zealand,140
3,DK-82,Central Denmark Region,134
4,DK-81,Nord Denmark Region,69
