In [None]:
import pyspark.sql.functions as F
from credentials import MY_CREDENTIALS
from data_location import DELTA_LOCATION

from spark_bi.constants import ColNames, Extensions
from spark_bi.spark import FutPathlingContext

# No explicit hadoop_config (e.g. MY_CREDENTIALS.to_hadoop_config()) is passed:
# per the original note, this setup relies on auto-injected credentials.
pc = FutPathlingContext.create(app_name="example-spark-app")

# Data source backed by the Delta tables at DELTA_LOCATION; every view in the
# cells below is built from it.
delta_lake = pc.read.delta(DELTA_LOCATION)

:: loading settings :: url = jar:file:/Users/mabe/Git/spark-bi/.venv/lib/python3.11/site-packages/pyspark/jars/ivy-2.5.3.jar!/org/apache/ivy/core/settings/ivysettings.xml
Ivy Default Cache set to: /Users/mabe/.ivy2.5.2/cache
The jars for the packages stored in: /Users/mabe/.ivy2.5.2/jars
au.csiro.pathling#library-runtime added as a dependency
io.delta#delta-spark_2.13 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-419a71bc-726b-4a0b-9df7-fe88a3c41064;1.0
	confs: [default]
	found au.csiro.pathling#library-runtime;9.1.0 in local-m2-cache
	found io.delta#delta-spark_2.13;4.0.0 in local-m2-cache
	found io.delta#delta-storage;4.0.0 in local-m2-cache
	found org.antlr#antlr4-runtime;4.13.1 in local-m2-cache
:: resolution report :: resolve 89ms :: artifacts dl 5ms
	:: modules in use:
	au.csiro.pathling#library-runtime;9.1.0 from local-m2-cache in [default]
	io.delta#delta-spark_2.13;4.0.0 from local-m2-cache in [default]
	io.delta#delta-storage;4.0.0 fr

# By anvenderløsning

For at finde anvenderløsning bruger vi coexistence-tags der er beskrevet i: https://ehealth-dk.atlassian.net/wiki/spaces/EDTW/pages/2355986433/Multitenancy

Patienterne selv stammer fra CPR-registret, og har derfor ikke et coexistence-tag. Alle relevante patienter har mindst ét forløb, og derfor tæller vi antal unikke patienter fra episodes of care, og finder disses creating_solution.

In [5]:
from pyspark.sql.functions import col

# View over EpisodeOfCare with three columns per episode: its resource key, the
# key of the referenced patient, and the code of the first meta.tag whose system
# equals Extensions.EHEALTH_COLOCATION (stored under the creating-solution
# column name from ColNames).
eoc_with_pt_id = delta_lake.view(
    resource="EpisodeOfCare",
    select=[
        {
            "column": [
                {"name": "eoc_id", "path": "getResourceKey()"},
                {"name": "eoc_patient_id", "path": "patient.getReferenceKey()"},
                {
                    "name": ColNames.CREATING_SOLUTION.value,
                    "path": f"meta.tag.where(system='{Extensions.EHEALTH_COLOCATION.value}').code.first()",
                },
            ]
        }
    ],
)

# Preview a few episodes that actually carry such a tag. The filter addresses
# the column through the same ColNames constant that names it in the view, so
# the two cannot drift apart if the constant's value ever changes.
eoc_with_pt_id.filter(col(ColNames.CREATING_SOLUTION.value).isNotNull()).head(5)

[Row(eoc_id='EpisodeOfCare/2000469910', eoc_patient_id='Patient/1000684720', creating_solution='xb'),
 Row(eoc_id='EpisodeOfCare/2000469920', eoc_patient_id='Patient/1000684720', creating_solution='xb'),
 Row(eoc_id='EpisodeOfCare/2000469925', eoc_patient_id='Patient/1000684720', creating_solution='xb'),
 Row(eoc_id='EpisodeOfCare/2000469931', eoc_patient_id='Patient/1000684720', creating_solution='xb'),
 Row(eoc_id='EpisodeOfCare/2000469943', eoc_patient_id='Patient/1000684720', creating_solution='xb')]

In [6]:
# Row count per creating solution, collected to pandas for display.
# The column is addressed through ColNames, like the municipality and region
# groupings further down, instead of a repeated string literal.
# NOTE(review): eoc_with_pt_id is selected from EpisodeOfCare, so this counts
# rows of that view (presumably one per episode), not distinct patients, while
# the accompanying text speaks of unique patients — confirm which is intended.
eoc_with_pt_id.groupby(ColNames.CREATING_SOLUTION.value).count().toPandas()

Unnamed: 0,creating_solution,count
0,,79777
1,xb,2881


Bemærk, at antallet er for TRIFORKs testmiljø.

# By careteam

In [7]:
# View over EpisodeOfCare pairing each episode's resource key with the reference
# key of its team entries (the second select iterates `team` via forEach).
eocs_with_team = delta_lake.view(
    resource="EpisodeOfCare",
    select=[
        {
            "column": [
                {"name": "eoc_id", "path": "getResourceKey()"},
            ],
        },
        {
            "forEach": "team",
            "column": [
                {"name": "eoc_team_id", "path": "getReferenceKey()"},
            ],
        },
    ],
)

# Number of view rows per care team, collected to pandas for display.
eoc_count_by_team = eocs_with_team.groupby("eoc_team_id").count()
eoc_count_by_team.toPandas()

Unnamed: 0,eoc_team_id,count
0,CareTeam/3000178177,1
1,CareTeam/3000138554,3749
2,CareTeam/3000000002,6869
3,CareTeam/3000108752,11477
4,CareTeam/3000143203,60562


Bemærk, at dette er fordelingen for TRIFORKs testmiljø.

# By diagnosis

For at finde episode-of-care'ens diagnoser/behandlingsområder skal vi lave koblingen:

`EpisodeOfCare <-> CarePlan.addresses <-> Condition.code`

Vi skal bruge denne beregning flere steder, og genbruger derfor en utility-funktion, der udfører det.

In [11]:
from spark_bi.dfs import compute_patient2condition


# Condition lookup produced by the shared helper.
# NOTE(review): its schema is not visible in this notebook; the join below
# relies on it exposing `eoc_patient_id` and `diagnosis_code` columns.
patient2condition = compute_patient2condition(delta_lake)

# A bare `patient2condition.head(5)` used to sit here. It was not the last
# expression of the cell, so its result was never rendered; it only triggered
# an additional Spark action and has been removed.

# Count joined rows per diagnosis code.
# NOTE(review): the join key is the patient id, so every episode is paired with
# each patient2condition row of its patient — confirm that helper's granularity
# if a per-episode diagnosis is intended.
(
    eoc_with_pt_id.join(
        patient2condition,
        eoc_with_pt_id.eoc_patient_id == patient2condition.eoc_patient_id,
        "inner",
    )
    .groupby("diagnosis_code")
    .count()
    .toPandas()
)

Unnamed: 0,diagnosis_code,count
0,DJ44,82656


Vær opmærksom på, at disse tal er fra TRIFORKs testmiljø, og derfor ikke repræsentative.

# By kommune

En alternativ fortolkning er antallet af episodes of care fordelt på bopælskommunen for den patient, de omhandler:

In [13]:
from spark_bi.dfs import compute_patient2municipality


# Lookup produced by the shared helper; the join below uses its `patient_id`
# column and the grouping uses its municipality-code column.
patients_with_municipality = compute_patient2municipality(delta_lake)

# Pair every episode row with the municipality row of its patient. An inner
# join keeps only episodes whose patient has a match in the lookup.
same_patient_municipality = (
    eoc_with_pt_id["eoc_patient_id"] == patients_with_municipality["patient_id"]
)
eoc_with_municipality = eoc_with_pt_id.join(
    patients_with_municipality,
    on=same_patient_municipality,
    how="inner",
)

# Joined rows per municipality code, collected to pandas for display.
eoc_with_municipality.groupBy(ColNames.MUNICIPALITY_CODE.value).count().toPandas()

Unnamed: 0,municipality_code,count
0,410,33733
1,155,2
2,185,48923


Vær opmærksom på, at disse tal er fra TRIFORKs testmiljø, og derfor ikke repræsentative.

# By organisatorisk enhed

Se README.md under "Organisatorisk enhed".

# By region

In [14]:
from spark_bi.dfs import compute_patient2region

# Lookup produced by the shared helper; the join below uses its `patient_id`
# column and the grouping uses its region-code column.
patients_with_region = compute_patient2region(delta_lake)

# Pair every episode row with the region row of its patient. An inner join
# keeps only episodes whose patient has a match in the lookup.
same_patient_region = (
    eoc_with_pt_id["eoc_patient_id"] == patients_with_region["patient_id"]
)
eoc_with_region = eoc_with_pt_id.join(
    patients_with_region,
    on=same_patient_region,
    how="inner",
)

# Joined rows per region code, collected to pandas for display.
eoc_with_region.groupBy(ColNames.REGION_CODE.value).count().toPandas()


Unnamed: 0,region_code,count
0,DK-84,48925
1,DK-83,33733


Vær opmærksom på, at disse tal er fra TRIFORKs testmiljø, og derfor ikke repræsentative.