In [1]:
import pyspark.sql.functions as F
from credentials import MY_CREDENTIALS
from data_location import DELTA_LOCATION

from spark_bi.spark import FutPathlingContext

# Build the project's Pathling context. Storage credentials are passed in as a
# Hadoop configuration derived from the imported credentials object, so no
# secrets are written in the notebook itself.
pc = FutPathlingContext.create(
    app_name="example-spark-app",
    hadoop_config=MY_CREDENTIALS.to_hadoop_config(),
)

# Open the Delta data at DELTA_LOCATION; later cells define views on top of it.
delta_lake = pc.read.delta(DELTA_LOCATION)

:: loading settings :: url = jar:file:/Users/mabe/Git/spark-bi/.venv/lib/python3.11/site-packages/pyspark/jars/ivy-2.5.3.jar!/org/apache/ivy/core/settings/ivysettings.xml
Ivy Default Cache set to: /Users/mabe/.ivy2.5.2/cache
The jars for the packages stored in: /Users/mabe/.ivy2.5.2/jars
au.csiro.pathling#library-runtime added as a dependency
io.delta#delta-spark_2.13 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-e4bd161b-52af-4596-800a-31593981d371;1.0
	confs: [default]
	found au.csiro.pathling#library-runtime;9.1.0 in local-m2-cache
	found io.delta#delta-spark_2.13;4.0.0 in local-m2-cache
	found io.delta#delta-storage;4.0.0 in local-m2-cache
	found org.antlr#antlr4-runtime;4.13.1 in local-m2-cache
:: resolution report :: resolve 129ms :: artifacts dl 5ms
	:: modules in use:
	au.csiro.pathling#library-runtime;9.1.0 from local-m2-cache in [default]
	io.delta#delta-spark_2.13;4.0.0 from local-m2-cache in [default]
	io.delta#delta-storage;4.0.0 f

For overvejelser re: bopælslokation vs. administrationslokation, se "Citizens per municipality.ipynb".

In [5]:
# URL of the DK Core extension that carries the regional subdivision code on an address.
REGIONAL_SUBDIVISION_EXTENSION_URL = (
    "http://hl7.dk/fhir/core/StructureDefinition/dk-core-RegionalSubDivisionCodes"
)

# View over Patient resources with two columns: the resource key and the
# regional subdivision code read from the extension on the address whose
# `use` is 'home'.
# NOTE(review): despite the variable name, this view carries *regional*
# subdivision codes (see the mapping below), not municipality codes. The name
# is kept unchanged so that any other cell referring to it keeps working.
patients_by_municipality = delta_lake.view(
    resource="Patient",
    select=[
        {
            "column": [
                {"name": "patient_id", "path": "getResourceKey()"},
                {
                    "name": "regional_subdivision_code",
                    # Split across lines for readability; the resulting
                    # FHIRPath expression is a single string.
                    "path": (
                        "address.where(use = 'home')"
                        f".extension('{REGIONAL_SUBDIVISION_EXTENSION_URL}')"
                        ".valueCodeableConcept.coding.code"
                    ),
                },
            ]
        }
    ],
)

# Map regional subdivision codes to English region names.
# See https://hl7.dk/fhir/core/1.1.0/ValueSet-dk-core-RegionalSubDivisionCodes.html
region_mapping = {
    "DK-81": "North Denmark Region",
    "DK-82": "Central Denmark Region",
    "DK-83": "Region of Southern Denmark",
    "DK-84": "Capital Region of Denmark",
    "DK-85": "Region Zealand",
}

# Count distinct patients per code in Spark (largest group first), collect the
# aggregate to pandas, then attach the human-readable region name.
# Codes that are missing from `region_mapping` (including a null code) end up
# with NaN in `region_name`.
result_df = (
    patients_by_municipality.groupBy("regional_subdivision_code")
    .agg(F.countDistinct("patient_id").alias("n_patients"))
    .sort("n_patients", ascending=False)
    .toPandas()
    .assign(
        region_name=lambda counts: counts["regional_subdivision_code"].map(
            region_mapping
        )
    )
)
result_df

Unnamed: 0,regional_subdivision_code,n_patients,region_name
0,DK-84,196,Capital Region of Denmark
1,DK-83,176,Region of Southern Denmark
2,DK-85,140,Region Zealand
3,DK-82,134,Central Denmark Region
4,DK-81,69,Nord Denmark Region
