Draft attempt at ingestion using mapping file #24

Merged May 29, 2024, after 21 commits. The diff below shows changes from 20 of the 21 commits.

Commits:
b7f0848  Draft attempt at ingestion using mapping file (pipliggins, May 8, 2024)
dd61500  Ignore duplicate keys if values are the same (pipliggins, May 13, 2024)
a80c963  Improve single row 1:1 test without output comparison (pipliggins, May 13, 2024)
7a456ed  Multiple rows of encounter can be read in & out (pipliggins, May 14, 2024)
21e777b  Draft one-to-many conversion for observation (pipliggins, May 14, 2024)
cfadee9  Update overwritten cleanup() func in remaining classes (pipliggins, May 15, 2024)
b591be9  Start condensing ingestion code (pipliggins, May 15, 2024)
c142643  Create generic data conversion function for users (pipliggins, May 15, 2024)
6f9f4ad  Remove load_data functions (pipliggins, May 15, 2024)
e352003  Make fhirflat installable (pipliggins, May 15, 2024)
464dd67  Allow mappings from google sheets (pipliggins, May 15, 2024)
a49d78b  Update test workflow for package (pipliggins, May 15, 2024)
de6c177  Allow lists to be created during ingestion. (pipliggins, May 17, 2024)
234fd09  Improve references (pipliggins, May 20, 2024)
c524efe  Add race extension (pipliggins, May 20, 2024)
9fa116f  Misc fixes, add presenceAbsence and prespecifiedQuery extensions (pipliggins, May 20, 2024)
60e943b  Misc updates, now passes private checks on dengue data subset (pipliggins, May 22, 2024)
9bf774c  Fix some typehinting errors (pipliggins, May 22, 2024)
dd4afe9  Update init file (pipliggins, May 22, 2024)
bef7c35  Update some relative imports and fix different types test warning (pipliggins, May 22, 2024)
08eda15  Fix more types (pipliggins, May 23, 2024)
.github/workflows/test.yml (2 changes: 1 addition & 1 deletion)

@@ -27,7 +27,7 @@ jobs:
       python-version: "3.11"
   - name: Install dependencies
     run: |
-      python3 -m pip install -r requirements.txt
+      python3 -m pip install '.[test]'
   - name: Test with pytest
     run: |
       python3 -m pytest --cov
.gitignore (5 changes: 4 additions & 1 deletion)

@@ -68,4 +68,7 @@ coverage.xml
 .pytest_cache/
 cover/
 
-.DS_Store
+.DS_Store
+
+# tests on private data
+tests/tests_private/
fhirflat/__init__.py (16 changes: 16 additions & 0 deletions)

@@ -0,0 +1,16 @@
+from .resources import (
+    Condition,
+    Encounter,
+    Immunization,
+    Location,
+    MedicationAdministration,
+    MedicationStatement,
+    Observation,
+    Organization,
+    Patient,
+    Procedure,
+    ResearchSubject,
+    Specimen,
+)
+
+from .ingest import convert_data_to_flat
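For context, a minimal sketch of the import surface this new file creates, assuming the package is installed; convert_data_to_flat is defined in fhirflat/ingest.py, which is not shown in this diff:

    import fhirflat

    # resource classes are re-exported from fhirflat.resources
    print(fhirflat.Patient)
    print(fhirflat.Observation)

    # the ingestion entry point is re-exported from fhirflat.ingest
    print(fhirflat.convert_data_to_flat)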
fhirflat/fhir2flat.py (4 changes: 3 additions & 1 deletion)

@@ -208,7 +208,9 @@ def flattenExtensions(df: pd.DataFrame, extension: str) -> pd.DataFrame:
 
 def expand_and_redefine(df, extension):
 
-    def redefine(row: pd.Series, extension: str) -> pd.Series:
+    def redefine(
+        row: pd.Series | pd.DataFrame, extension: str
+    ) -> pd.Series | pd.DataFrame:
         """Expands out simple extensions and leaves complex ones as is.
         To be dealt with later in the pipeline."""
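A minimal sketch of why the signature is widened, assuming (the surrounding code is not shown in this diff) that a one-to-many extension arrives as a DataFrame while a simple extension still arrives as a Series:

    import pandas as pd

    def redefine_sketch(
        row: pd.Series | pd.DataFrame, extension: str
    ) -> pd.Series | pd.DataFrame:
        # hypothetical dispatch: recurse row-wise when handed a DataFrame
        if isinstance(row, pd.DataFrame):
            return row.apply(lambda r: redefine_sketch(r, extension), axis=1)
        return row  # the real function rewrites simple extensions here

    print(redefine_sketch(pd.Series({"url": "race"}), "race"))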
fhirflat/flat2fhir.py (144 changes: 102 additions & 42 deletions)

@@ -1,5 +1,9 @@
 # Converts FHIRflat files into FHIR resources
-from .util import group_keys, get_fhirtype, get_local_extension_type
+from .util import (
+    group_keys,
+    get_fhirtype,
+    get_local_extension_type,
+)
 from fhir.resources.quantity import Quantity
 from fhir.resources.codeableconcept import CodeableConcept
 from fhir.resources.period import Period
@@ -9,13 +13,33 @@
 from fhir.resources.backbonetype import BackboneType as _BackboneType
 
 from pydantic.v1.error_wrappers import ValidationError
+from pydantic.v1 import BaseModel
 
 
 def create_codeable_concept(
     old_dict: dict[str, list[str] | str], name: str
 ) -> dict[str, list[str]]:
     """Re-creates a codeableConcept structure from the FHIRflat representation."""
-    codes = old_dict.get(name + ".code")
+
+    # for reading in from ingestion pipeline
+    if name + ".code" in old_dict and name + ".system" in old_dict:
+        raw_codes = old_dict.get(name + ".code")
+        if not isinstance(raw_codes, list):
+            formatted_code = (
+                raw_codes if isinstance(raw_codes, str) else str(int(raw_codes))
+            )
+            codes = [old_dict[name + ".system"] + "|" + formatted_code]
+        else:
+            formatted_code = [
+                c if isinstance(c, str) else str(int(c)) for c in raw_codes
+            ]
+            codes = [
+                [s + "|" + c]
+                for s, c in zip(old_dict[name + ".system"], formatted_code)
+            ]
+    else:
+        # From FHIRflat file
+        codes = old_dict.get(name + ".code")
 
     if codes is None:
         return {
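To make the two branches concrete, here are illustrative inputs and the codes each produces; the values are hypothetical, not taken from the test suite:

    # ingestion pipeline: separate ".code" / ".system" keys; numeric codes
    # (e.g. floats read from a CSV) are coerced via str(int(...))
    ingestion_row = {"code.code": 38362002.0, "code.system": "http://snomed.info/sct"}
    # -> codes == ["http://snomed.info/sct|38362002"]

    # FHIRflat file: the code column already stores "system|code" strings
    fhirflat_row = {"code.code": ["http://snomed.info/sct|38362002"]}
    # -> codes == ["http://snomed.info/sct|38362002"]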
@@ -60,9 +84,14 @@ def createQuantity(df, group):
     for attribute in df.keys():
         attr = attribute.split(".")[-1]
         if attr == "code":
-            system, code = df[group + ".code"].split("|")
-            quant["code"] = code
-            quant["system"] = system
+            if group + ".system" in df.keys():
+                # reading in from ingestion pipeline
+                quant["code"] = df[group + ".code"]
+                quant["system"] = df[group + ".system"]
+            else:
+                system, code = df[group + ".code"].split("|")
+                quant["code"] = code
+                quant["system"] = system
         else:
             quant[attr] = df[group + "." + attr]

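The same dual path in createQuantity, as a self-contained sketch; a plain dict stands in for the pandas row the real function receives, and quant is assumed to start empty:

    def quantity_sketch(df: dict, group: str) -> dict:
        quant = {}
        for attribute in df:
            attr = attribute.split(".")[-1]
            if attr == "code":
                if group + ".system" in df:
                    # ingestion pipeline: code and system in separate columns
                    quant["code"] = df[group + ".code"]
                    quant["system"] = df[group + ".system"]
                else:
                    # FHIRflat file: "system|code" packed into one column
                    system, code = df[group + ".code"].split("|")
                    quant["code"] = code
                    quant["system"] = system
            else:
                quant[attr] = df[group + "." + attr]
        return quant

    print(quantity_sketch({"dose.value": 5, "dose.code": "http://unitsofmeasure.org|mg"}, "dose"))
    # {'value': 5, 'code': 'mg', 'system': 'http://unitsofmeasure.org'}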
@@ -130,12 +159,53 @@ def set_datatypes(k, v_dict, klass) -> dict:
         }
 
         data_type = prop[value_type[0]]["type"]
-        data_class = get_fhirtype(data_type)
-        return {"url": k, f"{value_type[0]}": set_datatypes(k, v_dict, data_class)}
+        try:
+            data_class = get_fhirtype(data_type)
+            return {"url": k, f"{value_type[0]}": set_datatypes(k, v_dict, data_class)}
+        except AttributeError:
+            # datatype should be a primitive
+            return {"url": k, f"{value_type[0]}": v_dict[k]}
 
     return {s.split(".", 1)[1]: v_dict[s] for s in v_dict}

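The new try/except matters for extensions whose value type is a primitive rather than a FHIR complex type. A stub illustration of the control flow; get_fhirtype_stub and the extension names are hypothetical, while the real get_fhirtype lives in fhirflat/util.py:

    def get_fhirtype_stub(name: str):
        complex_types = {}  # the real lookup resolves fhir.resources classes
        try:
            return complex_types[name]
        except KeyError as exc:
            # mirrors get_fhirtype raising AttributeError for primitives
            raise AttributeError(name) from exc

    def wrap_value(url: str, value_key: str, data_type: str, value):
        try:
            get_fhirtype_stub(data_type)
            return {"url": url, value_key: "<nested, built via set_datatypes>"}
        except AttributeError:
            # primitive (e.g. a bare code or string): pass the raw value through
            return {"url": url, value_key: value}

    print(wrap_value("birthSex", "valueCode", "code", "F"))
    # {'url': 'birthSex', 'valueCode': 'F'}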

+def find_data_class(data_class: list[BaseModel] | BaseModel, k: str) -> BaseModel:
+    """
+    Finds the type class for item k within the data class.
+
+    Parameters
+    ----------
+    data_class: list[BaseModel] or BaseModel
+        The data class to search within. If a list, the function will search for
+        a class with a matching title to k.
+    k: str
+        The property to search for within the data class.
+    """
+
+    if isinstance(data_class, list):
+        title_matches = [k.lower() == c.schema()["title"].lower() for c in data_class]
+        result = [x for x, y in zip(data_class, title_matches) if y]
+        if len(result) == 1:
+            return get_fhirtype(k)
+        else:
+            raise ValueError(f"Couldn't find a matching class for {k} in {data_class}")
+
+    else:
+        k_schema = data_class.schema()["properties"].get(k)
+
+        base_class = (
+            k_schema.get("items").get("type")
+            if k_schema.get("items") is not None
+            else k_schema.get("type")
+        )
+
+        if base_class is None:
+            assert k_schema.get("type") == "array"
+
+            base_class = [opt.get("type") for opt in k_schema["items"]["anyOf"]]
+        return get_fhirtype(base_class)

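A hedged usage sketch for the new helper, assuming fhir.resources' pydantic v1 schemas expose the type names the lookup relies on:

    from fhir.resources.observation import Observation
    from fhirflat.flat2fhir import find_data_class

    # Observation.schema()["properties"]["code"] names a single complex type,
    # so this should resolve to the CodeableConcept class
    print(find_data_class(Observation, "code"))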
 def expand_concepts(data: dict, data_class: type[_DomainResource]) -> dict:
     """
     Combines columns containing flattened FHIR concepts back into
@@ -146,36 +216,7 @@ def expand_concepts(data: dict, data_class: type[_DomainResource]) -> dict:
 
     for k in groups.keys():
 
-        if isinstance(data_class, list):
-            title_matches = [
-                k.lower() == c.schema()["title"].lower() for c in data_class
-            ]
-            result = [x for x, y in zip(data_class, title_matches) if y]
-            if len(result) == 1:
-                group_classes[k] = k
-                continue
-            else:
-                raise ValueError(
-                    f"Couldn't find a matching class for {k} in {data_class}"
-                )
-
-        else:
-            k_schema = data_class.schema()["properties"].get(k)
-
-            group_classes[k] = (
-                k_schema.get("items").get("type")
-                if k_schema.get("items") is not None
-                else k_schema.get("type")
-            )
-
-            if group_classes[k] is None:
-                assert k_schema.get("type") == "array"
-
-                group_classes[k] = [
-                    opt.get("type") for opt in k_schema["items"]["anyOf"]
-                ]
-
-    group_classes = {k: get_fhirtype(v) for k, v in group_classes.items()}
+        group_classes[k] = find_data_class(data_class, k)
 
     expanded = {}
     keys_to_replace = []
@@ -193,15 +234,34 @@ def expand_concepts(data: dict, data_class: type[_DomainResource]) -> dict:
         if all(isinstance(v, dict) for v in v_dict.values()):
             # coming back out of nested recursion
             expanded[k] = {s.split(".", 1)[1]: v_dict[s] for s in v_dict}
-            if data_class.schema()["properties"][k].get("type") == "array":
-                if k == "extension":
-                    expanded[k] = [v for v in expanded[k].values()]
-                else:
-                    expanded[k] = [expanded[k]]
+
+        elif any(isinstance(v, dict) for v in v_dict.values()) and isinstance(
+            group_classes[k], list
+        ):
+            # extensions, where some classes are just values and others have codes etc
+            non_dict_items = {
+                k: v for k, v in v_dict.items() if not isinstance(v, dict)
+            }
+            stripped_dict = {
+                s.split(".", 1)[1]: non_dict_items[s] for s in non_dict_items.keys()
+            }
+            for k1, v1 in stripped_dict.items():
+                klass = find_data_class(group_classes[k], k1)
+                v_dict[k + "." + k1] = set_datatypes(k1, {k1: v1}, klass)
+
+            expanded[k] = {s.split(".", 1)[1]: v_dict[s] for s in v_dict}
 
         else:
             expanded[k] = set_datatypes(k, v_dict, group_classes[k])
 
+        if isinstance(data_class, list):
+            continue
+        elif data_class.schema()["properties"][k].get("type") == "array":
+            if k == "extension":
+                expanded[k] = [v for v in expanded[k].values()]
+            else:
+                expanded[k] = [expanded[k]]
+
     dense_cols = {
         k: k.removesuffix("_dense") for k in data.keys() if k.endswith("_dense")
     }
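An illustrative intermediate state for the new elif branch; the extension names here are hypothetical. One group member has already been expanded to a dict by the recursion, while the other is still a bare value and gets routed through find_data_class and set_datatypes before the whole group is re-nested:

    v_dict = {
        # already a dict: left as-is by this branch
        "extension.relativePeriod": {"relativePeriod.relativeStart": 2},
        # bare value: wrapped via find_data_class + set_datatypes
        "extension.approximateDate": "2023-04",
    }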