Draft attempt at ingestion using mapping file #24

Merged May 29, 2024, after 21 commits. The diff below shows changes from 20 of the 21 commits.

Commits:
b7f0848  Draft attempt at ingestion using mapping file (pipliggins, May 8, 2024)
dd61500  Ignore duplicate keys if values are the same (pipliggins, May 13, 2024)
a80c963  Improve single row 1:1 test without output comparison (pipliggins, May 13, 2024)
7a456ed  Multiple rows of encounter can be read in & out (pipliggins, May 14, 2024)
21e777b  Draft one-to-many conversion for observation (pipliggins, May 14, 2024)
cfadee9  Update overwritten cleanup() func in remaining classes (pipliggins, May 15, 2024)
b591be9  Start condensing ingestion code (pipliggins, May 15, 2024)
c142643  Create generic data conversion function for users (pipliggins, May 15, 2024)
6f9f4ad  Remove load_data functions (pipliggins, May 15, 2024)
e352003  Make fhirflat installable (pipliggins, May 15, 2024)
464dd67  Allow mappings from google sheets (pipliggins, May 15, 2024)
a49d78b  Update test workflow for package (pipliggins, May 15, 2024)
de6c177  Allow lists to be created during ingestion. (pipliggins, May 17, 2024)
234fd09  Improve references (pipliggins, May 20, 2024)
c524efe  Add race extension (pipliggins, May 20, 2024)
9fa116f  Misc fixes, add presenceAbsence and prespecifiedQuery extensions (pipliggins, May 20, 2024)
60e943b  Misc updates, now passes private checks on dengue data subset (pipliggins, May 22, 2024)
9bf774c  Fix some typehinting errors (pipliggins, May 22, 2024)
dd4afe9  Update init file (pipliggins, May 22, 2024)
bef7c35  Update some relative imports and fix different types test warning (pipliggins, May 22, 2024)
08eda15  Fix more types (pipliggins, May 23, 2024)
.github/workflows/test.yml (2 changes: 1 addition & 1 deletion)

@@ -27,7 +27,7 @@ jobs:
       python-version: "3.11"
   - name: Install dependencies
     run: |
-      python3 -m pip install -r requirements.txt
+      python3 -m pip install '.[test]'
   - name: Test with pytest
     run: |
       python3 -m pytest --cov
.gitignore (5 changes: 4 additions & 1 deletion)

@@ -68,4 +68,7 @@ coverage.xml
 .pytest_cache/
 cover/
 
-.DS_Store
+.DS_Store
+
+# tests on private data
+tests/tests_private/
fhirflat/__init__.py (16 changes: 16 additions & 0 deletions)

@@ -0,0 +1,16 @@
+from .resources import (
+    Condition,
+    Encounter,
+    Immunization,
+    Location,
+    MedicationAdministration,
+    MedicationStatement,
+    Observation,
+    Organization,
+    Patient,
+    Procedure,
+    ResearchSubject,
+    Specimen,
+)
+
+from .ingest import convert_data_to_flat
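For context, a minimal sketch of the import surface this new file creates, assuming the package is installed; convert_data_to_flat is defined in fhirflat/ingest.py, which is not shown in this diff:

    import fhirflat

    # resource classes are re-exported from fhirflat.resources
    print(fhirflat.Patient)
    print(fhirflat.Observation)

    # the ingestion entry point is re-exported from fhirflat.ingest
    print(fhirflat.convert_data_to_flat)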
fhirflat/fhir2flat.py (4 changes: 3 additions & 1 deletion)

@@ -208,7 +208,9 @@ def flattenExtensions(df: pd.DataFrame, extension: str) -> pd.DataFrame:
 
 def expand_and_redefine(df, extension):
 
-    def redefine(row: pd.Series, extension: str) -> pd.Series:
+    def redefine(
+        row: pd.Series | pd.DataFrame, extension: str
+    ) -> pd.Series | pd.DataFrame:
         """Expands out simple extensions and leaves complex ones as is.
         To be dealt with later in the pipeline."""
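A minimal sketch of why the signature is widened, assuming (the surrounding code is not shown in this diff) that a one-to-many extension arrives as a DataFrame while a simple extension still arrives as a Series:

    import pandas as pd

    def redefine_sketch(
        row: pd.Series | pd.DataFrame, extension: str
    ) -> pd.Series | pd.DataFrame:
        # hypothetical dispatch: recurse row-wise when handed a DataFrame
        if isinstance(row, pd.DataFrame):
            return row.apply(lambda r: redefine_sketch(r, extension), axis=1)
        return row  # the real function rewrites simple extensions here

    print(redefine_sketch(pd.Series({"url": "race"}), "race"))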
fhirflat/flat2fhir.py (144 changes: 102 additions & 42 deletions)

@@ -1,5 +1,9 @@
 # Converts FHIRflat files into FHIR resources
-from .util import group_keys, get_fhirtype, get_local_extension_type
+from .util import (
+    group_keys,
+    get_fhirtype,
+    get_local_extension_type,
+)
 from fhir.resources.quantity import Quantity
 from fhir.resources.codeableconcept import CodeableConcept
 from fhir.resources.period import Period
@@ -9,13 +13,33 @@
 from fhir.resources.backbonetype import BackboneType as _BackboneType
 
 from pydantic.v1.error_wrappers import ValidationError
+from pydantic.v1 import BaseModel
 
 
 def create_codeable_concept(
     old_dict: dict[str, list[str] | str], name: str
 ) -> dict[str, list[str]]:
     """Re-creates a codeableConcept structure from the FHIRflat representation."""
-    codes = old_dict.get(name + ".code")
+
+    # for reading in from ingestion pipeline
+    if name + ".code" in old_dict and name + ".system" in old_dict:
+        raw_codes = old_dict.get(name + ".code")
+        if not isinstance(raw_codes, list):
+            formatted_code = (
+                raw_codes if isinstance(raw_codes, str) else str(int(raw_codes))
+            )
+            codes = [old_dict[name + ".system"] + "|" + formatted_code]
+        else:
+            formatted_code = [
+                c if isinstance(c, str) else str(int(c)) for c in raw_codes
+            ]
+            codes = [
+                [s + "|" + c]
+                for s, c in zip(old_dict[name + ".system"], formatted_code)
+            ]
+    else:
+        # From FHIRflat file
+        codes = old_dict.get(name + ".code")
 
     if codes is None:
         return {
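To make the two branches concrete, here are illustrative inputs and the codes each produces; the values are hypothetical, not taken from the test suite:

    # ingestion pipeline: separate ".code" / ".system" keys; numeric codes
    # (e.g. floats read from a CSV) are coerced via str(int(...))
    ingestion_row = {"code.code": 38362002.0, "code.system": "http://snomed.info/sct"}
    # -> codes == ["http://snomed.info/sct|38362002"]

    # FHIRflat file: the code column already stores "system|code" strings
    fhirflat_row = {"code.code": ["http://snomed.info/sct|38362002"]}
    # -> codes == ["http://snomed.info/sct|38362002"]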
@@ -60,9 +84,14 @@ def createQuantity(df, group):
     for attribute in df.keys():
         attr = attribute.split(".")[-1]
         if attr == "code":
-            system, code = df[group + ".code"].split("|")
-            quant["code"] = code
-            quant["system"] = system
+            if group + ".system" in df.keys():
+                # reading in from ingestion pipeline
+                quant["code"] = df[group + ".code"]
+                quant["system"] = df[group + ".system"]
+            else:
+                system, code = df[group + ".code"].split("|")
+                quant["code"] = code
+                quant["system"] = system
         else:
             quant[attr] = df[group + "." + attr]

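The same dual path in createQuantity, as a self-contained sketch; a plain dict stands in for the pandas row the real function receives, and quant is assumed to start empty:

    def quantity_sketch(df: dict, group: str) -> dict:
        quant = {}
        for attribute in df:
            attr = attribute.split(".")[-1]
            if attr == "code":
                if group + ".system" in df:
                    # ingestion pipeline: code and system in separate columns
                    quant["code"] = df[group + ".code"]
                    quant["system"] = df[group + ".system"]
                else:
                    # FHIRflat file: "system|code" packed into one column
                    system, code = df[group + ".code"].split("|")
                    quant["code"] = code
                    quant["system"] = system
            else:
                quant[attr] = df[group + "." + attr]
        return quant

    print(quantity_sketch({"dose.value": 5, "dose.code": "http://unitsofmeasure.org|mg"}, "dose"))
    # {'value': 5, 'code': 'mg', 'system': 'http://unitsofmeasure.org'}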
@@ -130,12 +159,53 @@ def set_datatypes(k, v_dict, klass) -> dict:
         }
 
         data_type = prop[value_type[0]]["type"]
-        data_class = get_fhirtype(data_type)
-        return {"url": k, f"{value_type[0]}": set_datatypes(k, v_dict, data_class)}
+        try:
+            data_class = get_fhirtype(data_type)
+            return {"url": k, f"{value_type[0]}": set_datatypes(k, v_dict, data_class)}
+        except AttributeError:
+            # datatype should be a primitive
+            return {"url": k, f"{value_type[0]}": v_dict[k]}
 
     return {s.split(".", 1)[1]: v_dict[s] for s in v_dict}

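The new try/except matters for extensions whose value type is a primitive rather than a FHIR complex type. A stub illustration of the control flow; get_fhirtype_stub and the extension names are hypothetical, while the real get_fhirtype lives in fhirflat/util.py:

    def get_fhirtype_stub(name: str):
        complex_types = {}  # the real lookup resolves fhir.resources classes
        try:
            return complex_types[name]
        except KeyError as exc:
            # mirrors get_fhirtype raising AttributeError for primitives
            raise AttributeError(name) from exc

    def wrap_value(url: str, value_key: str, data_type: str, value):
        try:
            get_fhirtype_stub(data_type)
            return {"url": url, value_key: "<nested, built via set_datatypes>"}
        except AttributeError:
            # primitive (e.g. a bare code or string): pass the raw value through
            return {"url": url, value_key: value}

    print(wrap_value("birthSex", "valueCode", "code", "F"))
    # {'url': 'birthSex', 'valueCode': 'F'}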

+def find_data_class(data_class: list[BaseModel] | BaseModel, k: str) -> BaseModel:
+    """
+    Finds the type class for item k within the data class.
+
+    Parameters
+    ----------
+    data_class: list[BaseModel] or BaseModel
+        The data class to search within. If a list, the function will search for
+        a class with a matching title to k.
+    k: str
+        The property to search for within the data class.
+    """
+
+    if isinstance(data_class, list):
+        title_matches = [k.lower() == c.schema()["title"].lower() for c in data_class]
+        result = [x for x, y in zip(data_class, title_matches) if y]
+        if len(result) == 1:
+            return get_fhirtype(k)
+        else:
+            raise ValueError(f"Couldn't find a matching class for {k} in {data_class}")
+
+    else:
+        k_schema = data_class.schema()["properties"].get(k)
+
+        base_class = (
+            k_schema.get("items").get("type")
+            if k_schema.get("items") is not None
+            else k_schema.get("type")
+        )
+
+        if base_class is None:
+            assert k_schema.get("type") == "array"
+
+            base_class = [opt.get("type") for opt in k_schema["items"]["anyOf"]]
+        return get_fhirtype(base_class)

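A hedged usage sketch for the new helper, assuming fhir.resources' pydantic v1 schemas expose the type names the lookup relies on:

    from fhir.resources.observation import Observation
    from fhirflat.flat2fhir import find_data_class

    # Observation.schema()["properties"]["code"] names a single complex type,
    # so this should resolve to the CodeableConcept class
    print(find_data_class(Observation, "code"))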
 def expand_concepts(data: dict, data_class: type[_DomainResource]) -> dict:
     """
     Combines columns containing flattened FHIR concepts back into
@@ -146,36 +216,7 @@ def expand_concepts(data: dict, data_class: type[_DomainResource]) -> dict:
 
     for k in groups.keys():
 
-        if isinstance(data_class, list):
-            title_matches = [
-                k.lower() == c.schema()["title"].lower() for c in data_class
-            ]
-            result = [x for x, y in zip(data_class, title_matches) if y]
-            if len(result) == 1:
-                group_classes[k] = k
-                continue
-            else:
-                raise ValueError(
-                    f"Couldn't find a matching class for {k} in {data_class}"
-                )
-
-        else:
-            k_schema = data_class.schema()["properties"].get(k)
-
-            group_classes[k] = (
-                k_schema.get("items").get("type")
-                if k_schema.get("items") is not None
-                else k_schema.get("type")
-            )
-
-            if group_classes[k] is None:
-                assert k_schema.get("type") == "array"
-
-                group_classes[k] = [
-                    opt.get("type") for opt in k_schema["items"]["anyOf"]
-                ]
-
-    group_classes = {k: get_fhirtype(v) for k, v in group_classes.items()}
+        group_classes[k] = find_data_class(data_class, k)
 
     expanded = {}
     keys_to_replace = []
@@ -193,15 +234,34 @@ def expand_concepts(data: dict, data_class: type[_DomainResource]) -> dict:
         if all(isinstance(v, dict) for v in v_dict.values()):
             # coming back out of nested recursion
             expanded[k] = {s.split(".", 1)[1]: v_dict[s] for s in v_dict}
-            if data_class.schema()["properties"][k].get("type") == "array":
-                if k == "extension":
-                    expanded[k] = [v for v in expanded[k].values()]
-                else:
-                    expanded[k] = [expanded[k]]
+
+        elif any(isinstance(v, dict) for v in v_dict.values()) and isinstance(
+            group_classes[k], list
+        ):
+            # extensions, where some classes are just values and others have codes etc
+            non_dict_items = {
+                k: v for k, v in v_dict.items() if not isinstance(v, dict)
+            }
+            stripped_dict = {
+                s.split(".", 1)[1]: non_dict_items[s] for s in non_dict_items.keys()
+            }
+            for k1, v1 in stripped_dict.items():
+                klass = find_data_class(group_classes[k], k1)
+                v_dict[k + "." + k1] = set_datatypes(k1, {k1: v1}, klass)
+
+            expanded[k] = {s.split(".", 1)[1]: v_dict[s] for s in v_dict}
 
         else:
             expanded[k] = set_datatypes(k, v_dict, group_classes[k])
 
+        if isinstance(data_class, list):
+            continue
+        elif data_class.schema()["properties"][k].get("type") == "array":
+            if k == "extension":
+                expanded[k] = [v for v in expanded[k].values()]
+            else:
+                expanded[k] = [expanded[k]]
+
     dense_cols = {
         k: k.removesuffix("_dense") for k in data.keys() if k.endswith("_dense")
     }
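An illustrative intermediate state for the new elif branch; the extension names here are hypothetical. One group member has already been expanded to a dict by the recursion, while the other is still a bare value and gets routed through find_data_class and set_datatypes before the whole group is re-nested:

    v_dict = {
        # already a dict: left as-is by this branch
        "extension.relativePeriod": {"relativePeriod.relativeStart": 2},
        # bare value: wrapped via find_data_class + set_datatypes
        "extension.approximateDate": "2023-04",
    }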