diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index b17ce45..a8ad5e7 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -27,7 +27,7 @@ jobs: python-version: "3.11" - name: Install dependencies run: | - python3 -m pip install -r requirements.txt + python3 -m pip install '.[test]' - name: Test with pytest run: | python3 -m pytest --cov diff --git a/.gitignore b/.gitignore index ad56c74..c150b65 100644 --- a/.gitignore +++ b/.gitignore @@ -68,4 +68,7 @@ coverage.xml .pytest_cache/ cover/ -.DS_Store \ No newline at end of file +.DS_Store + +# tests on private data +tests/tests_private/ \ No newline at end of file diff --git a/fhirflat/__init__.py b/fhirflat/__init__.py new file mode 100644 index 0000000..701e314 --- /dev/null +++ b/fhirflat/__init__.py @@ -0,0 +1,16 @@ +from .resources import ( + Condition, + Encounter, + Immunization, + Location, + MedicationAdministration, + MedicationStatement, + Observation, + Organization, + Patient, + Procedure, + ResearchSubject, + Specimen, +) + +from .ingest import convert_data_to_flat diff --git a/fhirflat/fhir2flat.py b/fhirflat/fhir2flat.py index a278d4c..31df281 100644 --- a/fhirflat/fhir2flat.py +++ b/fhirflat/fhir2flat.py @@ -208,7 +208,9 @@ def flattenExtensions(df: pd.DataFrame, extension: str) -> pd.DataFrame: def expand_and_redefine(df, extension): - def redefine(row: pd.Series, extension: str) -> pd.Series: + def redefine( + row: pd.Series | pd.DataFrame, extension: str + ) -> pd.Series | pd.DataFrame: """Expands out simple extensions and leaves complex ones as is. To be dealt with later in the pipeline.""" diff --git a/fhirflat/flat2fhir.py b/fhirflat/flat2fhir.py index 62dcf2f..2d87448 100644 --- a/fhirflat/flat2fhir.py +++ b/fhirflat/flat2fhir.py @@ -1,5 +1,9 @@ # Converts FHIRflat files into FHIR resources -from .util import group_keys, get_fhirtype, get_local_extension_type +from .util import ( + group_keys, + get_fhirtype, + get_local_extension_type, +) from fhir.resources.quantity import Quantity from fhir.resources.codeableconcept import CodeableConcept from fhir.resources.period import Period @@ -9,13 +13,33 @@ from fhir.resources.backbonetype import BackboneType as _BackboneType from pydantic.v1.error_wrappers import ValidationError +from pydantic.v1 import BaseModel def create_codeable_concept( - old_dict: dict[str, list[str] | str], name: str + old_dict: dict[str, list[str] | str | float], name: str ) -> dict[str, list[str]]: """Re-creates a codeableConcept structure from the FHIRflat representation.""" - codes = old_dict.get(name + ".code") + + # for reading in from ingestion pipeline + if name + ".code" in old_dict and name + ".system" in old_dict: + raw_codes: str | float | list[str] = old_dict.get(name + ".code") + if not isinstance(raw_codes, list): + formatted_code = ( + raw_codes if isinstance(raw_codes, str) else str(int(raw_codes)) + ) + codes = [old_dict[name + ".system"] + "|" + formatted_code] + else: + formatted_codes = [ + c if isinstance(c, str) else str(int(c)) for c in raw_codes + ] + codes = [ + [s + "|" + c] + for s, c in zip(old_dict[name + ".system"], formatted_codes) + ] + else: + # From FHIRflat file + codes = old_dict.get(name + ".code") if codes is None: return { @@ -60,9 +84,14 @@ def createQuantity(df, group): for attribute in df.keys(): attr = attribute.split(".")[-1] if attr == "code": - system, code = df[group + ".code"].split("|") - quant["code"] = code - quant["system"] = system + if group + ".system" in df.keys(): + # reading in from ingestion 
pipeline + quant["code"] = df[group + ".code"] + quant["system"] = df[group + ".system"] + else: + system, code = df[group + ".code"].split("|") + quant["code"] = code + quant["system"] = system else: quant[attr] = df[group + "." + attr] @@ -130,52 +159,64 @@ def set_datatypes(k, v_dict, klass) -> dict: } data_type = prop[value_type[0]]["type"] - data_class = get_fhirtype(data_type) - return {"url": k, f"{value_type[0]}": set_datatypes(k, v_dict, data_class)} + try: + data_class = get_fhirtype(data_type) + return {"url": k, f"{value_type[0]}": set_datatypes(k, v_dict, data_class)} + except AttributeError: + # datatype should be a primitive + return {"url": k, f"{value_type[0]}": v_dict[k]} return {s.split(".", 1)[1]: v_dict[s] for s in v_dict} -def expand_concepts(data: dict, data_class: type[_DomainResource]) -> dict: +def find_data_class(data_class: list[BaseModel] | BaseModel, k: str) -> BaseModel: """ - Combines columns containing flattened FHIR concepts back into - JSON-like structures. + Finds the type class for item k within the data class. + + Parameters + ---------- + data_class: list[BaseModel] or BaseModel + The data class to search within. If a list, the function will search for + a class whose title matches k. + k: str + The property to search for within the data class """ - groups = group_keys(data.keys()) - group_classes = {} - for k in groups.keys(): + if isinstance(data_class, list): + title_matches = [k.lower() == c.schema()["title"].lower() for c in data_class] + result = [x for x, y in zip(data_class, title_matches) if y] + if len(result) == 1: + return get_fhirtype(k) + else: + raise ValueError(f"Couldn't find a matching class for {k} in {data_class}") - if isinstance(data_class, list): - title_matches = [ - k.lower() == c.schema()["title"].lower() for c in data_class - ] - result = [x for x, y in zip(data_class, title_matches) if y] - if len(result) == 1: - group_classes[k] = k - continue - else: - raise ValueError( - f"Couldn't find a matching class for {k} in {data_class}" - ) + else: + k_schema = data_class.schema()["properties"].get(k) - else: - k_schema = data_class.schema()["properties"].get(k) + base_class = ( + k_schema.get("items").get("type") + if k_schema.get("items") is not None + else k_schema.get("type") + ) + + if base_class is None: + assert k_schema.get("type") == "array" + + base_class = [opt.get("type") for opt in k_schema["items"]["anyOf"]] + return get_fhirtype(base_class) - group_classes[k] = ( - k_schema.get("items").get("type") - if k_schema.get("items") is not None - else k_schema.get("type") - ) - if group_classes[k] is None: - assert k_schema.get("type") == "array" +def expand_concepts(data: dict[str, str], data_class: type[_DomainResource]) -> dict: + """ + Combines columns containing flattened FHIR concepts back into + JSON-like structures.
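+
+    For example (a minimal illustration): the flat keys
+    ``{"period.start": "2021-04-01", "period.end": "2021-04-10"}`` are
+    regrouped into ``{"period": {"start": "2021-04-01", "end": "2021-04-10"}}``.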
+ """ + groups = group_keys(data.keys()) + group_classes = {} - group_classes[k] = [ - opt.get("type") for opt in k_schema["items"]["anyOf"] - ] + for k in groups.keys(): - group_classes = {k: get_fhirtype(v) for k, v in group_classes.items()} + group_classes[k] = find_data_class(data_class, k) expanded = {} keys_to_replace = [] @@ -193,15 +234,34 @@ def expand_concepts(data: dict, data_class: type[_DomainResource]) -> dict: if all(isinstance(v, dict) for v in v_dict.values()): # coming back out of nested recursion expanded[k] = {s.split(".", 1)[1]: v_dict[s] for s in v_dict} - if data_class.schema()["properties"][k].get("type") == "array": - if k == "extension": - expanded[k] = [v for v in expanded[k].values()] - else: - expanded[k] = [expanded[k]] + + elif any(isinstance(v, dict) for v in v_dict.values()) and isinstance( + group_classes[k], list + ): + # extensions, where some classes are just values and others have codes etc + non_dict_items = { + k: v for k, v in v_dict.items() if not isinstance(v, dict) + } + stripped_dict = { + s.split(".", 1)[1]: non_dict_items[s] for s in non_dict_items.keys() + } + for k1, v1 in stripped_dict.items(): + klass = find_data_class(group_classes[k], k1) + v_dict[k + "." + k1] = set_datatypes(k1, {k1: v1}, klass) + + expanded[k] = {s.split(".", 1)[1]: v_dict[s] for s in v_dict} else: expanded[k] = set_datatypes(k, v_dict, group_classes[k]) + if isinstance(data_class, list): + continue + elif data_class.schema()["properties"][k].get("type") == "array": + if k == "extension": + expanded[k] = [v for v in expanded[k].values()] + else: + expanded[k] = [expanded[k]] + dense_cols = { k: k.removesuffix("_dense") for k in data.keys() if k.endswith("_dense") } diff --git a/fhirflat/ingest.py b/fhirflat/ingest.py new file mode 100644 index 0000000..4df3da4 --- /dev/null +++ b/fhirflat/ingest.py @@ -0,0 +1,338 @@ +""" +Stores the main functions for converting clinical data (initally from RedCap-ARCH) to +FHIRflat. +""" + +import pandas as pd +import numpy as np +import warnings +import os +from math import isnan +from fhirflat.util import get_local_resource + +# 1:1 (single row, single resource) mapping: Patient, Encounter +# 1:M (single row, multiple resources) mapping: Observation, Condition, Procedure, ... + +""" +TODO +* sort out how to choose ID's e.g. location within encounter etc +* cope with 'if' statements - e.g. for date overwriting. +* deal with how to check if lists are appropriate when adding multiple values to a + single field - list options. +* Consider using pandarallel (https://pypi.org/project/pandarallel/) to parallelize + the apply function, particularly for one to many mappings. +""" + + +def find_field_value(row, response, mapp, raw_data=None): + """ + Returns the data for a given field, given the mapping. + For one to many resources the raw data is provided to allow for searching for other + fields than in the melted data. 
+ """ + if mapp == "": + return response + elif "+" in mapp: + mapp = mapp.split("+") + results = [find_field_value(row, response, m, raw_data) for m in mapp] + results = [str(x) for x in results if not (isinstance(x, float) and isnan(x))] + return " ".join(results) if "/" not in results[0] else "".join(results) + elif "if not" in mapp: + mapp = mapp.replace(" ", "").split("ifnot") + results = [find_field_value(row, response, m, raw_data) for m in mapp] + x, y = results + if isinstance(y, float): + return x if isnan(y) else None + else: + return x if not y else None + elif "<" in mapp: + col = mapp.lstrip("<").rstrip(">") + try: + return row[col] + except KeyError: + if raw_data is not None: + try: + return raw_data.loc[row["index"], col] + except KeyError: + raise KeyError(f"Column {col} not found in data") + else: + raise KeyError(f"Column {col} not found in the filtered data") + else: + return mapp + + +def create_dict_wide(row: pd.Series, map_df: pd.DataFrame) -> dict: + """ + Takes a wide-format dataframe and iterates through the columns of the row, + applying the mapping to each column and produces a fhirflat-like dictionary to + initialize the resource object for each row. + """ + + result: dict = {} + for column in row.index: + if column in map_df.index.get_level_values(0): + response = row[column] + if pd.notna(response): # Ensure there is a response to map + try: + # Retrieve the mapping for the given column and response + if pd.isna(map_df.loc[column].index).all(): + mapping = map_df.loc[(column, np.nan)].dropna() + else: + mapping = map_df.loc[(column, str(int(response)))].dropna() + snippet = { + k: ( + v + if "<" not in str(v) + else find_field_value(row, response, v) + ) + for k, v in mapping.items() + } + except KeyError: + # No mapping found for this column and response despite presence + # in mapping file + warnings.warn( + f"No mapping for column {column} response {response}", + UserWarning, + ) + continue + else: + continue + else: + raise ValueError(f"Column {column} not found in mapping file") + duplicate_keys = set(result.keys()).intersection(snippet.keys()) + if not duplicate_keys: + result = result | snippet + else: + if all( + result[key] == snippet[key] for key in duplicate_keys + ): # Ignore duplicates if they are the same + continue + elif all(result[key] is None for key in duplicate_keys): + result.update(snippet) + else: + for key in duplicate_keys: + if isinstance(result[key], list): + result[key].append(snippet[key]) + else: + result[key] = [result[key], snippet[key]] + return result + + +def create_dict_long( + row: pd.Series, full_df: pd.DataFrame, map_df: pd.DataFrame +) -> dict | None: + """ + Takes a long-format dataframe and a mapping file, and produces a fhirflat-like + dictionary for each row in the dataframe. 
+ """ + + column = row["column"] + response = row["value"] + if pd.notna(response): # Ensure there is a response to map + try: + # Retrieve the mapping for the given column and response + if pd.isna(map_df.loc[column].index).all(): + mapping = map_df.loc[(column, np.nan)].dropna() + else: + mapping = map_df.loc[(column, str(int(response)))].dropna() + snippet = { + k: ( + v + if "<" not in str(v) + else find_field_value(row, response, v, raw_data=full_df) + ) + for k, v in mapping.items() + } + return snippet + except KeyError: + # No mapping found for this column and response despite presence + # in mapping file + warnings.warn( + f"No mapping for column {column} response {response}", + UserWarning, + ) + return None + + +def create_dictionary( + data_file: str, + map_file: str, + resource: str, + one_to_one=False, + subject_id="subjid", +) -> pd.DataFrame | None: + """ + Given a data file and a single mapping file for one FHIR resource type, + returns a single column dataframe with the mapped data in a FHIRflat-like + format, ready for further processing. + + Parameters + ---------- + data: str + The path to the data file containing the clinical data. + map_file: pd.DataFrame + The path to the mapping file containing the mapping of the clinical data to the + FHIR resource. + resource: str + The name of the resource being mapped. + one_to_one: bool + Whether the resource should be mapped as one-to-one or one-to-many. + subject_id: str + The name of the column containing the subject ID in the data file. + """ + + data: pd.DataFrame = pd.read_csv(data_file, header=0) + map_df: pd.DataFrame = pd.read_csv(map_file, header=0) + + # setup the data ----------------------------------------------------------- + relevant_cols = map_df["raw_variable"].dropna().unique() + filtered_data = data.loc[:, data.columns.isin(relevant_cols)].copy() + + if filtered_data.empty: + warnings.warn(f"No data found for the {resource} resource.", UserWarning) + return None + + if one_to_one: + + def condense(x): + """ + In case where data is actually multi-row per subject, condenses the relevant + data into a single row for 1:1 mapping. 
+ """ + + # Check if the column contains nan values + if x.isnull().any(): + # If the column contains a single non-nan value, return it + non_nan_values = x.dropna() + if non_nan_values.nunique() == 1: + return non_nan_values + elif non_nan_values.empty: + return np.nan + else: + raise ValueError("Multiple values found in one-to-one mapping") + else: + if len(x) == 1: + return x + else: + raise ValueError("Multiple values found in one-to-one mapping") + + filtered_data = filtered_data.groupby(subject_id, as_index=False).agg(condense) + + if not one_to_one: + filtered_data = filtered_data.reset_index() + melted_data = filtered_data.melt(id_vars="index", var_name="column") + + # set up the mappings ------------------------------------------------------- + + # Fills the na input variables with the previous value + map_df["raw_variable"] = map_df["raw_variable"].ffill() + + # strips the text answers out of the response column + map_df["raw_response"] = map_df["raw_response"].apply( + lambda x: x.split(",")[0] if isinstance(x, str) else x + ) + + # Set multi-index for easier access + map_df.set_index(["raw_variable", "raw_response"], inplace=True) + + # Generate the flat_like dictionary + if one_to_one: + filtered_data["flat_dict"] = filtered_data.apply( + create_dict_wide, args=[map_df], axis=1 + ) + return filtered_data + else: + melted_data["flat_dict"] = melted_data.apply( + create_dict_long, args=[data, map_df], axis=1 + ) + return melted_data["flat_dict"].to_frame() + + +def convert_data_to_flat( + data: str, + folder_name: str, + mapping_files_types: tuple[dict, dict] | None = None, + sheet_id: str | None = None, + subject_id="subjid", +): + """ + Takes raw clinical data (currently assumed to be a one-row-per-patient format like + RedCap exports) and produces a folder of FHIRflat files, one per resource. Takes + either local mapping files, or a Google Sheet ID containing the mapping files. + + Parameters + ---------- + data: str + The path to the raw clinical data file. + folder_name: str + The name of the folder to store the FHIRflat files. + mapping_files_types: tuple[dict, dict] | None + A tuple containing two dictionaries, one with the mapping files for each + resource type and one with the mapping type (either one-to-one or one-to-many) + for each resource type. + sheet_id: str | None + The Google Sheet ID containing the mapping files. The first sheet must contain + the mapping types - one column listing the resource name, and another describing + whether the mapping is one-to-one or one-to-many. The subsequent sheets must + be named by resource, and contain the mapping for that resource. + subject_id: str + The name of the column containing the subject ID in the data file. 
+ """ + + if not mapping_files_types and not sheet_id: + raise TypeError("Either mapping_files_types or sheet_id must be provided") + + if not os.path.exists(folder_name): + os.makedirs(folder_name) + + if mapping_files_types: + mappings, types = mapping_files_types + else: + sheet_link = ( + f"https://docs.google.com/spreadsheets/d/{sheet_id}/export?format=csv" + ) + + df_types = pd.read_csv(sheet_link, header=0, index_col="Resources") + types = dict( + zip( + df_types.index, + df_types["Resource Type"], + ) + ) + sheet_keys = {r: df_types.loc[r, "Sheet ID"] for r in types.keys()} + mappings = { + get_local_resource(r): sheet_link + f"&gid={i}" + for r, i in sheet_keys.items() + } + + for resource, map_file in mappings.items(): + + t = types[resource.__name__] + if t == "one-to-one": + df = create_dictionary( + data, + map_file, + resource.__name__, + one_to_one=True, + subject_id=subject_id, + ) + if df is None: + continue + elif t == "one-to-many": + df = create_dictionary( + data, + map_file, + resource.__name__, + one_to_one=False, + subject_id=subject_id, + ) + if df is None: + continue + else: + df = df.dropna().reset_index(drop=True) + else: + raise ValueError(f"Unknown mapping type {t}") + + resource.ingest_to_flat( + df, os.path.join(folder_name, resource.__name__.lower()) + ) diff --git a/fhirflat/resources/__init__.py b/fhirflat/resources/__init__.py new file mode 100644 index 0000000..de3ba2b --- /dev/null +++ b/fhirflat/resources/__init__.py @@ -0,0 +1,12 @@ +from .condition import Condition +from .encounter import Encounter +from .immunization import Immunization +from .location import Location +from .medicationadministration import MedicationAdministration +from .medicationstatement import MedicationStatement +from .observation import Observation +from .organization import Organization +from .patient import Patient +from .procedure import Procedure +from .researchsubject import ResearchSubject +from .specimen import Specimen diff --git a/fhirflat/resources/base.py b/fhirflat/resources/base.py index 015a0b4..7719212 100644 --- a/fhirflat/resources/base.py +++ b/fhirflat/resources/base.py @@ -1,29 +1,36 @@ # from pydantic import BaseModel from __future__ import annotations -from fhir.resources.domainresource import DomainResource +from fhir.resources.domainresource import DomainResource as _DomainResource import pandas as pd import orjson -from ..fhir2flat import fhir2flat +from fhirflat.fhir2flat import fhir2flat +from fhirflat.flat2fhir import expand_concepts + from typing import TypeAlias, ClassVar JsonString: TypeAlias = str -class FHIRFlatBase(DomainResource): +class FHIRFlatBase(_DomainResource): + """ + Base class for FHIR resources to add FHIRflat functionality. 
+ """ - flat_exclusions: ClassVar[set[str]] = ( + flat_exclusions: ClassVar[set[str]] = { "meta", "implicitRules", "language", "text", "contained", "modifierExtension", - ) + } flat_defaults: ClassVar[list[str]] = [] + backbone_elements: ClassVar[dict] = {} + @classmethod def attr_lists(cls) -> list[str]: """Attributes which take a list of FHIR types.""" @@ -39,7 +46,7 @@ def flat_fields(cls) -> list[str]: return [x for x in cls.elements_sequence() if x not in cls.flat_exclusions] @classmethod - def cleanup(cls, data: JsonString) -> FHIRFlatBase: + def cleanup(cls, data: JsonString | dict, json_data=True) -> FHIRFlatBase: """ Load data into a dictionary-like structure, then apply resource-specific changes and unpack flattened data @@ -73,6 +80,91 @@ def from_flat(cls, file: str) -> FHIRFlatBase | list[FHIRFlatBase]: else: return list(df["fhir"]) + @classmethod + def ingest_backbone_elements(cls, mapped_data: pd.Series) -> pd.Series: + """ + Unflattens ordered lists of data and forms the correct FHIR format which won't + be flattened after ingestion (*_dense columns). + + Extends the flat2fhir.expand_concepts function specifically for data ingestion. + + Parameters + ---------- + mapped_data: pd.Series + Pandas series of FHIRflat-like dictionaries ready to be converted to FHIR + format. + + Returns + ------- + pd.Series + + """ + + def fhir_format(row: pd.Series) -> pd.Series: + for b_e, b_c in cls.backbone_elements.items(): + keys_present = [key for key in row if key.startswith(b_e)] + if keys_present: + condensed_dict = {k: row[k] for k in keys_present} + if all( + not isinstance(v, list) or len(v) == 1 + for v in condensed_dict.values() + ): + continue + else: + backbone_list = [] + for i in range(len(next(iter(condensed_dict.values())))): + first_item = { + k.lstrip(b_e + "."): v[i] + for k, v in condensed_dict.items() + } + backbone_list.append(expand_concepts(first_item, b_c)) + for k_d in condensed_dict: + row.pop(k_d) + row[b_e] = backbone_list + return row + + condensed_mapped_data = mapped_data.apply(fhir_format) + return condensed_mapped_data + + @classmethod + def ingest_to_flat(cls, data: pd.DataFrame, filename: str): + """ + Takes a pandas dataframe and populates the resource with the data. + Creates a FHIRflat parquet file for the resources. + + data: pd.DataFrame + Pandas dataframe containing the data + """ + + data.loc[:, "flat_dict"] = cls.ingest_backbone_elements(data["flat_dict"]) + + # Creates a columns of FHIR resource instances + data["fhir"] = data["flat_dict"].apply( + lambda x: cls.cleanup(x, json_data=False) + ) + + # flattens resources back out + flat_df = data["fhir"].apply(lambda x: x.to_flat()) + + # Stops parquet conversion from stripping the time from mixed date/datetime + # columns + for date_cols in [ + x for x in flat_df.columns if "date" in x.lower() or "period" in x.lower() + ]: + flat_df[date_cols] = flat_df[date_cols].astype(str) + flat_df[date_cols] = flat_df[date_cols].replace("nan", None) + + for coding_column in [ + x + for x in flat_df.columns + if x.lower().endswith(".code") or x.lower().endswith(".text") + ]: + flat_df[coding_column] = flat_df[coding_column].apply( + lambda x: [x] if isinstance(x, str) else x + ) + + flat_df.to_parquet(f"{filename}.parquet") + @classmethod def fhir_bulk_import(cls, file: str) -> FHIRFlatBase | list[FHIRFlatBase]: """ @@ -110,7 +202,8 @@ def fhir_file_to_flat(cls, source_file: str, output_name: str | None = None): source_file: str Path to the FHIR resource file. 
output_name: str (optional) - Name of the parquet file to be generated, optional, defaults to {resource}.parquet + Name of the parquet file to be generated, optional, defaults to + {resource}.parquet """ if not output_name: @@ -128,11 +221,17 @@ def fhir_file_to_flat(cls, source_file: str, output_name: str | None = None): flat_rows.append(fhir2flat(resource, lists=list_resources)) df = pd.concat(flat_rows) + + # remove required attributes now it's in the flat representation + for attr in cls.flat_defaults: + df.drop(list(df.filter(regex=attr)), axis=1, inplace=True) + df.to_parquet(output_name) - def to_flat(self, filename: str) -> None: + def to_flat(self, filename: str | None = None) -> None | pd.Series: """ Generates a FHIRflat parquet file from the resource. + If no file name is provided, returns a pandas Series. Parameters ---------- @@ -154,4 +253,8 @@ def to_flat(self, filename: str) -> None: for attr in self.flat_defaults: flat_df.drop(list(flat_df.filter(regex=attr)), axis=1, inplace=True) - flat_df.to_parquet(filename) + if filename: + flat_df.to_parquet(filename) + else: + assert flat_df.shape[0] == 1 + return flat_df.loc[0] diff --git a/fhirflat/resources/condition.py b/fhirflat/resources/condition.py index 32a3b6a..ae1b25b 100644 --- a/fhirflat/resources/condition.py +++ b/fhirflat/resources/condition.py @@ -1,10 +1,14 @@ from __future__ import annotations from fhir.resources.condition import Condition as _Condition from .base import FHIRFlatBase +from .extension_types import presenceAbsenceType, prespecifiedQueryType, timingPhaseType +from .extensions import presenceAbsence, prespecifiedQuery, timingPhase import orjson -from ..flat2fhir import expand_concepts -from typing import TypeAlias, ClassVar +from fhirflat.flat2fhir import expand_concepts +from typing import TypeAlias, ClassVar, Union +from fhir.resources import fhirtypes +from pydantic.v1 import Field, validator JsonString: TypeAlias = str @@ -12,19 +16,54 @@ class Condition(_Condition, FHIRFlatBase): + extension: list[ + Union[ + presenceAbsenceType, + prespecifiedQueryType, + timingPhaseType, + fhirtypes.ExtensionType, + ] + ] = Field( + None, + alias="extension", + title="Additional content defined by implementations", + description=( + """ + Contains the G.H 'presenceAbsence', 'prespecifiedQuery' and 'timingPhase' + extensions, and allows extensions from other implementations to be included.""" + ), + # if property is element of this resource. + element_property=True, + union_mode="smart", + ) + # attributes to exclude from the flat representation - flat_exclusions: ClassVar[set[str]] = FHIRFlatBase.flat_exclusions + ( + flat_exclusions: ClassVar[set[str]] = FHIRFlatBase.flat_exclusions | { "id", "identifier", "verificationStatus", "evidence", "note", "participant", - ) + } # required attributes that are not present in the FHIRflat representation flat_defaults: ClassVar[list[str]] = FHIRFlatBase.flat_defaults + ["clinicalStatus"] + @validator("extension") + def validate_extension_contents(cls, extensions): + present_count = sum(isinstance(item, presenceAbsence) for item in extensions) + query_count = sum(isinstance(item, prespecifiedQuery) for item in extensions) + timing_count = sum(isinstance(item, timingPhase) for item in extensions) + + if present_count > 1 or query_count > 1 or timing_count > 1: + raise ValueError( + "presenceAbsence, prespecifiedQuery and timingPhase can only appear" + " once."
+ ) + + return extensions + @classmethod def flat_descriptions(cls) -> dict[str, str]: """ @@ -42,13 +81,16 @@ def flat_descriptions(cls) -> dict[str, str]: return descrip @classmethod - def cleanup(cls, data: JsonString) -> Condition: + def cleanup(cls, data_dict: JsonString | dict, json_data=True) -> Condition: """ Load data into a dictionary-like structure, then apply resource-specific changes and unpack flattened data like codeableConcepts back into structured data. """ - data = orjson.loads(data) + if json_data and isinstance(data_dict, str): + data: dict = orjson.loads(data_dict) + elif isinstance(data_dict, dict): + data: dict = data_dict data["encounter"] = {"reference": data["encounter"]} data["subject"] = {"reference": data["subject"]} diff --git a/fhirflat/resources/encounter.py b/fhirflat/resources/encounter.py index b6f386d..78fe9f3 100644 --- a/fhirflat/resources/encounter.py +++ b/fhirflat/resources/encounter.py @@ -1,9 +1,16 @@ from __future__ import annotations from fhir.resources.encounter import Encounter as _Encounter +from fhir.resources.encounter import ( + EncounterAdmission, + EncounterDiagnosis, + EncounterLocation, + EncounterParticipant, + EncounterReason, +) from .base import FHIRFlatBase import orjson -from ..flat2fhir import expand_concepts +from fhirflat.flat2fhir import expand_concepts from .extensions import relativePeriod, timingPhase from .extension_types import relativePeriodType, timingPhaseType @@ -24,8 +31,8 @@ class Encounter(_Encounter, FHIRFlatBase): title="List of `Extension` items (represented as `dict` in JSON)", description=( """ - Contains the Global.health 'eventTiming' and 'relativePeriod' extensions, and allows - extensions from other implementations to be included. + Contains the Global.health 'eventTiming' and 'relativePeriod' extensions, + and allows extensions from other implementations to be included. """ ), # if property is element of this resource. @@ -35,8 +42,7 @@ class Encounter(_Encounter, FHIRFlatBase): ) # attributes to exclude from the flat representation - flat_exclusions: ClassVar[set[str]] = FHIRFlatBase.flat_exclusions + ( - "id", + flat_exclusions: ClassVar[set[str]] = FHIRFlatBase.flat_exclusions | { "identifier", "participant", # participants other than the patient "appointment", # appointment that scheduled the encounter @@ -44,11 +50,19 @@ class Encounter(_Encounter, FHIRFlatBase): "dietPreference", "specialArrangement", # if translator, stretcher, wheelchair etc. needed "specialCourtesy", # contains ID information, VIP, board member, etc. - ) + } # required attributes that are not present in the FHIRflat representation flat_defaults: ClassVar[list[str]] = FHIRFlatBase.flat_defaults + ["status"] + backbone_elements: ClassVar[dict] = { + "participant": EncounterParticipant, + "reason": EncounterReason, + "diagnosis": EncounterDiagnosis, + "admission": EncounterAdmission, + "location": EncounterLocation, + } + @validator("extension") def validate_extension_contents(cls, extensions): rel_phase_count = sum(isinstance(item, relativePeriod) for item in extensions) @@ -60,13 +74,16 @@ def validate_extension_contents(cls, extensions): return extensions @classmethod - def cleanup(cls, data: JsonString) -> Encounter: + def cleanup(cls, data_dict: JsonString | dict, json_data=True) -> Encounter: """ Load data into a dictionary-like structure, then apply resource-specific changes and unpack flattened data like codeableConcepts back into structured data.
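+
+        For example, a flat reference such as ``{"subject": "Patient/1"}`` is
+        repacked as ``{"subject": {"reference": "Patient/1"}}``.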
""" - data = orjson.loads(data) + if json_data and isinstance(data_dict, str): + data: dict = orjson.loads(data_dict) + elif isinstance(data_dict, dict): + data: dict = data_dict for field in { "subject", diff --git a/fhirflat/resources/extension_types.py b/fhirflat/resources/extension_types.py index 3eea251..a8ffcdb 100644 --- a/fhirflat/resources/extension_types.py +++ b/fhirflat/resources/extension_types.py @@ -4,7 +4,7 @@ from typing import TYPE_CHECKING if TYPE_CHECKING: - from pydantic.v1.types import CallableGenerator + from pydantic.v1.typing import CallableGenerator class AbstractType(_AbstractType): @@ -51,5 +51,17 @@ class birthSexType(AbstractType): __resource_type__ = "birthSex" +class raceType(AbstractType): + __resource_type__ = "Race" + + +class presenceAbsenceType(AbstractType): + __resource_type__ = "presenceAbsence" + + +class prespecifiedQueryType(AbstractType): + __resource_type__ = "prespecifiedQuery" + + class dateTimeExtensionType(AbstractType): __resource_type__ = "dateTimeExtension" diff --git a/fhirflat/resources/extension_validators.py b/fhirflat/resources/extension_validators.py index 7e96460..4d98c6c 100644 --- a/fhirflat/resources/extension_validators.py +++ b/fhirflat/resources/extension_validators.py @@ -65,6 +65,9 @@ def __init__(self): "Duration": (None, ".extensions"), "Age": (None, ".extensions"), "birthSex": (None, ".extensions"), + "Race": (None, ".extensions"), + "presenceAbsence": (None, ".extensions"), + "prespecifiedQuery": (None, ".extensions"), "dateTimeExtension": (None, ".extensions"), } @@ -230,5 +233,17 @@ def birthsex_validator(v: Union[StrBytes, dict, Path, FHIRAbstractModel]): return Validators().fhir_model_validator("birthSex", v) +def race_validator(v: Union[StrBytes, dict, Path, FHIRAbstractModel]): + return Validators().fhir_model_validator("Race", v) + + +def presenceabsence_validator(v: Union[StrBytes, dict, Path, FHIRAbstractModel]): + return Validators().fhir_model_validator("presenceAbsence", v) + + +def prespecifiedquery_validator(v: Union[StrBytes, dict, Path, FHIRAbstractModel]): + return Validators().fhir_model_validator("prespecifiedQuery", v) + + def datetimeextension_validator(v: Union[StrBytes, dict, Path, FHIRAbstractModel]): return Validators().fhir_model_validator("dateTimeExtension", v) diff --git a/fhirflat/resources/extensions.py b/fhirflat/resources/extensions.py index dad428b..5529f93 100644 --- a/fhirflat/resources/extensions.py +++ b/fhirflat/resources/extensions.py @@ -23,9 +23,9 @@ class timingPhase(_DataType): with an appropriate SNOMED (or similar) code. """ - resource_type = Field("timingPhase", const=True) + resource_type: str = Field(default="timingPhase", const=True) - url = Field("timingPhase", const=True, alias="url") + url: str = Field("timingPhase", const=True, alias="url") valueCodeableConcept: fhirtypes.CodeableConceptType = Field( None, @@ -61,9 +61,9 @@ class relativeDay(_DataType): both the relative start and end dates instead. """ - resource_type = Field("relativeDay", const=True) + resource_type: str = Field(default="relativeDay", const=True) - url = Field("relativeDay", const=True, alias="url") + url: str = Field("relativeDay", const=True, alias="url") valueInteger: fhirtypes.Integer = Field( None, @@ -97,9 +97,9 @@ class relativeStart(_DataType): An ISARIC extension for use inside the complex `relativePeriod` extension. 
""" - resource_type = Field("relativeStart", const=True) + resource_type: str = Field(default="relativeStart", const=True) - url = Field("relativeStart", const=True, alias="url") + url: str = Field("relativeStart", const=True, alias="url") valueInteger: fhirtypes.Integer = Field( None, @@ -133,9 +133,9 @@ class relativeEnd(_DataType): An ISARIC extension for use inside the complex `relativePeriod` extension. """ - resource_type = Field("relativeEnd", const=True) + resource_type: str = Field(default="relativeEnd", const=True) - url = Field("relativeEnd", const=True, alias="url") + url: str = Field("relativeEnd", const=True, alias="url") valueInteger: fhirtypes.Integer = Field( None, @@ -177,9 +177,9 @@ class relativePeriod(_DataType): relativeEnd is 5. """ - resource_type = Field("relativePeriod", const=True) + resource_type: str = Field(default="relativePeriod", const=True) - url = Field("relativePeriod", const=True, alias="url") + url: str = Field("relativePeriod", const=True, alias="url") extension: list[Union[et.relativeStartType, et.relativeEndType]] = Field( None, @@ -224,9 +224,9 @@ class approximateDate(_DataType): approximateDate extension with a valueString of "3 months". """ - resource_type = Field("approximateDate", const=True) + resource_type: str = Field(default="approximateDate", const=True) - url = Field("approximateDate", const=True, alias="url") + url: str = Field("approximateDate", const=True, alias="url") valueDate: fhirtypes.Date = Field( None, @@ -314,9 +314,9 @@ class Duration(_DataType): duration is not an option in the base FHIR specification. """ - resource_type = Field("Duration", const=True) + resource_type: str = Field(default="Duration", const=True) - url = Field("duration", const=True, alias="url") + url: str = Field("duration", const=True, alias="url") valueQuantity: fhirtypes.QuantityType = Field( None, @@ -350,9 +350,9 @@ class Age(_DataType): An ISARIC extension collecting data on the age of a patient. """ - resource_type = Field("Age", const=True) + resource_type: str = Field(default="Age", const=True) - url = Field("age", const=True, alias="url") + url: str = Field("age", const=True, alias="url") valueQuantity: fhirtypes.QuantityType = Field( None, @@ -386,9 +386,9 @@ class birthSex(_DataType): An ISARIC extension collecting data on the birth sex of a patient. """ - resource_type = Field("birthSex", const=True) + resource_type: str = Field(default="birthSex", const=True) - url = Field("birthSex", const=True, alias="url") + url: str = Field("birthSex", const=True, alias="url") valueCodeableConcept: fhirtypes.CodeableConceptType = Field( None, @@ -417,6 +417,114 @@ def elements_sequence(cls): ] +class Race(_DataType): + """ + An ISARIC extension collecting data on the race of a patient. + """ + + resource_type: str = Field(default="Race", const=True) + + url: str = Field("race", const=True, alias="url") + + valueCodeableConcept: fhirtypes.CodeableConceptType = Field( + None, + alias="valueCodeableConcept", + title="Value of extension", + description=( + "Value of extension - must be one of a constrained set of the data " + "types (see [Extensibility](extensibility.html) for a list)." + ), + # if property is element of this resource. + element_property=True, + element_required=True, + ) + + @classmethod + def elements_sequence(cls): + """returning all elements names from + ``Extension`` according specification, + with preserving original sequence order. 
+ """ + return [ + "id", + "extension", + "url", + "valueCodeableConcept", + ] + + +class presenceAbsence(_DataType): + """ + An ISARIC extension to indicate if a clinical finding is present, absent or unknown. + """ + + resource_type: str = Field(default="presenceAbsence", const=True) + + url: str = Field("presenceAbsence", const=True, alias="url") + + valueCodeableConcept: fhirtypes.CodeableConceptType = Field( + None, + alias="valueCodeableConcept", + title="Value of extension", + description=( + "Value of extension - must be one of a constrained set of the data " + "types (see [Extensibility](extensibility.html) for a list)." + ), + # if property is element of this resource. + element_property=True, + element_required=True, + ) + + @classmethod + def elements_sequence(cls): + """returning all elements names from + ``Extension`` according specification, + with preserving original sequence order. + """ + return [ + "id", + "extension", + "url", + "valueCodeableConcept", + ] + + +class prespecifiedQuery(_DataType): + """ + An ISARIC extension to indicate if a finding is the result of a prespecified query. + """ + + resource_type: str = Field(default="prespecifiedQuery", const=True) + + url: str = Field("prespecifiedQuery", const=True, alias="url") + + valueBoolean: bool = Field( + None, + alias="valueBoolean", + title="Value of extension", + description=( + "Value of extension - must be one of a constrained set of the data " + "types (see [Extensibility](extensibility.html) for a list)." + ), + # if property is element of this resource. + element_property=True, + elementRequired=True, + ) + + @classmethod + def elements_sequence(cls): + """returning all elements names from + ``Extension`` according specification, + with preserving original sequence order. + """ + return [ + "id", + "extension", + "url", + "valueBoolean", + ] + + # ------------------- extension types ------------------------------ @@ -427,7 +535,7 @@ class dateTimeExtension(_FHIRPrimitiveExtension): to the current date. 
""" - resource_type = Field("dateTimeExtension", const=True) + resource_type: str = Field(default="dateTimeExtension", const=True) extension: list[ Union[et.approximateDateType, et.relativeDayType, fhirtypes.ExtensionType] diff --git a/fhirflat/resources/immunization.py b/fhirflat/resources/immunization.py index 4bf2dfe..115439e 100644 --- a/fhirflat/resources/immunization.py +++ b/fhirflat/resources/immunization.py @@ -6,7 +6,7 @@ from pydantic.v1 import Field, validator import orjson -from ..flat2fhir import expand_concepts +from fhirflat.flat2fhir import expand_concepts from typing import TypeAlias, ClassVar, Union from fhir.resources import fhirtypes @@ -37,7 +37,7 @@ class Immunization(_Immunization, FHIRFlatBase): ) # attributes to exclude from the flat representation - flat_exclusions: ClassVar[set[str]] = FHIRFlatBase.flat_exclusions + ( + flat_exclusions: ClassVar[set[str]] = FHIRFlatBase.flat_exclusions | { "id", "identifier", "basedOn", @@ -50,7 +50,7 @@ class Immunization(_Immunization, FHIRFlatBase): "informationSource", "performer", "note", - ) + } # required attributes that are not present in the FHIRflat representation flat_defaults: ClassVar[list[str]] = FHIRFlatBase.flat_defaults + ["status"] @@ -65,17 +65,21 @@ def validate_extension_contents(cls, extensions): return extensions @classmethod - def cleanup(cls, data: JsonString) -> Immunization: + def cleanup(cls, data_dict: JsonString | dict, json_data=True) -> Immunization: """ Load data into a dictionary-like structure, then apply resource-specific changes and unpack flattened data like codeableConcepts back into structured data. """ - data = orjson.loads(data) - - for field in ({"patient", "encounter", "location"} | { - x for x in data.keys() if x.endswith(".reference") - }).intersection(data.keys()): + if json_data and isinstance(data_dict, str): + data: dict = orjson.loads(data_dict) + elif isinstance(data_dict, dict): + data: dict = data_dict + + for field in ( + {"patient", "encounter", "location"} + | {x for x in data.keys() if x.endswith(".reference")} + ).intersection(data.keys()): data[field] = {"reference": data[field]} # add default status back in diff --git a/fhirflat/resources/location.py b/fhirflat/resources/location.py index 29b917e..ba1b96c 100644 --- a/fhirflat/resources/location.py +++ b/fhirflat/resources/location.py @@ -3,7 +3,7 @@ from .base import FHIRFlatBase import orjson -from ..flat2fhir import expand_concepts +from fhirflat.flat2fhir import expand_concepts from typing import TypeAlias, ClassVar JsonString: TypeAlias = str @@ -12,22 +12,25 @@ class Location(_Location, FHIRFlatBase): # attributes to exclude from the flat representation - flat_exclusions: ClassVar[set[str]] = FHIRFlatBase.flat_exclusions + ( + flat_exclusions: ClassVar[set[str]] = FHIRFlatBase.flat_exclusions | { "id", "identifier", "status", "contact", # phone numbers, addresses, "hoursOfOperation", - ) + } @classmethod - def cleanup(cls, data: JsonString) -> Location: + def cleanup(cls, data_dict: JsonString | dict, json_data=True) -> Location: """ Load data into a dictionary-like structure, then apply resource-specific changes and unpack flattened data like codeableConcepts back into structured data. 
""" - data = orjson.loads(data) + if json_data and isinstance(data_dict, str): + data: dict = orjson.loads(data_dict) + elif isinstance(data_dict, dict): + data: dict = data_dict for field in { "managingOrganization", diff --git a/fhirflat/resources/medicationadministration.py b/fhirflat/resources/medicationadministration.py index 136a82f..c46aa15 100644 --- a/fhirflat/resources/medicationadministration.py +++ b/fhirflat/resources/medicationadministration.py @@ -5,7 +5,7 @@ from .base import FHIRFlatBase import orjson -from ..flat2fhir import expand_concepts +from fhirflat.flat2fhir import expand_concepts from typing import TypeAlias, ClassVar JsonString: TypeAlias = str @@ -14,35 +14,43 @@ class MedicationAdministration(_MedicationAdministration, FHIRFlatBase): # attributes to exclude from the flat representation - flat_exclusions: ClassVar[set[str]] = FHIRFlatBase.flat_exclusions + ( + flat_exclusions: ClassVar[set[str]] = FHIRFlatBase.flat_exclusions | { "id", "identifier", "basedOn", "performer", "note", - ) + } # required attributes that are not present in the FHIRflat representation flat_defaults: ClassVar[list[str]] = FHIRFlatBase.flat_defaults + ["status"] @classmethod - def cleanup(cls, data: JsonString) -> MedicationAdministration: + def cleanup( + cls, data_dict: JsonString | dict, json_data=True + ) -> MedicationAdministration: """ Load data into a dictionary-like structure, then apply resource-specific changes and unpack flattened data like codeableConcepts back into structured data. """ - data = orjson.loads(data) - - for field in ({ - "basedOn", - "partOf", - "subject", - "encounter", - "supportingInformation", - "request", - "eventHistory", - } | {x for x in data.keys() if x.endswith(".reference")}).intersection(data.keys()): + if json_data and isinstance(data_dict, str): + data: dict = orjson.loads(data_dict) + elif isinstance(data_dict, dict): + data: dict = data_dict + + for field in ( + { + "basedOn", + "partOf", + "subject", + "encounter", + "supportingInformation", + "request", + "eventHistory", + } + | {x for x in data.keys() if x.endswith(".reference")} + ).intersection(data.keys()): data[field] = {"reference": data[field]} # add default status back in diff --git a/fhirflat/resources/medicationstatement.py b/fhirflat/resources/medicationstatement.py index f208c5b..4136745 100644 --- a/fhirflat/resources/medicationstatement.py +++ b/fhirflat/resources/medicationstatement.py @@ -5,7 +5,7 @@ from .base import FHIRFlatBase import orjson -from ..flat2fhir import expand_concepts +from fhirflat.flat2fhir import expand_concepts from typing import TypeAlias, ClassVar JsonString: TypeAlias = str @@ -14,32 +14,40 @@ class MedicationStatement(_MedicationStatement, FHIRFlatBase): # attributes to exclude from the flat representation - flat_exclusions: ClassVar[set[str]] = FHIRFlatBase.flat_exclusions + ( + flat_exclusions: ClassVar[set[str]] = FHIRFlatBase.flat_exclusions | { "id", "identifier", "informationSource", "note", - ) + } # required attributes that are not present in the FHIRflat representation flat_defaults: ClassVar[list[str]] = FHIRFlatBase.flat_defaults + ["status"] @classmethod - def cleanup(cls, data: JsonString) -> MedicationStatement: + def cleanup( + cls, data_dict: JsonString | dict, json_data=True + ) -> MedicationStatement: """ Load data into a dictionary-like structure, then apply resource-specific changes and unpack flattened data like codeableConcepts back into structured data. 
""" - data = orjson.loads(data) - - for field in ({ - "partOf", - "subject", - "encounter", - "derivedFrom", - "relatedClinicalInformation", - } | {x for x in data.keys() if x.endswith(".reference")}).intersection(data.keys()): + if json_data and isinstance(data_dict, str): + data: dict = orjson.loads(data_dict) + elif isinstance(data_dict, dict): + data: dict = data_dict + + for field in ( + { + "partOf", + "subject", + "encounter", + "derivedFrom", + "relatedClinicalInformation", + } + | {x for x in data.keys() if x.endswith(".reference")} + ).intersection(data.keys()): data[field] = {"reference": data[field]} # add default status back in diff --git a/fhirflat/resources/observation.py b/fhirflat/resources/observation.py index 288f679..582faf7 100644 --- a/fhirflat/resources/observation.py +++ b/fhirflat/resources/observation.py @@ -9,7 +9,7 @@ import orjson from fhir.resources import fhirtypes -from ..flat2fhir import expand_concepts +from fhirflat.flat2fhir import expand_concepts from typing import TypeAlias, ClassVar, Union JsonString: TypeAlias = str @@ -67,7 +67,7 @@ class Observation(_Observation, FHIRFlatBase): ) # attributes to exclude from the flat representation - flat_exclusions: ClassVar[set[str]] = FHIRFlatBase.flat_exclusions + ( + flat_exclusions: ClassVar[set[str]] = FHIRFlatBase.flat_exclusions | { "id", "identifier", "instantiatesCanonical", @@ -77,7 +77,7 @@ class Observation(_Observation, FHIRFlatBase): "referenceRange", "issued", "note", - ) + } # required attributes that are not present in the FHIRflat representation flat_defaults: ClassVar[list[str]] = FHIRFlatBase.flat_defaults + ["status"] @@ -92,13 +92,16 @@ def validate_extension_contents(cls, extensions): return extensions @classmethod - def cleanup(cls, data: JsonString) -> Observation: + def cleanup(cls, data_dict: JsonString | dict, json_data=True) -> Observation: """ Load data into a dictionary-like structure, then apply resource-specific changes and unpack flattened data like codeableConcepts back into structured data. 
""" - data = orjson.loads(data) + if json_data and isinstance(data_dict, str): + data: dict = orjson.loads(data_dict) + elif isinstance(data_dict, dict): + data: dict = data_dict for field in { "encounter", @@ -108,7 +111,7 @@ def cleanup(cls, data: JsonString) -> Observation: "specimen", "device", }.intersection(data.keys()): - data[field] = {"reference": data[field]} + data[field] = {"reference": str(data[field])} # add default status back in data["status"] = "final" diff --git a/fhirflat/resources/organization.py b/fhirflat/resources/organization.py index f8b0aa7..66ab4a8 100644 --- a/fhirflat/resources/organization.py +++ b/fhirflat/resources/organization.py @@ -3,7 +3,7 @@ from .base import FHIRFlatBase import orjson -from ..flat2fhir import expand_concepts +from fhirflat.flat2fhir import expand_concepts from typing import TypeAlias, ClassVar JsonString: TypeAlias = str @@ -12,21 +12,24 @@ class Organization(_Organization, FHIRFlatBase): # attributes to exclude from the flat representation - flat_exclusions: ClassVar[set[str]] = FHIRFlatBase.flat_exclusions + ( + flat_exclusions: ClassVar[set[str]] = FHIRFlatBase.flat_exclusions | { "id", "identifier", "active", "contact", # phone numbers, addresses - ) + } @classmethod - def cleanup(cls, data: JsonString) -> Organization: + def cleanup(cls, data_dict: JsonString | dict, json_data=True) -> Organization: """ Load data into a dictionary-like structure, then apply resource-specific changes and unpack flattened data like codeableConcepts back into structured data. """ - data = orjson.loads(data) + if json_data and isinstance(data_dict, str): + data: dict = orjson.loads(data_dict) + elif isinstance(data_dict, dict): + data: dict = data_dict for field in { "partOf", diff --git a/fhirflat/resources/patient.py b/fhirflat/resources/patient.py index db465cb..e94d705 100644 --- a/fhirflat/resources/patient.py +++ b/fhirflat/resources/patient.py @@ -1,13 +1,11 @@ -from fhir.resources.patient import Patient +from __future__ import annotations +from fhir.resources.patient import Patient as _Patient from .base import FHIRFlatBase -from .extension_types import ( - ageType, - birthSexType, -) -from .extensions import Age, birthSex +from .extension_types import ageType, birthSexType, raceType +from .extensions import Age, birthSex, Race import orjson -from ..flat2fhir import expand_concepts +from fhirflat.flat2fhir import expand_concepts from typing import TypeAlias, ClassVar, Union from fhir.resources import fhirtypes from pydantic.v1 import Field, validator @@ -15,23 +13,25 @@ JsonString: TypeAlias = str -class Patient(Patient, FHIRFlatBase): - extension: list[Union[ageType, birthSexType, fhirtypes.ExtensionType]] = Field( - None, - alias="extension", - title="Additional content defined by implementations", - description=( - """ +class Patient(_Patient, FHIRFlatBase): + extension: list[Union[ageType, birthSexType, raceType, fhirtypes.ExtensionType]] = ( + Field( + None, + alias="extension", + title="Additional content defined by implementations", + description=( + """ Contains the G.H 'age' and 'birthSex' extensions, and allows extensions from other implementations to be included.""" - ), - # if property is element of this resource. - element_property=True, - union_mode="smart", + ), + # if property is element of this resource. 
+            element_property=True,
+            union_mode="smart",
+        )
     )
 
     # attributes to exclude from the flat representation
-    flat_exclusions: ClassVar[set[str]] = FHIRFlatBase.flat_exclusions + (
+    flat_exclusions: ClassVar[set[str]] = FHIRFlatBase.flat_exclusions | {
         "identifier",
         "active",
         "name",
@@ -41,15 +41,16 @@ class Patient(Patient, FHIRFlatBase):
         "contact",
         "communication",
         "link",
-    )
+    }
 
     @validator("extension")
     def validate_extension_contents(cls, extensions):
         age_count = sum(isinstance(item, Age) for item in extensions)
         birthsex_count = sum(isinstance(item, birthSex) for item in extensions)
+        race_count = sum(isinstance(item, Race) for item in extensions)
 
-        if age_count > 1 or birthsex_count > 1:
-            raise ValueError("Age and birthSex can only appear once.")
+        if age_count > 1 or birthsex_count > 1 or race_count > 1:
+            raise ValueError("Age, birthSex and Race can only appear once.")
 
         return extensions
 
@@ -69,11 +70,20 @@ def flat_descriptions(cls) -> dict[str, str]:
         return descrip
 
     @classmethod
-    def cleanup(cls, data: JsonString) -> Patient:
-        # Load the data and apply resource-specific changes
-        data = orjson.loads(data)
-
-        # # Strip time from the birthDate
+    def cleanup(cls, data_dict: JsonString | dict, json_data=True) -> Patient:
+        """
+        Load data into a dictionary-like structure, then
+        apply resource-specific changes and unpack flattened data
+        like codeableConcepts back into structured data.
+        """
+        if json_data and isinstance(data_dict, str):
+            data: dict = orjson.loads(data_dict)
+        elif isinstance(data_dict, dict):
+            data: dict = data_dict
+
+        data["id"] = str(data["id"])
+
+        # Strip time from the birthDate
         if "birthDate" in data:
             data["birthDate"] = data["birthDate"].split("T", 1)[0]
 
diff --git a/fhirflat/resources/procedure.py b/fhirflat/resources/procedure.py
index 2dc0989..9848228 100644
--- a/fhirflat/resources/procedure.py
+++ b/fhirflat/resources/procedure.py
@@ -14,7 +14,7 @@ from pydantic.v1 import Field, validator
 
 import orjson
 
-from ..flat2fhir import expand_concepts
+from fhirflat.flat2fhir import expand_concepts
 from typing import TypeAlias, ClassVar, Union
 from fhir.resources import fhirtypes
 
@@ -48,7 +48,7 @@ class Procedure(_Procedure, FHIRFlatBase):
     )
 
     # attributes to exclude from the flat representation
-    flat_exclusions: ClassVar[set[str]] = FHIRFlatBase.flat_exclusions + (
+    flat_exclusions: ClassVar[set[str]] = FHIRFlatBase.flat_exclusions | {
         "id",
         "identifier",
         "instantiatesCanonical",
@@ -62,7 +62,7 @@ class Procedure(_Procedure, FHIRFlatBase):
         "reason",
         "note",
         "supportingInfo",
-    )
+    }
 
     # required attributes that are not present in the FHIRflat representation
     flat_defaults: ClassVar[list[str]] = FHIRFlatBase.flat_defaults + ["status"]
@@ -81,13 +81,16 @@ def validate_extension_contents(cls, extensions):
         return extensions
 
     @classmethod
-    def cleanup(cls, data: JsonString) -> Procedure:
+    def cleanup(cls, data_dict: JsonString | dict, json_data=True) -> Procedure:
         """
        Load data into a dictionary-like structure, then
        apply resource-specific changes and unpack flattened data
        like codeableConcepts back into structured data.
         """
-        data = orjson.loads(data)
+        if json_data and isinstance(data_dict, str):
+            data: dict = orjson.loads(data_dict)
+        elif isinstance(data_dict, dict):
+            data: dict = data_dict
 
         for field in {
             "partOf",
diff --git a/fhirflat/resources/researchsubject.py b/fhirflat/resources/researchsubject.py
index c24bad6..2bf930e 100644
--- a/fhirflat/resources/researchsubject.py
+++ b/fhirflat/resources/researchsubject.py
@@ -3,7 +3,7 @@
 from .base import FHIRFlatBase
 import orjson
 
-from ..flat2fhir import expand_concepts
+from fhirflat.flat2fhir import expand_concepts
 from typing import TypeAlias, ClassVar
 
 JsonString: TypeAlias = str
@@ -12,26 +12,30 @@
 class ResearchSubject(_ResearchSubject, FHIRFlatBase):
 
     # attributes to exclude from the flat representation
-    flat_exclusions: ClassVar[set[str]] = FHIRFlatBase.flat_exclusions + (
+    flat_exclusions: ClassVar[set[str]] = FHIRFlatBase.flat_exclusions | {
         "id",
         "identifier",
-    )
+    }
 
     # required attributes that are not present in the FHIRflat representation
     flat_defaults: ClassVar[list[str]] = FHIRFlatBase.flat_defaults + ["status"]
 
     @classmethod
-    def cleanup(cls, data: JsonString) -> ResearchSubject:
+    def cleanup(cls, data_dict: JsonString | dict, json_data=True) -> ResearchSubject:
         """
        Load data into a dictionary-like structure, then
        apply resource-specific changes and unpack flattened data
        like codeableConcepts back into structured data.
         """
-        data = orjson.loads(data)
-
-        for field in ({"study", "subject", "consent"} | {
-            x for x in data.keys() if x.endswith(".reference")
-        }).intersection(data.keys()):
+        if json_data and isinstance(data_dict, str):
+            data: dict = orjson.loads(data_dict)
+        elif isinstance(data_dict, dict):
+            data: dict = data_dict
+
+        for field in (
+            {"study", "subject", "consent"}
+            | {x for x in data.keys() if x.endswith(".reference")}
+        ).intersection(data.keys()):
             data[field] = {"reference": data[field]}
 
         # add default status back in
diff --git a/fhirflat/resources/specimen.py b/fhirflat/resources/specimen.py
index 3ef491a..732871d 100644
--- a/fhirflat/resources/specimen.py
+++ b/fhirflat/resources/specimen.py
@@ -3,7 +3,7 @@
 from .base import FHIRFlatBase
 import orjson
 
-from ..flat2fhir import expand_concepts
+from fhirflat.flat2fhir import expand_concepts
 from typing import TypeAlias, ClassVar
 
 JsonString: TypeAlias = str
@@ -12,32 +12,38 @@
 class Specimen(_Specimen, FHIRFlatBase):
 
     # attributes to exclude from the flat representation
-    flat_exclusions: ClassVar[set[str]] = FHIRFlatBase.flat_exclusions + (
+    flat_exclusions: ClassVar[set[str]] = FHIRFlatBase.flat_exclusions | {
         "id",
         "identifier",
         "accessionIdentifier",
         "status",
         "note",
-    )
+    }
 
     @classmethod
-    def cleanup(cls, data: JsonString) -> Specimen:
+    def cleanup(cls, data_dict: JsonString | dict, json_data=True) -> Specimen:
         """
        Load data into a dictionary-like structure, then
        apply resource-specific changes and unpack flattened data
        like codeableConcepts back into structured data.
         """
-        data = orjson.loads(data)
-
-        for field in ({
-            "subject",
-            "parent",
-            "request",
-            "collection.collector",
-            "collection.procedure",
-            "container.device",
-            "container.location",
-        } | {x for x in data.keys() if x.endswith(".reference")}).intersection(data.keys()):
+        if json_data and isinstance(data_dict, str):
+            data: dict = orjson.loads(data_dict)
+        elif isinstance(data_dict, dict):
+            data: dict = data_dict
+
+        for field in (
+            {
+                "subject",
+                "parent",
+                "request",
+                "collection.collector",
+                "collection.procedure",
+                "container.device",
+                "container.location",
+            }
+            | {x for x in data.keys() if x.endswith(".reference")}
+        ).intersection(data.keys()):
             data[field] = {"reference": data[field]}
 
         data = expand_concepts(data, cls)
diff --git a/fhirflat/util.py b/fhirflat/util.py
index fe1a8b6..760cdfd 100644
--- a/fhirflat/util.py
+++ b/fhirflat/util.py
@@ -3,11 +3,14 @@
 import fhir.resources
 import re
 import importlib
+from collections.abc import KeysView
 
 from .resources import extensions
+import fhirflat
 
 
-def group_keys(data_keys: list[str]) -> list[dict[str, list[str]]]:
+
+def group_keys(data_keys: list[str] | KeysView) -> dict[str, list[str]]:
     """
     Finds columns with a '.' in the name denoting data that has been
     flattened and groups them together.
@@ -65,3 +68,7 @@ def get_local_extension_type(t: str):
         return getattr(extensions, t.capitalize())
     except AttributeError:
         raise AttributeError(f"Could not find {t} in fhirflat extensions")
+
+
+def get_local_resource(t: str):
+    return getattr(fhirflat, t)
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000..5641cd8
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,44 @@
+[build-system]
+requires = ['setuptools>=40.8.0']
+build-backend = 'setuptools.build_meta'
+
+[tool.setuptools]
+packages = ["fhirflat"]
+
+[project]
+name = "fhirflat"
+version = "0.1.0"
+description = "Flattened FHIR resources"
+authors = [
+    {name = "Pip Liggins", email = "philippa.liggins@dtc.ox.ac.uk"},
+    {name = "Abhishek Dasgupta", email = "abhishek.dasgupta@dtc.ox.ac.uk"},
+]
+license = {file = "LICENSE"}
+requires-python = ">=3.10"
+readme = "README.md"
+classifiers = ["License :: OSI Approved :: MIT License"]
+dependencies = [
+    "fhir.resources==7.1.0",
+    "numpy==1.26.4",
+    "orjson==3.9.13",
+    "pandas>=2.2.0",
+    "pyarrow==15.0.0",
+    "pydantic==2.6.1",
+    "pydantic_core==2.16.2",
+]
+
+[project.optional-dependencies]
+test = [
+    "pytest",
+    "pytest-cov",
+    "pytest-unordered"
+]
+
+[project.urls]
+Home = "https://github.com/globaldothealth/fhirflat"
+
+[tool.black]
+line-length = 88
+
+[tool.pytest.ini_options]
+pythonpath = "."
diff --git a/pytest.ini b/pytest.ini
deleted file mode 100644
index 03f586d..0000000
--- a/pytest.ini
+++ /dev/null
@@ -1,2 +0,0 @@
-[pytest]
-pythonpath = .
\ No newline at end of file diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index 1755bfb..0000000 --- a/requirements.txt +++ /dev/null @@ -1,10 +0,0 @@ -fhir.resources==7.1.0 -numpy==1.26.4 -orjson==3.9.13 -pandas==2.2.0 -pyarrow==15.0.0 -pydantic==2.6.1 -pydantic_core==2.16.2 -pytest==8.0.0 -pytest-cov -pytest-unordered diff --git a/tests/data/condition_flat.parquet b/tests/data/condition_flat.parquet index af35f9b..f213484 100644 Binary files a/tests/data/condition_flat.parquet and b/tests/data/condition_flat.parquet differ diff --git a/tests/data/encounter_flat.parquet b/tests/data/encounter_flat.parquet index cf34fd9..a1cdf7a 100644 Binary files a/tests/data/encounter_flat.parquet and b/tests/data/encounter_flat.parquet differ diff --git a/tests/dummy_data/combined_dummy_data.csv b/tests/dummy_data/combined_dummy_data.csv new file mode 100644 index 0000000..eb4d59d --- /dev/null +++ b/tests/dummy_data/combined_dummy_data.csv @@ -0,0 +1,5 @@ +subjid,visitid,dates_enrolment,dates_adm,dates_admdate,dates_admtime,non_encounter_field,outco_denguediag,outco_denguediag_main,outco_denguediag_class,outco_not_dengue,outco_secondiag_oth,outco_date,outco_outcome,daily_date,vital_highesttem_c,vital_hr,vital_rr,vital_systolicbp,vital_diastolicbp,vital_spo2,vital_fio2spo2_02110,vital_fio2spo2_pcnt,vital_capillaryr,vital_avpu,vital_gcs,vital_urineflow +1,10,2020-05-01,0,,,,,,,cough,,,7,2020-01-01,36.2,120,30,70,120,5,,75,1,1,1,150 +2,11,,1,2021-04-01,18:00,fish,1,,2,,,2021-04-10,1,2021-02-02,37,100,40,80,130,6,10,85,0,2,1,200 +3,12,,1,2021-05-10,17:30,,1,,1,flu,,2021-05-15,4,2022-03-03,35.5,70,50,90,140,7,,95,0,3,1, +4,13,,1,2022-06-15,21:00,dolphin,0,Malaria,,,,2022-06-20,2,,,,,,,,,,,,, \ No newline at end of file diff --git a/tests/dummy_data/encounter_dummy_data_multi.csv b/tests/dummy_data/encounter_dummy_data_multi.csv new file mode 100644 index 0000000..eedb6fc --- /dev/null +++ b/tests/dummy_data/encounter_dummy_data_multi.csv @@ -0,0 +1,5 @@ +subjid,visitid,dates_enrolment,dates_adm,dates_admdate,dates_admtime,non_encounter_field,outco_denguediag,outco_denguediag_main,outco_denguediag_class,outco_not_dengue,outco_secondiag_oth,outco_date,outco_outcome +1,10,2020-05-01,0,,,,,,,cough,,,7 +2,11,,1,2021-04-01,18:00,fish,1,,2,,,2021-04-10,1 +3,12,,1,2021-05-10,17:30,,1,,1,flu,,2021-05-15,4 +4,13,,1,2022-06-15,21:00,dolphin,0,Malaria,,,,2022-06-20,2 \ No newline at end of file diff --git a/tests/dummy_data/encounter_dummy_data_single.csv b/tests/dummy_data/encounter_dummy_data_single.csv new file mode 100644 index 0000000..d0c908d --- /dev/null +++ b/tests/dummy_data/encounter_dummy_data_single.csv @@ -0,0 +1,2 @@ +subjid,visitid,dates_enrolment,dates_adm,dates_admdate,dates_admtime,non_encounter_field,outco_denguediag,outco_denguediag_main,outco_denguediag_class,outco_not_dengue,outco_secondiag_oth,outco_date,outco_outcome +2,11,2021-04-02,1,2021-04-01,18:00,fish,1,,2,,,2021-04-10,1 \ No newline at end of file diff --git a/tests/dummy_data/encounter_dummy_mapping.csv b/tests/dummy_data/encounter_dummy_mapping.csv new file mode 100644 index 0000000..ae99d6b --- /dev/null +++ b/tests/dummy_data/encounter_dummy_mapping.csv @@ -0,0 +1,25 @@ 
+raw_variable,raw_response,id,subject,extension.timingPhase.system,extension.timingPhase.code,extension.timingPhase.text,class.system,class.code,class.text,actualPeriod.start,actualPeriod.end,diagnosis.condition.concept.system,diagnosis.condition.concept.code,diagnosis.condition.concept.text,diagnosis.use.system,diagnosis.use.code,diagnosis.use.text,admission.dischargeDisposition.system,admission.dischargeDisposition.code,admission.dischargeDisposition.text +subjid,,,Patient/+,,,,,,,,,,,,,,,,, +visitid,,,,,,,,,,,,,,,,,,,, +dates_enrolment,,,,,,,,,, if not , if not ,,,,,,,,, +dates_adm,"1, Yes",,,https://snomed.info/sct,278307001,On admission (qualifier value),https://snomed.info/sct,32485007,Hospital admission (procedure),,,,,,,,,,, +,"0, No",,,https://snomed.info/sct,281379000,Pre-admission (qualifier value),https://snomed.info/sct,371883000,Outpatient procedure (procedure),,,,,,,,,,, +,"99, Unknown",,,,,,https://snomed.info/sct,32485007,Hospital admission (procedure),,,,,,,,,,, +dates_admdate,,,,,,,,,,+,,,,,,,,,, +dates_admtime,,,,,,,,,,+,,,,,,,,,, +outco_denguediag,"1, Yes",,,,,,,,,,,https://snomed.info/sct,38362002,Dengue (disorder),https://snomed.info/sct,89100005,Final diagnosis (discharge) (contextual qualifier) (qualifier value),,, +,"0, No",,,,,,,,,,,,,,,,,,, +,"99, Unknown",,,,,,,,,,,https://snomed.info/sct,261665006,Unknown (qualifier value),,,,,, +outco_date,,,,,,,,,,,,,,,,,,,, +outco_outcome,"1, Discharged alive",,,,,,,,,,,,,,,,,https://snomed.info/sct,371827001,Patient discharged alive (finding) +,"2, Still hospitalised",,,,,,,,,,,,,,,,,https://snomed.info/sct,32485007,Hospital admission (procedure) +,"3, Transfer to other facility",,,,,,,,,,,,,,,,,https://snomed.info/sct,306685000,Discharge to establishment (procedure) +,"4, Death",,,,,,,,,,,,,,,,,https://snomed.info/sct,419099009,Dead (finding) +,"5, Palliative care",,,,,,,,,,,,,,,,,https://snomed.info/sct,306237005,Referral to palliative care service (procedure) +,"6, Discharged against medical advice",,,,,,,,,,,,,,,,,https://snomed.info/sct,225928004,Patient self-discharge against medical advice (procedure) +,"7, Alive, not admitted",,,,,,,,,,,,,,,,,https://snomed.info/sct,371827001,Patient discharged alive (finding) +outco_denguediag_main,,,,,,,,,,,,,,,https://snomed.info/sct,89100005,Final diagnosis (discharge) (contextual qualifier) (qualifier value),,, +outco_denguediag_class,"1, Uncomplicated dengue",,,,,,,,,,,https://snomed.info/sct,722862003,Dengue without warning signs (disorder),https://snomed.info/sct,89100005,Final diagnosis (discharge) (contextual qualifier) (qualifier value),,, +,"2, Dengue with warning signs",,,,,,,,,,,https://snomed.info/sct,722863008,Dengue with warning signs (disorder),https://snomed.info/sct,89100005,Final diagnosis (discharge) (contextual qualifier) (qualifier value),,, +,"3, Severe dengue",,,,,,,,,,,https://snomed.info/sct,20927009,Dengue hemorrhagic fever (disorder),https://snomed.info/sct,89100005,Final diagnosis (discharge) (contextual qualifier) (qualifier value),,, +outco_secondiag_oth,,,,,,,,,,,,,,,https://snomed.info/sct,85097005,Secondary diagnosis (contextual qualifier) (qualifier value),,, \ No newline at end of file diff --git a/tests/dummy_data/observation_dummy_mapping.csv b/tests/dummy_data/observation_dummy_mapping.csv new file mode 100644 index 0000000..6e39236 --- /dev/null +++ b/tests/dummy_data/observation_dummy_mapping.csv @@ -0,0 +1,20 @@ 
+raw_variable,raw_response,single_resource_group,category.system,category.code,category.text,effectiveDateTime,code.system,code.code,code.text,subject,encounter,valueQuantity.value,valueQuantity.system,valueQuantity.code,valueQuantity.unit,valueCodeableConcept.system,valueCodeableConcept.code,valueCodeableConcept.text,valueDateTime,valueInteger
+vital_highesttem_c,,,http://terminology.hl7.org/CodeSystem/observation-category,vital-signs,Vital Signs,,https://loinc.org,8310-5,Body temperature,Patient/+,Encounter/+,,http://unitsofmeasure,Cel,DegreesCelsius,,,,,
+vital_highesttem_f,,,http://terminology.hl7.org/CodeSystem/observation-category,vital-signs,Vital Signs,,https://loinc.org,8310-5,Body temperature,Patient/+,Encounter/+,,http://unitsofmeasure,degF,DegreesFarenheit,,,,,
+vital_hr,,,http://terminology.hl7.org/CodeSystem/observation-category,vital-signs,Vital Signs,,https://loinc.org,8867-4,Heart rate,Patient/+,Encounter/+,,https://snomed.info/sct,258983007,Beats/minute (qualifier value),,,,,
+vital_rr,,,http://terminology.hl7.org/CodeSystem/observation-category,vital-signs,Vital Signs,,https://loinc.org,9279-1,Respiratory rate,Patient/+,Encounter/+,,https://snomed.info/sct,258984001,Breaths/minute (qualifier value),,,,,
+vital_systolicbp,,,http://terminology.hl7.org/CodeSystem/observation-category,vital-signs,Vital Signs,,https://loinc.org,8480-6,Systolic blood pressure,Patient/+,Encounter/+,,http://unitsofmeasure,mm[Hg],MilliMetersOfMercury,,,,,
+vital_diastolicbp,,,http://terminology.hl7.org/CodeSystem/observation-category,vital-signs,Vital Signs,,https://loinc.org,8462-4,Diastolic blood pressure,Patient/+,Encounter/+,,http://unitsofmeasure,mm[Hg],MilliMetersOfMercury,,,,,
+vital_spo2,,,http://terminology.hl7.org/CodeSystem/observation-category,vital-signs,Vital Signs,,https://loinc.org,59408-5,Oxygen saturation in Arterial blood by Pulse oximetry,Patient/+,Encounter/+,,http://unitsofmeasure,%,Percent,,,,,
+vital_fio2spo2_02110,,,http://terminology.hl7.org/CodeSystem/observation-category,vital-signs,Vital Signs,,https://loinc.org,3150-0,Inhaled oxygen concentration,Patient/+,Encounter/+,,,,,,,,,
+vital_fio2spo2_pcnt,,,http://terminology.hl7.org/CodeSystem/observation-category,vital-signs,Vital Signs,,https://loinc.org,3150-0,Inhaled oxygen concentration,Patient/+,Encounter/+,,http://unitsofmeasure,%,Percent,,,,,
+vital_capillaryr,"1, Yes",,http://terminology.hl7.org/CodeSystem/observation-category,vital-signs,Vital Signs,,https://snomed.info/sct,50427001,Increased capillary filling time (finding),Patient/+,Encounter/+,,,,,https://snomed.info/sct,373066001,Yes,,
+,"0, No",,http://terminology.hl7.org/CodeSystem/observation-category,vital-signs,Vital Signs,,https://snomed.info/sct,50427001,Increased capillary filling time (finding),Patient/+,Encounter/+,,,,,https://snomed.info/sct,373067005,No,,
+,"99, Unknown",,http://terminology.hl7.org/CodeSystem/observation-category,vital-signs,Vital Signs,,https://snomed.info/sct,50427001,Increased capillary filling time (finding),Patient/+,Encounter/+,,,,,https://snomed.info/sct,261665006,Unknown,,
+vital_avpu,"1, Alert",,http://terminology.hl7.org/CodeSystem/observation-category,vital-signs,Vital Signs,,https://snomed.info/sct,1.10444E+15,Alert Confusion Voice Pain Unresponsive scale score (observable entity),Patient/+,Encounter/+,,,,,https://snomed.info/sct,271591004,Fully conscious (finding),,
+,"5, Confusion",,http://terminology.hl7.org/CodeSystem/observation-category,vital-signs,Vital Signs,,https://snomed.info/sct,1.10444E+15,Alert Confusion Voice Pain Unresponsive scale score (observable entity),Patient/+,Encounter/+,,,,,https://snomed.info/sct,40917007,Clouded consciousness (finding),,
+,"2, Verbal",,http://terminology.hl7.org/CodeSystem/observation-category,vital-signs,Vital Signs,,https://snomed.info/sct,1.10444E+15,Alert Confusion Voice Pain Unresponsive scale score (observable entity),Patient/+,Encounter/+,,,,,https://snomed.info/sct,300202002,Responds to voice (finding),,
+,"3, Pain",,http://terminology.hl7.org/CodeSystem/observation-category,vital-signs,Vital Signs,,https://snomed.info/sct,1.10444E+15,Alert Confusion Voice Pain Unresponsive scale score (observable entity),Patient/+,Encounter/+,,,,,https://snomed.info/sct,450847001,Responds to pain (finding),,
+,"4, Unresponsive",,http://terminology.hl7.org/CodeSystem/observation-category,vital-signs,Vital Signs,,https://snomed.info/sct,1.10444E+15,Alert Confusion Voice Pain Unresponsive scale score (observable entity),Patient/+,Encounter/+,,,,,https://snomed.info/sct,422768004,Unresponsive (finding),,
+vital_gcs,,,http://terminology.hl7.org/CodeSystem/observation-category,vital-signs,Vital Signs,,https://snomed.info/sct,9269-2,Glasgow coma score total,Patient/+,Encounter/+,,,,,,,,,
+vital_urineflow,,,http://terminology.hl7.org/CodeSystem/observation-category,vital-signs,Vital Signs,,https://loinc.org,9192-6,Urine output 24 hour,Patient/+,Encounter/+,,https://snomed.info/sct,258861009,Millilitre/24 hours (qualifier value),,,,,
\ No newline at end of file
diff --git a/tests/dummy_data/vital_signs_dummy_data.csv b/tests/dummy_data/vital_signs_dummy_data.csv
new file mode 100644
index 0000000..b03903b
--- /dev/null
+++ b/tests/dummy_data/vital_signs_dummy_data.csv
@@ -0,0 +1,4 @@
+subjid,visitid,daily_date,vital_highesttem_c,vital_hr,vital_rr,vital_systolicbp,vital_diastolicbp,vital_spo2,vital_fio2spo2_02110,vital_fio2spo2_pcnt,vital_capillaryr,vital_avpu,vital_gcs,vital_urineflow
+1,10,2020-01-01,36.2,120,30,70,120,5,,75,1,1,1,150
+2,11,2021-02-02,37,100,40,80,130,6,10,85,0,2,1,200
+3,12,2022-03-03,35.5,70,50,90,140,7,,95,0,3,1,
\ No newline at end of file
diff --git a/tests/test_condition_resource.py b/tests/test_condition_resource.py
index 958d224..208370f 100644
--- a/tests/test_condition_resource.py
+++ b/tests/test_condition_resource.py
@@ -6,6 +6,21 @@
 
 CONDITION_DICT_INPUT = {
     "id": "c201",
+    "extension": [
+        {
+            "url": "presenceAbsence",
+            "valueCodeableConcept": {
+                "coding": [
+                    {
+                        "system": "http://snomed.info/sct",
+                        "code": "410605003",
+                        "display": "Present",
+                    }
+                ]
+            },
+        },
+        {"url": "prespecifiedQuery", "valueBoolean": True},
+    ],
     "identifier": [{"value": "12345"}],
     "clinicalStatus": {
         "coding": [
@@ -90,6 +105,9 @@
 
 CONDITION_FLAT = {
     "resourceType": ["Condition"],
+    "extension.presenceAbsence.code": ["http://snomed.info/sct|410605003"],
+    "extension.presenceAbsence.text": ["Present"],
+    "extension.prespecifiedQuery": [True],
     "category.code": [
        [
            "http://snomed.info/sct|55607006",
@@ -111,6 +129,21 @@
 }
 
 CONDITION_DICT_OUT = {
+    "extension": [
+        {"url": "prespecifiedQuery", "valueBoolean": True},
+        {
+            "url": "presenceAbsence",
+            "valueCodeableConcept": {
+                "coding": [
+                    {
+                        "system": "http://snomed.info/sct",
+                        "code": "410605003",
+                        "display": "Present",
+                    }
+                ]
+            },
+        },
+    ],
     "clinicalStatus": {
         "coding": [
             {
@@ -177,6 +210,7 @@ def test_condition_to_flat():
     assert_frame_equal(
         pd.read_parquet("test_condition.parquet"),
         pd.DataFrame(CONDITION_FLAT),
+        check_like=True,
     )
     os.remove("test_condition.parquet")
 
diff --git a/tests/test_encounter_resource.py
b/tests/test_encounter_resource.py index fac7f83..260435d 100644 --- a/tests/test_encounter_resource.py +++ b/tests/test_encounter_resource.py @@ -176,6 +176,7 @@ ENCOUNTER_FLAT = { "resourceType": "Encounter", + "id": "f203", "extension.timingPhase.code": "http://snomed.info/sct|278307001", "extension.timingPhase.text": "on admission", "extension.relativePeriod.relativeStart": 2, @@ -236,6 +237,7 @@ ENCOUNTER_DICT_OUT = { "resourceType": "Encounter", + "id": "f203", "status": "completed", "extension": [ { diff --git a/tests/test_ingest.py b/tests/test_ingest.py new file mode 100644 index 0000000..5d6716a --- /dev/null +++ b/tests/test_ingest.py @@ -0,0 +1,487 @@ +from fhirflat.ingest import ( + create_dictionary, + convert_data_to_flat, +) +from fhirflat.resources.encounter import Encounter +from fhirflat.resources.observation import Observation +import pandas as pd +from pandas.testing import assert_frame_equal +import os +import shutil +from decimal import Decimal +import numpy as np + + +ENCOUNTER_DICT_OUT = { + "id": 11, + "subject": "Patient/2", + "actualPeriod.start": "2021-04-01 18:00", + "actualPeriod.end": "2021-04-10", + "extension.timingPhase.system": "https://snomed.info/sct", + "extension.timingPhase.code": 278307001, + "extension.timingPhase.text": "On admission (qualifier value)", + "class.system": "https://snomed.info/sct", + "class.code": 32485007, + "class.text": "Hospital admission (procedure)", + "diagnosis.condition.concept.system": [ + "https://snomed.info/sct", + "https://snomed.info/sct", + ], + "diagnosis.condition.concept.code": [38362002, 722863008], + "diagnosis.condition.concept.text": [ + "Dengue (disorder)", + "Dengue with warning signs (disorder)", + ], + "diagnosis.use.system": ["https://snomed.info/sct", "https://snomed.info/sct"], + "diagnosis.use.code": [89100005, 89100005], + "diagnosis.use.text": [ + "Final diagnosis (discharge) (contextual qualifier) (qualifier value)", + "Final diagnosis (discharge) (contextual qualifier) (qualifier value)", + ], + "admission.dischargeDisposition.system": "https://snomed.info/sct", + "admission.dischargeDisposition.code": 371827001, + "admission.dischargeDisposition.text": "Patient discharged alive (finding)", +} + + +def test_create_dict_one_to_one_single_row(): + df = create_dictionary( + "tests/dummy_data/encounter_dummy_data_single.csv", + "tests/dummy_data/encounter_dummy_mapping.csv", + "Encounter", + one_to_one=True, + ) + + assert df is not None + dict_out = df["flat_dict"][0] + + assert dict_out == ENCOUNTER_DICT_OUT + + +ENCOUNTER_SINGLE_ROW_FLAT = { + "resourceType": "Encounter", + "id": "11", + "class.code": "https://snomed.info/sct|32485007", + "class.text": "Hospital admission (procedure)", + "diagnosis_dense": [ + { + "condition": [ + { + "concept": { + "coding": [ + { + "code": "38362002", + "display": "Dengue (disorder)", + "system": "https://snomed.info/sct", + } + ] + } + } + ], + "use": [ + { + "coding": [ + { + "code": "89100005", + "display": "Final diagnosis (discharge) (contextual qualifier) (qualifier value)", # noqa: E501 + "system": "https://snomed.info/sct", + } + ] + } + ], + }, + { + "condition": [ + { + "concept": { + "coding": [ + { + "system": "https://snomed.info/sct", + "code": "722863008", + "display": "Dengue with warning signs (disorder)", + } + ] + } + } + ], + "use": [ + { + "coding": [ + { + "code": "89100005", + "display": "Final diagnosis (discharge) (contextual qualifier) (qualifier value)", # noqa: E501 + "system": "https://snomed.info/sct", + } + ] + } + ], + }, + ], + 
"subject": "Patient/2", + "actualPeriod.start": "2021-04-01 18:00:00", + "actualPeriod.end": "2021-04-10", + "admission.dischargeDisposition.code": "https://snomed.info/sct|371827001", + "admission.dischargeDisposition.text": "Patient discharged alive (finding)", + "extension.timingPhase.code": ["https://snomed.info/sct|278307001"], + "extension.timingPhase.text": ["On admission (qualifier value)"], +} + + +def test_load_data_one_to_one_single_row(): + df = create_dictionary( + "tests/dummy_data/encounter_dummy_data_single.csv", + "tests/dummy_data/encounter_dummy_mapping.csv", + "Encounter", + one_to_one=True, + ) + + assert df is not None + Encounter.ingest_to_flat(df, "encounter_ingestion_single") + + assert_frame_equal( + pd.read_parquet("encounter_ingestion_single.parquet"), + pd.DataFrame([ENCOUNTER_SINGLE_ROW_FLAT], index=[0]), + check_dtype=False, + ) + os.remove("encounter_ingestion_single.parquet") + + +ENCOUNTER_SINGLE_ROW_MULTI = { + "resourceType": ["Encounter", "Encounter", "Encounter", "Encounter"], + "class.code": [ + "https://snomed.info/sct|371883000", + "https://snomed.info/sct|32485007", + "https://snomed.info/sct|32485007", + "https://snomed.info/sct|32485007", + ], + "class.text": [ + "Outpatient procedure (procedure)", + "Hospital admission (procedure)", + "Hospital admission (procedure)", + "Hospital admission (procedure)", + ], + "diagnosis_dense": [ + None, + [ + { + "condition": [ + { + "concept": { + "coding": [ + { + "code": "38362002", + "display": "Dengue (disorder)", + "system": "https://snomed.info/sct", + } + ] + } + } + ], + "use": [ + { + "coding": [ + { + "code": "89100005", + "display": "Final diagnosis (discharge) (contextual qualifier) (qualifier value)", # noqa: E501 + "system": "https://snomed.info/sct", + } + ] + } + ], + }, + { + "condition": [ + { + "concept": { + "coding": [ + { + "code": "722863008", + "display": "Dengue with warning signs (disorder)", + "system": "https://snomed.info/sct", + } + ] + } + } + ], + "use": [ + { + "coding": [ + { + "code": "89100005", + "display": "Final diagnosis (discharge) (contextual qualifier) (qualifier value)", # noqa: E501 + "system": "https://snomed.info/sct", + } + ] + } + ], + }, + ], + [ + { + "condition": [ + { + "concept": { + "coding": [ + { + "code": "38362002", + "display": "Dengue (disorder)", + "system": "https://snomed.info/sct", + } + ] + } + } + ], + "use": [ + { + "coding": [ + { + "code": "89100005", + "display": "Final diagnosis (discharge) (contextual qualifier) (qualifier value)", # noqa: E501 + "system": "https://snomed.info/sct", + } + ] + } + ], + }, + { + "condition": [ + { + "concept": { + "coding": [ + { + "code": "722862003", + "display": "Dengue without warning signs (disorder)", # noqa: E501 + "system": "https://snomed.info/sct", + } + ] + } + } + ], + "use": [ + { + "coding": [ + { + "code": "89100005", + "display": "Final diagnosis (discharge) (contextual qualifier) (qualifier value)", # noqa: E501 + "system": "https://snomed.info/sct", + } + ] + } + ], + }, + ], + None, + ], + "diagnosis.condition.concept.text": [ + None, + None, + None, + ["Malaria"], + ], + "diagnosis.use.code": [ + None, + None, + None, + ["https://snomed.info/sct|89100005"], + ], + "diagnosis.use.text": [ + None, + None, + None, + ["Final diagnosis (discharge) (contextual qualifier) (qualifier value)"], + ], + "subject": ["Patient/1", "Patient/2", "Patient/3", "Patient/4"], + "id": ["10", "11", "12", "13"], + "actualPeriod.start": [ + "2020-05-01", + "2021-04-01 18:00:00", + "2021-05-10 17:30:00", + 
"2022-06-15 21:00:00", + ], + "actualPeriod.end": [ + "2020-05-01", + "2021-04-10", + "2021-05-15", + "2022-06-20", + ], + "admission.dischargeDisposition.code": [ + "https://snomed.info/sct|371827001", + "https://snomed.info/sct|371827001", + "https://snomed.info/sct|419099009", + "https://snomed.info/sct|32485007", + ], + "admission.dischargeDisposition.text": [ + "Patient discharged alive (finding)", + "Patient discharged alive (finding)", + "Dead (finding)", + "Hospital admission (procedure)", + ], + "extension.timingPhase.code": [ + ["https://snomed.info/sct|281379000"], + ["https://snomed.info/sct|278307001"], + ["https://snomed.info/sct|278307001"], + ["https://snomed.info/sct|278307001"], + ], + "extension.timingPhase.text": [ + ["Pre-admission (qualifier value)"], + ["On admission (qualifier value)"], + ["On admission (qualifier value)"], + ["On admission (qualifier value)"], + ], +} + + +def test_load_data_one_to_one_multi_row(): + df = create_dictionary( + "tests/dummy_data/encounter_dummy_data_multi.csv", + "tests/dummy_data/encounter_dummy_mapping.csv", + "Encounter", + one_to_one=True, + ) + + assert df is not None + Encounter.ingest_to_flat(df, "encounter_ingestion_multi") + + assert_frame_equal( + pd.read_parquet("encounter_ingestion_multi.parquet"), + pd.DataFrame(ENCOUNTER_SINGLE_ROW_MULTI), + check_dtype=False, + check_like=True, + ) + os.remove("encounter_ingestion_multi.parquet") + + +OBS_FLAT = { + "resourceType": [ + "Observation", + "Observation", + "Observation", + "Observation", + "Observation", + ], + "category.code": [ + "http://terminology.hl7.org/CodeSystem/observation-category|vital-signs", + "http://terminology.hl7.org/CodeSystem/observation-category|vital-signs", + "http://terminology.hl7.org/CodeSystem/observation-category|vital-signs", + "http://terminology.hl7.org/CodeSystem/observation-category|vital-signs", + "http://terminology.hl7.org/CodeSystem/observation-category|vital-signs", + ], + "category.text": [ + "Vital Signs", + "Vital Signs", + "Vital Signs", + "Vital Signs", + "Vital Signs", + ], + "effectiveDateTime": [ + "2020-01-01", + "2021-02-02", + "2022-03-03", + "2020-01-01", + "2021-02-02", + ], + "code.code": [ + "https://loinc.org|8310-5", + "https://loinc.org|8310-5", + "https://loinc.org|8310-5", + "https://loinc.org|8867-4", + "https://loinc.org|8867-4", + ], + "code.text": [ + "Body temperature", + "Body temperature", + "Body temperature", + "Heart rate", + "Heart rate", + ], + "subject": ["Patient/1", "Patient/2", "Patient/3", "Patient/1", "Patient/2"], + "encounter": [ + "Encounter/10", + "Encounter/11", + "Encounter/12", + "Encounter/10", + "Encounter/11", + ], + "valueQuantity.value": [Decimal("36.2"), 37.0, 35.5, 120.0, 100.0], + "valueQuantity.unit": [ + "DegreesCelsius", + "DegreesCelsius", + "DegreesCelsius", + "Beats/minute (qualifier value)", + "Beats/minute (qualifier value)", + ], + "valueQuantity.code": [ + "http://unitsofmeasure|Cel", + "http://unitsofmeasure|Cel", + "http://unitsofmeasure|Cel", + "https://snomed.info/sct|258983007", + "https://snomed.info/sct|258983007", + ], + "valueCodeableConcept.code": [None, None, None, None, None], + "valueCodeableConcept.text": [None, None, None, None, None], + "valueInteger": [np.nan, np.nan, np.nan, np.nan, np.nan], +} + + +def test_load_data_one_to_many_multi_row(): + df = create_dictionary( + "tests/dummy_data/vital_signs_dummy_data.csv", + "tests/dummy_data/observation_dummy_mapping.csv", + "Observation", + one_to_one=False, + ) + + assert df is not None + 
Observation.ingest_to_flat(df.dropna(), "observation_ingestion") + + full_df = pd.read_parquet("observation_ingestion.parquet") + + assert len(full_df) == 33 + + df_head = full_df.head(5) + + assert_frame_equal( + df_head, + pd.DataFrame(OBS_FLAT), + check_dtype=False, + check_like=True, + ) + os.remove("observation_ingestion.parquet") + + +def test_convert_data_to_flat_local_mapping(): + output_folder = "tests/ingestion_output" + mappings = { + Encounter: "tests/dummy_data/encounter_dummy_mapping.csv", + Observation: "tests/dummy_data/observation_dummy_mapping.csv", + } + resource_types = {"Encounter": "one-to-one", "Observation": "one-to-many"} + + convert_data_to_flat( + "tests/dummy_data/combined_dummy_data.csv", + mapping_files_types=(mappings, resource_types), + folder_name=output_folder, + ) + + encounter_df = pd.read_parquet("tests/ingestion_output/encounter.parquet") + obs_df = pd.read_parquet("tests/ingestion_output/observation.parquet") + + assert_frame_equal( + encounter_df, + pd.DataFrame(ENCOUNTER_SINGLE_ROW_MULTI), + check_dtype=False, + check_like=True, + ) + + assert len(obs_df) == 33 + + obs_df_head = obs_df.head(5) + + assert_frame_equal( + obs_df_head, + pd.DataFrame(OBS_FLAT), + check_dtype=False, + check_like=True, + ) + + shutil.rmtree(output_folder)
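
Usage sketch for the reworked cleanup() entry point: the resource classes in this patch now accept either a JSON string (the FHIRflat file path) or an already-parsed dict from the ingestion pipeline, switched by the json_data flag. A minimal sketch of both call styles, using hypothetical example data; only the signatures come from this patch, and the rest of the pipeline normally supplies any remaining required fields:

    import orjson
    from fhirflat.resources.researchsubject import ResearchSubject

    flat_row = {"study": "ResearchStudy/s1", "subject": "Patient/p1"}  # hypothetical data

    # dict handed over by the ingestion pipeline
    subject = ResearchSubject.cleanup(flat_row, json_data=False)

    # JSON string as read from a FHIRflat file (the default path)
    subject = ResearchSubject.cleanup(orjson.dumps(flat_row).decode())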
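
Mapping-file layout: in encounter_dummy_mapping.csv and observation_dummy_mapping.csv above, a variable name appears only on the first row of its group, and the following rows with a blank raw_variable hold the remaining coded responses for that variable. A rough pandas sketch of the lookup this layout encodes, illustrative only and not fhirflat's actual ingest implementation:

    import pandas as pd

    mapping = pd.read_csv("tests/dummy_data/encounter_dummy_mapping.csv")
    # blank raw_variable cells inherit the variable named above them
    mapping["raw_variable"] = mapping["raw_variable"].ffill()

    # e.g. outco_outcome response 4 carries the "Dead (finding)" disposition
    row = mapping[
        (mapping["raw_variable"] == "outco_outcome")
        & (mapping["raw_response"].str.startswith("4,", na=False))
    ].iloc[0]
    print(row["admission.dischargeDisposition.code"])  # 419099009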
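
Resource lookup: the new get_local_resource helper in fhirflat/util.py resolves a resource name to the class exported at the top level of the fhirflat package (a plain getattr, so the name must be exported there):

    from fhirflat.util import get_local_resource

    Encounter = get_local_resource("Encounter")  # fhirflat's Encounter class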