From b7f08481219afd463967ca78b4ab31749c770c42 Mon Sep 17 00:00:00 2001 From: Pip Liggins Date: Wed, 8 May 2024 17:19:37 +0100 Subject: [PATCH 01/21] Draft attempt at ingestion using mapping file - Reads data into class then back out again, probably not very efficient. - Not tested with multiple rows of input data - Can't cope with multiple columns trying to set the same variable (need logic in mapping file for this) --- fhirflat/flat2fhir.py | 28 +++- fhirflat/ingest.py | 124 ++++++++++++++++++ fhirflat/resources/base.py | 43 +++++- fhirflat/resources/encounter.py | 5 +- .../encounter_dummy_data_single.csv | 2 + tests/dummy_data/encounter_dummy_mapping.csv | 25 ++++ tests/test_ingest.py | 14 ++ 7 files changed, 230 insertions(+), 11 deletions(-) create mode 100644 fhirflat/ingest.py create mode 100644 tests/dummy_data/encounter_dummy_data_single.csv create mode 100644 tests/dummy_data/encounter_dummy_mapping.csv create mode 100644 tests/test_ingest.py diff --git a/fhirflat/flat2fhir.py b/fhirflat/flat2fhir.py index 62dcf2f..237ffd3 100644 --- a/fhirflat/flat2fhir.py +++ b/fhirflat/flat2fhir.py @@ -15,6 +15,21 @@ def create_codeable_concept( old_dict: dict[str, list[str] | str], name: str ) -> dict[str, list[str]]: """Re-creates a codeableConcept structure from the FHIRflat representation.""" + + # for reading in from ingestion pipeline + if (name + ".code" and name + ".system") in old_dict: + new_dict = { + "coding": [ + { + "system": old_dict[name + ".system"], + "code": str(int(old_dict[name + ".code"])), + "display": old_dict[name + ".text"], + } + ] + } + return new_dict + + # From FHIRflat file codes = old_dict.get(name + ".code") if codes is None: @@ -193,15 +208,18 @@ def expand_concepts(data: dict, data_class: type[_DomainResource]) -> dict: if all(isinstance(v, dict) for v in v_dict.values()): # coming back out of nested recursion expanded[k] = {s.split(".", 1)[1]: v_dict[s] for s in v_dict} - if data_class.schema()["properties"][k].get("type") == "array": 
- if k == "extension": - expanded[k] = [v for v in expanded[k].values()] - else: - expanded[k] = [expanded[k]] else: expanded[k] = set_datatypes(k, v_dict, group_classes[k]) + if isinstance(data_class, list): + continue + elif data_class.schema()["properties"][k].get("type") == "array": + if k == "extension": + expanded[k] = [v for v in expanded[k].values()] + else: + expanded[k] = [expanded[k]] + dense_cols = { k: k.removesuffix("_dense") for k in data.keys() if k.endswith("_dense") } diff --git a/fhirflat/ingest.py b/fhirflat/ingest.py new file mode 100644 index 0000000..cbe8b4d --- /dev/null +++ b/fhirflat/ingest.py @@ -0,0 +1,124 @@ +""" +Stores the main functions for converting clinical data (initially from RedCap-ARCH) to +FHIRflat. + +Assumes two files are provided: one with the clinical data and one containing the +mappings. PL: Actually, maybe rather than the mappings it's either a file or a +dictionary showing the location of each mapping file (one per resource type). + +TODO: Eventually, this should link to a google sheet file that contains the mappings +""" + +import pandas as pd +import numpy as np + +# 1:1 (single row, single resource) mapping: Patient, Encounter +# 1:M (single row, multiple resources) mapping: Observation, Condition, Procedure, ... + +""" +1. Create one input-data dataframe per resource type, using the column names from +the mapping file + +2. For 1:1 mappings: use an apply function to create a fhir-like (or maybe +fhir-flat-like?) input data dictionary in one column, then a resource object in another. +Then follow format similar to fhir_file_to_flat to create the flat representation. + +3. For 1:M mappings: (PL: not sure about this) Group columns by single_resource column +(to be created in the mapping file), explode the dataframe by these groups, then follow +the 1:1 process. +""" + +""" +TODO +* cope with 'if' statements - e.g. for date overwriting. 
+* deal with duplicates/how to add multiple values to a single field +""" + + +def create_dictionary(data, map_file): + """ + Given a data file and a single mapping file for one FHIR resource type, + returns a single column dataframe with the mapped data in a FHIRflat-like + format, ready for further processing. + """ + + data = pd.read_csv(data, header=0) + map_df = pd.read_csv(map_file, header=0) + + filtered_data = data[map_df["redcap_variable"].dropna().unique()] + + # Fills the na redcap variables with the previous value + map_df["redcap_variable"] = map_df["redcap_variable"].ffill() + + # strips the text answers out of the redcap_response column + map_df["redcap_response"] = map_df["redcap_response"].apply( + lambda x: x.split(",")[0] if isinstance(x, str) else x + ) + + # Set multi-index for easier access + map_df.set_index(["redcap_variable", "redcap_response"], inplace=True) + + def create_dict_from_row(row): + """ + Iterates through the columns of the row, applying the mapping to each column + and produces a fhirflat-like dictionary to initialize the resource object. 
+ """ + + def find_field_value(row, response, map): + """Returns the data for a given field, given the map.""" + if map == "": + return response + elif "+" in map: + map = map.split("+") + results = [find_field_value(row, response, m) for m in map] + results = [x for x in results if x == x] + return "".join(results) + else: + col = map.lstrip("<").rstrip(">") + return row[col] + + result = {} + for column in row.index: + if column in map_df.index.get_level_values(0): + response = row[column] + if pd.notna(response): # Ensure there is a response to map + try: + # Retrieve the mapping for the given column and response + if pd.isna(map_df.loc[column].index).all(): + mapping = map_df.loc[(column, np.nan)].dropna() + else: + mapping = map_df.loc[(column, str(response))].dropna() + snippet = { + k: ( + v + if "<" not in str(v) + else find_field_value(row, response, v) + ) + for k, v in mapping.items() + } + except KeyError: + # No mapping found for this column and response + result[column] = f"No mapping for response {response}" + else: + continue + else: + raise ValueError(f"Column {column} not found in mapping file") + if not set(result.keys()).intersection(snippet.keys()): + result = result | snippet + else: + raise ValueError( + "Duplicate keys in mapping:" + f" {set(result.keys()).intersection(snippet.keys())}" + ) + return result + + # Apply the function across the DataFrame rows + filtered_data["flat_dict"] = filtered_data.apply(create_dict_from_row, axis=1) + return filtered_data + + +def load_data(data, mapping_files, resource_type, file_name): + + df = create_dictionary(data, mapping_files) + + resource_type.ingest_to_flat(df, file_name) diff --git a/fhirflat/resources/base.py b/fhirflat/resources/base.py index 015a0b4..56e36f3 100644 --- a/fhirflat/resources/base.py +++ b/fhirflat/resources/base.py @@ -39,7 +39,7 @@ def flat_fields(cls) -> list[str]: return [x for x in cls.elements_sequence() if x not in cls.flat_exclusions] @classmethod - def cleanup(cls, 
data: JsonString) -> FHIRFlatBase: + def cleanup(cls, data: JsonString | dict, json_data=True) -> FHIRFlatBase: """ Load data into a dictionary-like structure, then apply resource-specific changes and unpack flattened data @@ -73,6 +73,31 @@ def from_flat(cls, file: str) -> FHIRFlatBase | list[FHIRFlatBase]: else: return list(df["fhir"]) + @classmethod + def ingest_to_flat(cls, data: pd.DataFrame, filename: str): + """ + Takes a pandas dataframe and populates the resource with the data. + + data: pd.DataFrame + Pandas dataframe containing the data + + Returns + ------- + FHIRFlatBase or list[FHIRFlatBase] + """ + + # Creates a column of FHIR resource instances + data["fhir"] = data["flat_dict"].apply( + lambda x: cls.cleanup(x, json_data=False) + ) + + data["fhir_flat"] = data["fhir"].apply(lambda x: x.to_flat()) + + # get the flat dataframe out into it's own variable + flat_df = pd.concat(data["fhir_flat"].tolist(), ignore_index=True) + + flat_df.to_parquet(f"{filename}.parquet") + @classmethod def fhir_bulk_import(cls, file: str) -> FHIRFlatBase | list[FHIRFlatBase]: """ @@ -110,7 +135,8 @@ def fhir_file_to_flat(cls, source_file: str, output_name: str | None = None): source_file: str Path to the FHIR resource file. output_name: str (optional) - Name of the parquet file to be generated, optional, defaults to {resource}.parquet + Name of the parquet file to be generated, optional, defaults to + {resource}.parquet """ if not output_name: @@ -128,11 +154,17 @@ def fhir_file_to_flat(cls, source_file: str, output_name: str | None = None): flat_rows.append(fhir2flat(resource, lists=list_resources)) df = pd.concat(flat_rows) + + # remove required attributes now it's in the flat representation + for attr in cls.flat_defaults: + df.drop(list(df.filter(regex=attr)), axis=1, inplace=True) + df.to_parquet(output_name) - def to_flat(self, filename: str) -> None: + def to_flat(self, filename: str | None = None) -> None: """ Generates a FHIRflat parquet file from the resource. 
+ If no file name is provided, returns the pandas dataframe. Parameters ---------- @@ -154,4 +186,7 @@ def to_flat(self, filename: str) -> None: for attr in self.flat_defaults: flat_df.drop(list(flat_df.filter(regex=attr)), axis=1, inplace=True) - flat_df.to_parquet(filename) + if filename: + flat_df.to_parquet(filename) + else: + return flat_df diff --git a/fhirflat/resources/encounter.py b/fhirflat/resources/encounter.py index b6f386d..d0cc054 100644 --- a/fhirflat/resources/encounter.py +++ b/fhirflat/resources/encounter.py @@ -60,13 +60,14 @@ def validate_extension_contents(cls, extensions): return extensions @classmethod - def cleanup(cls, data: JsonString) -> Encounter: + def cleanup(cls, data: JsonString | dict, json_data=True) -> Encounter: """ Load data into a dictionary-like structure, then apply resource-specific changes and unpack flattened data like codeableConcepts back into structured data. """ - data = orjson.loads(data) + if json_data: + data = orjson.loads(data) for field in { "subject", diff --git a/tests/dummy_data/encounter_dummy_data_single.csv b/tests/dummy_data/encounter_dummy_data_single.csv new file mode 100644 index 0000000..0c1560e --- /dev/null +++ b/tests/dummy_data/encounter_dummy_data_single.csv @@ -0,0 +1,2 @@ +usubjid,visitid,dates_enrolment,dates_adm,dates_admdate,dates_admtime,non_encounter_field,outco_denguediag,outco_denguediag_main,outco_denguediag_class,outco_not_dengue,outco_secondiag_oth,outco_date,outco_outcome +2,11,,1,2021-04-01,,fish,1,,2,,,2021-04-10,1 \ No newline at end of file diff --git a/tests/dummy_data/encounter_dummy_mapping.csv b/tests/dummy_data/encounter_dummy_mapping.csv new file mode 100644 index 0000000..5d88a4e --- /dev/null +++ b/tests/dummy_data/encounter_dummy_mapping.csv @@ -0,0 +1,25 @@ 
+redcap_variable,redcap_response,subject,class.system,class.code,class.text,actualPeriod.start,actualPeriod.end,reason.value.concept.system,reason.value.concept.code,reason.value.concept.text,reason.use.system,reason.use.code,reason.use.text,diagnosis.condition.concept.system,diagnosis.condition.concept.code,diagnosis.condition.concept.text,diagnosis.use.system,diagnosis.use.code,diagnosis.use.text,admission.dischargeDisposition.system,admission.dischargeDisposition.code,admission.dischargeDisposition.text +usubjid,,,,,,,,,,,,,,,,,,,,,, +dates_enrolment,,,,,,,,,,,,,,,,,,,,,, +dates_adm,"1, Yes",,https://snomed.info/sct,32485007,Hospital admission (procedure),,,,,,,,,,,,,,,,, +,"0, No",,https://snomed.info/sct,32485007,Hospital admission (procedure),,,,,,,,,,,,,,,,, +,,,https://snomed.info/sct,371883000,Outpatient procedure (procedure),,,,,,,,,,,,,,,,, +,"99, Unknown",,https://snomed.info/sct,32485007,Hospital admission (procedure),,,,,,,,,,,,,,,,, +dates_admdate,,,,,,+,,,,,,,,,,,,,,,, +dates_admtime,,,,,,+,,,,,,,,,,,,,,,, +outco_denguediag,"1, Yes",,,,,,,https://snomed.info/sct,38362002,Dengue (disorder),https://snomed.info/sct,89100005,Final diagnosis (discharge) (contextual qualifier) (qualifier value),,,,,,,,, +,"0, No",,,,,,,,,,,,,,,,,,,,, +,"99, Unknown",,,,,,,https://snomed.info/sct,261665006,Unknown (qualifier value),,,,,,,,,,,, +outco_denguediag_main,,,,,,,,,,,,,,,,,https://snomed.info/sct,89100005,Final diagnosis (discharge) (contextual qualifier) (qualifier value),,, +outco_denguediag_class,"1, Uncomplicated dengue",,,,,,,,,,,,,https://snomed.info/sct,722862003,Dengue without warning signs (disorder),https://snomed.info/sct,89100005,Final diagnosis (discharge) (contextual qualifier) (qualifier value),,, +,"2, Dengue with warning signs",,,,,,,,,,,,,https://snomed.info/sct,722863008,Dengue with warning signs (disorder),https://snomed.info/sct,89100005,Final diagnosis (discharge) (contextual qualifier) (qualifier value),,, +,"3, Severe 
dengue",,,,,,,,,,,,,https://snomed.info/sct,20927009,Dengue hemorrhagic fever (disorder),https://snomed.info/sct,89100005,Final diagnosis (discharge) (contextual qualifier) (qualifier value),,, +outco_secondiag_oth,,,,,,,,,,,,,,,,,https://snomed.info/sct,85097005,Secondary diagnosis (contextual qualifier) (qualifier value),,, +outco_date,,,,,,,,,,,,,,,,,,,,,, +outco_outcome,"1, Discharged alive",,,,,,,,,,,,,,,,,,,https://snomed.info/sct,371827001,Patient discharged alive (finding) +,"2, Still hospitalised",,,,,,,,,,,,,,,,,,,https://snomed.info/sct,32485007,Hospital admission (procedure) +,"3, Transfer to other facility",,,,,,,,,,,,,,,,,,,https://snomed.info/sct,306685000,Discharge to establishment (procedure) +,"4, Death",,,,,,,,,,,,,,,,,,,https://snomed.info/sct,419099009,Dead (finding) +,"5, Palliative care",,,,,,,,,,,,,,,,,,,https://snomed.info/sct,306237005,Referral to palliative care service (procedure) +,"6, Discharged against medical advice",,,,,,,,,,,,,,,,,,,https://snomed.info/sct,225928004,Patient self-discharge against medical advice (procedure) +,"7, Alive, not admitted",,,,,,,,,,,,,,,,,,,https://snomed.info/sct,371827001,Patient discharged alive (finding) \ No newline at end of file diff --git a/tests/test_ingest.py b/tests/test_ingest.py new file mode 100644 index 0000000..8b9c183 --- /dev/null +++ b/tests/test_ingest.py @@ -0,0 +1,14 @@ +from fhirflat.ingest import load_data +from fhirflat.resources.encounter import Encounter +import pandas as pd + + +def test_load_data(): + load_data( + "tests/dummy_data/encounter_dummy_data_single.csv", + "tests/dummy_data/encounter_dummy_mapping.csv", + Encounter, + "encounter_ingestion_single", + ) + + pd.read_parquet("encounter_ingestion_single.parquet") From dd6150028d040f2b64893de834fa22deb472ce78 Mon Sep 17 00:00:00 2001 From: Pip Liggins Date: Mon, 13 May 2024 11:53:55 +0100 Subject: [PATCH 02/21] Ignore duplicate keys if values are the same --- fhirflat/ingest.py | 18 ++++++++++++------ 
.../dummy_data/encounter_dummy_data_single.csv | 2 +- tests/test_ingest.py | 2 +- 3 files changed, 14 insertions(+), 8 deletions(-) diff --git a/fhirflat/ingest.py b/fhirflat/ingest.py index cbe8b4d..6641cfc 100644 --- a/fhirflat/ingest.py +++ b/fhirflat/ingest.py @@ -72,7 +72,7 @@ def find_field_value(row, response, map): map = map.split("+") results = [find_field_value(row, response, m) for m in map] results = [x for x in results if x == x] - return "".join(results) + return " ".join(results) else: col = map.lstrip("<").rstrip(">") return row[col] @@ -103,13 +103,19 @@ def find_field_value(row, response, map): continue else: raise ValueError(f"Column {column} not found in mapping file") - if not set(result.keys()).intersection(snippet.keys()): + duplicate_keys = set(result.keys()).intersection(snippet.keys()) + if not duplicate_keys: result = result | snippet else: - raise ValueError( - "Duplicate keys in mapping:" - f" {set(result.keys()).intersection(snippet.keys())}" - ) + if all( + result[key] == snippet[key] for key in duplicate_keys + ): # Ignore duplicates if they are the same + continue + else: + raise ValueError( + "Duplicate keys in mapping:" + f" {set(result.keys()).intersection(snippet.keys())}" + ) return result # Apply the function across the DataFrame rows diff --git a/tests/dummy_data/encounter_dummy_data_single.csv b/tests/dummy_data/encounter_dummy_data_single.csv index 0c1560e..b7c635d 100644 --- a/tests/dummy_data/encounter_dummy_data_single.csv +++ b/tests/dummy_data/encounter_dummy_data_single.csv @@ -1,2 +1,2 @@ usubjid,visitid,dates_enrolment,dates_adm,dates_admdate,dates_admtime,non_encounter_field,outco_denguediag,outco_denguediag_main,outco_denguediag_class,outco_not_dengue,outco_secondiag_oth,outco_date,outco_outcome -2,11,,1,2021-04-01,,fish,1,,2,,,2021-04-10,1 \ No newline at end of file +2,11,,1,2021-04-01,18:00,fish,1,,2,,,2021-04-10,1 \ No newline at end of file diff --git a/tests/test_ingest.py b/tests/test_ingest.py index 
8b9c183..bf8f0dd 100644 --- a/tests/test_ingest.py +++ b/tests/test_ingest.py @@ -3,7 +3,7 @@ import pandas as pd -def test_load_data(): +def test_load_data_one_to_one_single_row(): load_data( "tests/dummy_data/encounter_dummy_data_single.csv", "tests/dummy_data/encounter_dummy_mapping.csv", From a80c9637b0c79157a60c93971f86190ed24c932f Mon Sep 17 00:00:00 2001 From: Pip Liggins Date: Mon, 13 May 2024 13:13:54 +0100 Subject: [PATCH 03/21] Improve single row 1:1 test without output comparison --- fhirflat/ingest.py | 2 +- tests/test_ingest.py | 30 +++++++++++++++++++++++++++++- 2 files changed, 30 insertions(+), 2 deletions(-) diff --git a/fhirflat/ingest.py b/fhirflat/ingest.py index 6641cfc..547f23f 100644 --- a/fhirflat/ingest.py +++ b/fhirflat/ingest.py @@ -45,7 +45,7 @@ def create_dictionary(data, map_file): data = pd.read_csv(data, header=0) map_df = pd.read_csv(map_file, header=0) - filtered_data = data[map_df["redcap_variable"].dropna().unique()] + filtered_data = data[map_df["redcap_variable"].dropna().unique()].copy() # Fills the na redcap variables with the previous value map_df["redcap_variable"] = map_df["redcap_variable"].ffill() diff --git a/tests/test_ingest.py b/tests/test_ingest.py index bf8f0dd..a3d7ef5 100644 --- a/tests/test_ingest.py +++ b/tests/test_ingest.py @@ -1,6 +1,29 @@ from fhirflat.ingest import load_data from fhirflat.resources.encounter import Encounter import pandas as pd +from pandas.testing import assert_frame_equal +import datetime +import os + + +ENCOUNTER_SINGLE_ROW_FLAT = { + "resourceType": "Encounter", + "class.code": "https://snomed.info/sct|32485007", + "class.text": "Hospital admission (procedure)", + "reason.use.code": "https://snomed.info/sct|89100005", + "reason.use.text": "Final diagnosis (discharge) (contextual qualifier) (qualifier value)", # noqa: E501 + "reason.value.concept.code": "https://snomed.info/sct|38362002", + "reason.value.concept.text": "Dengue (disorder)", + "diagnosis.condition.concept.code": 
"https://snomed.info/sct|722863008", + "diagnosis.condition.concept.text": "Dengue with warning signs (disorder)", + "diagnosis.use.code": "https://snomed.info/sct|89100005", + "diagnosis.use.text": "Final diagnosis (discharge) (contextual qualifier) (qualifier value)", # noqa: E501 + "subject": "2", + "actualPeriod.start": datetime.datetime(2021, 4, 1, 18, 0), + "actualPeriod.end": datetime.date(2021, 4, 10), + "admission.dischargeDisposition.code": "https://snomed.info/sct|371827001", + "admission.dischargeDisposition.text": "Patient discharged alive (finding)", +} def test_load_data_one_to_one_single_row(): @@ -11,4 +34,9 @@ def test_load_data_one_to_one_single_row(): "encounter_ingestion_single", ) - pd.read_parquet("encounter_ingestion_single.parquet") + assert_frame_equal( + pd.read_parquet("encounter_ingestion_single.parquet"), + pd.DataFrame([ENCOUNTER_SINGLE_ROW_FLAT], index=[0]), + check_dtype=False, + ) + os.remove("encounter_ingestion_single.parquet") From 7a456edcbbc12b07c953a86c414ca09c8a8d32d7 Mon Sep 17 00:00:00 2001 From: Pip Liggins Date: Tue, 14 May 2024 11:15:17 +0100 Subject: [PATCH 04/21] Multiple rows of encounter can be read in & out --- fhirflat/ingest.py | 14 ++- fhirflat/resources/base.py | 7 ++ .../dummy_data/encounter_dummy_data_multi.csv | 5 + tests/test_ingest.py | 107 +++++++++++++++++- 4 files changed, 126 insertions(+), 7 deletions(-) create mode 100644 tests/dummy_data/encounter_dummy_data_multi.csv diff --git a/fhirflat/ingest.py b/fhirflat/ingest.py index 547f23f..4e6d9a3 100644 --- a/fhirflat/ingest.py +++ b/fhirflat/ingest.py @@ -11,6 +11,7 @@ import pandas as pd import numpy as np +import warnings # 1:1 (single row, single resource) mapping: Patient, Encounter # 1:M (single row, multiple resources) mapping: Observation, Condition, Procedure, ... @@ -31,7 +32,7 @@ """ TODO * cope with 'if' statements - e.g. for date overwriting. 
-* deal with duplicates/how to add multiple values to a single field +* deal with duplicates/how to add multiple values to a single field - list options. """ @@ -87,7 +88,7 @@ def find_field_value(row, response, map): if pd.isna(map_df.loc[column].index).all(): mapping = map_df.loc[(column, np.nan)].dropna() else: - mapping = map_df.loc[(column, str(response))].dropna() + mapping = map_df.loc[(column, str(int(response)))].dropna() snippet = { k: ( v @@ -97,8 +98,13 @@ def find_field_value(row, response, map): for k, v in mapping.items() } except KeyError: - # No mapping found for this column and response - result[column] = f"No mapping for response {response}" + # No mapping found for this column and response despite presence + # in mapping file + warnings.warn( + f"No mapping for column {column} response {response}", + UserWarning, + ) + continue else: continue else: diff --git a/fhirflat/resources/base.py b/fhirflat/resources/base.py index 56e36f3..516bcc6 100644 --- a/fhirflat/resources/base.py +++ b/fhirflat/resources/base.py @@ -96,6 +96,13 @@ def ingest_to_flat(cls, data: pd.DataFrame, filename: str): # get the flat dataframe out into it's own variable flat_df = pd.concat(data["fhir_flat"].tolist(), ignore_index=True) + # Stops parquet conversion from stripping the time from mixed date/datetime + # columns + for date_cols in [ + x for x in flat_df.columns if "date" in x.lower() or "period" in x.lower() + ]: + flat_df[date_cols] = flat_df[date_cols].astype(str) + flat_df.to_parquet(f"{filename}.parquet") @classmethod diff --git a/tests/dummy_data/encounter_dummy_data_multi.csv b/tests/dummy_data/encounter_dummy_data_multi.csv new file mode 100644 index 0000000..d0ad22e --- /dev/null +++ b/tests/dummy_data/encounter_dummy_data_multi.csv @@ -0,0 +1,5 @@ 
+usubjid,visitid,dates_enrolment,dates_adm,dates_admdate,dates_admtime,non_encounter_field,outco_denguediag,outco_denguediag_main,outco_denguediag_class,outco_not_dengue,outco_secondiag_oth,outco_date,outco_outcome +1,10,2020-05-01,0,,,,,,,cough,,,7 +2,11,,1,2021-04-01,18:00,fish,1,,2,,,2021-04-10,1 +3,12,,1,2021-05-10,17:30,,1,,1,flu,,2021-05-15,4 +4,13,,1,2022-06-15,21:00,dolphin,0,Malaria,,,,2022-06-20,2 \ No newline at end of file diff --git a/tests/test_ingest.py b/tests/test_ingest.py index a3d7ef5..f4d16de 100644 --- a/tests/test_ingest.py +++ b/tests/test_ingest.py @@ -2,7 +2,6 @@ from fhirflat.resources.encounter import Encounter import pandas as pd from pandas.testing import assert_frame_equal -import datetime import os @@ -19,8 +18,8 @@ "diagnosis.use.code": "https://snomed.info/sct|89100005", "diagnosis.use.text": "Final diagnosis (discharge) (contextual qualifier) (qualifier value)", # noqa: E501 "subject": "2", - "actualPeriod.start": datetime.datetime(2021, 4, 1, 18, 0), - "actualPeriod.end": datetime.date(2021, 4, 10), + "actualPeriod.start": "2021-04-01 18:00:00", + "actualPeriod.end": "2021-04-10", "admission.dischargeDisposition.code": "https://snomed.info/sct|371827001", "admission.dischargeDisposition.text": "Patient discharged alive (finding)", } @@ -40,3 +39,105 @@ def test_load_data_one_to_one_single_row(): check_dtype=False, ) os.remove("encounter_ingestion_single.parquet") + + +ENCOUNTER_SINGLE_ROW_MULTI = { + "resourceType": ["Encounter", "Encounter", "Encounter", "Encounter"], + "class.code": [ + "https://snomed.info/sct|32485007", + "https://snomed.info/sct|32485007", + "https://snomed.info/sct|32485007", + "https://snomed.info/sct|32485007", + ], + "class.text": [ + "Hospital admission (procedure)", + "Hospital admission (procedure)", + "Hospital admission (procedure)", + "Hospital admission (procedure)", + ], + "reason.use.code": [ + None, + "https://snomed.info/sct|89100005", + "https://snomed.info/sct|89100005", + None, + ], + 
"reason.use.text": [ + None, + "Final diagnosis (discharge) (contextual qualifier) (qualifier value)", + "Final diagnosis (discharge) (contextual qualifier) (qualifier value)", + None, + ], + "reason.value.concept.code": [ + None, + "https://snomed.info/sct|38362002", + "https://snomed.info/sct|38362002", + None, + ], + "reason.value.concept.text": [None, "Dengue (disorder)", "Dengue (disorder)", None], + "diagnosis.condition.concept.code": [ + None, + "https://snomed.info/sct|722863008", + "https://snomed.info/sct|722862003", + None, + ], + "diagnosis.condition.concept.text": [ + None, + "Dengue with warning signs (disorder)", + "Dengue without warning signs (disorder)", + None, + ], + "diagnosis.use.code": [ + None, + "https://snomed.info/sct|89100005", + "https://snomed.info/sct|89100005", + "https://snomed.info/sct|89100005", + ], + "diagnosis.use.text": [ + None, + "Final diagnosis (discharge) (contextual qualifier) (qualifier value)", + "Final diagnosis (discharge) (contextual qualifier) (qualifier value)", + "Final diagnosis (discharge) (contextual qualifier) (qualifier value)", + ], + "subject": ["1", "2", "3", "4"], + "actualPeriod.start": [ + "2020-05-01", + "2021-04-01 18:00:00", + "2021-05-10 17:30:00", + "2022-06-15 21:00:00", + ], + "actualPeriod.end": [ + "2020-05-01", + "2021-04-10", + "2021-05-15", + "2022-06-20", + ], + "admission.dischargeDisposition.code": [ + "https://snomed.info/sct|371827001", + "https://snomed.info/sct|371827001", + "https://snomed.info/sct|419099009", + "https://snomed.info/sct|32485007", + ], + "admission.dischargeDisposition.text": [ + "Patient discharged alive (finding)", + "Patient discharged alive (finding)", + "Dead (finding)", + "Hospital admission (procedure)", + ], +} + + +def test_load_data_one_to_one_multi_row(): + load_data( + "tests/dummy_data/encounter_dummy_data_multi.csv", + "tests/dummy_data/encounter_dummy_mapping.csv", + Encounter, + "encounter_ingestion_multi", + ) + + assert_frame_equal( + 
pd.read_parquet("encounter_ingestion_multi.parquet"), + pd.DataFrame(ENCOUNTER_SINGLE_ROW_MULTI), + check_dtype=False, + check_like=True, + ) + os.remove("encounter_ingestion_multi.parquet") From 21e777b5527b04b401c3740965d892d45e3cb3a4 Mon Sep 17 00:00:00 2001 From: Pip Liggins Date: Tue, 14 May 2024 18:18:01 +0100 Subject: [PATCH 05/21] Draft one-to-many conversion for observation --- fhirflat/flat2fhir.py | 15 ++- fhirflat/ingest.py | 112 +++++++++++++++--- fhirflat/resources/observation.py | 7 +- .../dummy_data/observation_dummy_mapping.csv | 20 ++++ tests/dummy_data/vital_signs_dummy_data.csv | 4 + tests/test_ingest.py | 93 ++++++++++++++- 6 files changed, 228 insertions(+), 23 deletions(-) create mode 100644 tests/dummy_data/observation_dummy_mapping.csv create mode 100644 tests/dummy_data/vital_signs_dummy_data.csv diff --git a/fhirflat/flat2fhir.py b/fhirflat/flat2fhir.py index 237ffd3..c0d23af 100644 --- a/fhirflat/flat2fhir.py +++ b/fhirflat/flat2fhir.py @@ -18,11 +18,13 @@ def create_codeable_concept( # for reading in from ingestion pipeline if (name + ".code" and name + ".system") in old_dict: + code = old_dict[name + ".code"] + formatted_code = code if isinstance(code, str) else str(int(code)) new_dict = { "coding": [ { "system": old_dict[name + ".system"], - "code": str(int(old_dict[name + ".code"])), + "code": formatted_code, "display": old_dict[name + ".text"], } ] @@ -75,9 +77,14 @@ def createQuantity(df, group): for attribute in df.keys(): attr = attribute.split(".")[-1] if attr == "code": - system, code = df[group + ".code"].split("|") - quant["code"] = code - quant["system"] = system + if group + ".system" in df.keys(): + # reading in from ingestion pipeline + quant["code"] = df[group + ".code"] + quant["system"] = df[group + ".system"] + else: + system, code = df[group + ".code"].split("|") + quant["code"] = code + quant["system"] = system else: quant[attr] = df[group + "." 
+ attr] diff --git a/fhirflat/ingest.py b/fhirflat/ingest.py index 4e6d9a3..2233835 100644 --- a/fhirflat/ingest.py +++ b/fhirflat/ingest.py @@ -24,19 +24,39 @@ fhir-flat-like?) input data dictionary in one column, then a resource object in another. Then follow format similar to fhir_file_to_flat to create the flat representation. -3. For 1:M mappings: (PL: not sure about this) Group columns by single_resource column +3. For 1:M mappings: (PL: not sure about this) Group columns by single_resource column (to be created in the mapping file), explode the dataframe by these groups, then follow the 1:1 process. """ """ TODO +* sort out reference formatting * cope with 'if' statements - e.g. for date overwriting. * deal with duplicates/how to add multiple values to a single field - list options. +* Consider using pandarallel (https://pypi.org/project/pandarallel/) to parallelize + the apply function, particularly for one to many mappings. """ -def create_dictionary(data, map_file): +def find_field_value(row, response, map, raw_data=None): + """Returns the data for a given field, given the map.""" + if map == "": + return response + elif "+" in map: + map = map.split("+") + results = [find_field_value(row, response, m) for m in map] + results = [x for x in results if x == x] + return " ".join(results) + else: + col = map.lstrip("<").rstrip(">") + try: + return row[col] + except KeyError: + return raw_data.loc[row["index"], col] + + +def create_dictionary(data: pd.DataFrame, map_file: pd.DataFrame) -> pd.DataFrame: """ Given a data file and a single mapping file for one FHIR resource type, returns a single column dataframe with the mapped data in a FHIRflat-like @@ -65,19 +85,6 @@ def create_dict_from_row(row): and produces a fhirflat-like dictionary to initialize the resource object. 
""" - def find_field_value(row, response, map): - """Returns the data for a given field, given the map.""" - if map == "": - return response - elif "+" in map: - map = map.split("+") - results = [find_field_value(row, response, m) for m in map] - results = [x for x in results if x == x] - return " ".join(results) - else: - col = map.lstrip("<").rstrip(">") - return row[col] - result = {} for column in row.index: if column in map_df.index.get_level_values(0): @@ -134,3 +141,78 @@ def load_data(data, mapping_files, resource_type, file_name): df = create_dictionary(data, mapping_files) resource_type.ingest_to_flat(df, file_name) + + +def create_one_to_many_dictionary(data: pd.DataFrame, map_file: pd.DataFrame): + """ + Given a data file and a single mapping file for one FHIR resource type, + returns a single column dataframe with the mapped data in a FHIRflat-like + format, ready for further processing. + """ + data = pd.read_csv(data, header=0) + map_df = pd.read_csv(map_file, header=0) + + # setup the data ----------------------------------------------------------- + relevant_cols = map_df["redcap_variable"].dropna().unique() + filtered_data = data.loc[ + :, data.columns.isin(relevant_cols) + ].reset_index() # .copy() + + melted_data = filtered_data.melt(id_vars="index", var_name="column") + + # set up the mappings ------------------------------------------------------- + + # Fills the na redcap variables with the previous value + map_df["redcap_variable"] = map_df["redcap_variable"].ffill() + + # strips the text answers out of the redcap_response column + map_df["redcap_response"] = map_df["redcap_response"].apply( + lambda x: x.split(",")[0] if isinstance(x, str) else x + ) + + # Set multi-index for easier access + map_df.set_index(["redcap_variable", "redcap_response"], inplace=True) + + def create_dict_from_cell(row, full_df=data): + """ + Iterates through the columns of the row, applying the mapping to each columns + and produces a fhirflat-like dictionary to 
initialize the resource object. + """ + + column = row["column"] + response = row["value"] + if pd.notna(response): # Ensure there is a response to map + try: + # Retrieve the mapping for the given column and response + if pd.isna(map_df.loc[column].index).all(): + mapping = map_df.loc[(column, np.nan)].dropna() + else: + mapping = map_df.loc[(column, str(int(response)))].dropna() + snippet = { + k: ( + v + if "<" not in str(v) + else find_field_value(row, response, v, raw_data=full_df) + ) + for k, v in mapping.items() + } + return snippet + except KeyError: + # No mapping found for this column and response despite presence + # in mapping file + warnings.warn( + f"No mapping for column {column} response {response}", + UserWarning, + ) + return None + + # Apply the function across the DataFrame rows + melted_data["flat_dict"] = melted_data.apply(create_dict_from_cell, axis=1) + return melted_data + + +def load_data_one_to_many(data, mapping_files, resource_type, file_name): + + df = create_one_to_many_dictionary(data, mapping_files) + + resource_type.ingest_to_flat(df.dropna(), file_name) diff --git a/fhirflat/resources/observation.py b/fhirflat/resources/observation.py index 288f679..956c49d 100644 --- a/fhirflat/resources/observation.py +++ b/fhirflat/resources/observation.py @@ -92,13 +92,14 @@ def validate_extension_contents(cls, extensions): return extensions @classmethod - def cleanup(cls, data: JsonString) -> Observation: + def cleanup(cls, data: JsonString | dict, json_data=True) -> Observation: """ Load data into a dictionary-like structure, then apply resource-specific changes and unpack flattened data like codeableConcepts back into structured data. 
""" - data = orjson.loads(data) + if json_data: + data = orjson.loads(data) for field in { "encounter", @@ -108,7 +109,7 @@ def cleanup(cls, data: JsonString) -> Observation: "specimen", "device", }.intersection(data.keys()): - data[field] = {"reference": data[field]} + data[field] = {"reference": str(data[field])} # add default status back in data["status"] = "final" diff --git a/tests/dummy_data/observation_dummy_mapping.csv b/tests/dummy_data/observation_dummy_mapping.csv new file mode 100644 index 0000000..b1a9feb --- /dev/null +++ b/tests/dummy_data/observation_dummy_mapping.csv @@ -0,0 +1,20 @@ +redcap_variable,redcap_response,single_resource_group,category.system,category.code,category.text,effectiveDateTime,code.system,code.code,code.text,subject,encounter,valueQuantity.value,valueQuantity.system,valueQuantity.code,valueQuantity.unit,valueCodeableConcept.system,valueCodeableConcept.code,valueCodeableConcept.text,valueDateTime,valueInteger +vital_highesttem_c,,,http://terminology.hl7.org/CodeSystem/observation-category,vital-signs,Vital Signs,,https://loinc.org,8310-5,Body temperature,,,,http://unitsofmeasure,Cel,DegreesCelsius,,,,, +vital_highesttem_f,,,http://terminology.hl7.org/CodeSystem/observation-category,vital-signs,Vital Signs,,https://loinc.org,8310-5,Body temperature,,,,http://unitsofmeasure,degF,DegreesFarenheit,,,,, +vital_hr,,,http://terminology.hl7.org/CodeSystem/observation-category,vital-signs,Vital Signs,,https://loinc.org,8867-4,Heart rate,,,,https://snomed.info/sct,258983007,Beats/minute (qualifier value),,,,, +vital_rr,,,http://terminology.hl7.org/CodeSystem/observation-category,vital-signs,Vital Signs,,https://loinc.org,9279-1,Respiratory rate,,,,https://snomed.info/sct,258984001,Breaths/minute (qualifier value),,,,, +vital_systolicbp,,,http://terminology.hl7.org/CodeSystem/observation-category,vital-signs,Vital Signs,,https://loinc.org,8480-6,Systolic blood pressure,,,,http://unitsofmeasure,mm[Hg],MilliMetersOfMercury,,,,, 
+vital_diastolicbp,,,http://terminology.hl7.org/CodeSystem/observation-category,vital-signs,Vital Signs,,https://loinc.org,8462-4,Diastolic blood pressure,,,,http://unitsofmeasure,mm[Hg],MilliMetersOfMercury,,,,, +vital_spo2,,,http://terminology.hl7.org/CodeSystem/observation-category,vital-signs,Vital Signs,,https://loinc.org,59408-5,Oxygen saturation in Arterial blood by Pulse oximetry,,,,http://unitsofmeasure,%,Percent,,,,, +vital_fio2spo2_02110,,,http://terminology.hl7.org/CodeSystem/observation-category,vital-signs,Vital Signs,,https://loinc.org,3150-0,Inhaled oxygen concentration,,,,,,,,,,, +vital_fio2spo2_pcnt,,,http://terminology.hl7.org/CodeSystem/observation-category,vital-signs,Vital Signs,,https://loinc.org,3150-0,Inhaled oxygen concentration,,,,http://unitsofmeasure,%,Percent,,,,, +vital_capillaryr,"1, Yes",,http://terminology.hl7.org/CodeSystem/observation-category,vital-signs,Vital Signs,,https://snomed.info/sct,50427001,Increased capillary filling time (finding),,,,,,,https://snomed.info/sct,373066001,Yes,, +,"0, No",,http://terminology.hl7.org/CodeSystem/observation-category,vital-signs,Vital Signs,,https://snomed.info/sct,50427001,Increased capillary filling time (finding),,,,,,,https://snomed.info/sct,373067005,No,, +,"99, Unknown",,http://terminology.hl7.org/CodeSystem/observation-category,vital-signs,Vital Signs,,https://snomed.info/sct,50427001,Increased capillary filling time (finding),,,,,,,https://snomed.info/sct,261665006,Unknown,, +vital_avpu,"1, Alert",,http://terminology.hl7.org/CodeSystem/observation-category,vital-signs,Vital Signs,,https://snomed.info/sct,1.10444E+15,Alert Confusion Voice Pain Unresponsive scale score (observable entity),,,,,,,https://snomed.info/sct,271591004,Fully conscious (finding),, +,"5, Confusion",,http://terminology.hl7.org/CodeSystem/observation-category,vital-signs,Vital Signs,,https://snomed.info/sct,1.10444E+15,Alert Confusion Voice Pain Unresponsive scale score (observable 
entity),,,,,,,https://snomed.info/sct,40917007,Clouded consciousness (finding),, +,"2, Verbal",,http://terminology.hl7.org/CodeSystem/observation-category,vital-signs,Vital Signs,,https://snomed.info/sct,1.10444E+15,Alert Confusion Voice Pain Unresponsive scale score (observable entity),,,,,,,https://snomed.info/sct,300202002,Responds to voice (finding),, +,"3, Pain",,http://terminology.hl7.org/CodeSystem/observation-category,vital-signs,Vital Signs,,https://snomed.info/sct,1.10444E+15,Alert Confusion Voice Pain Unresponsive scale score (observable entity),,,,,,,https://snomed.info/sct,450847001,Responds to pain (finding),, +,"4, Unresponsive",,http://terminology.hl7.org/CodeSystem/observation-category,vital-signs,Vital Signs,,https://snomed.info/sct,1.10444E+15,Alert Confusion Voice Pain Unresponsive scale score (observable entity),,,,,,,https://snomed.info/sct,422768004,Unresponsive (finding),, +vital_gcs,,,http://terminology.hl7.org/CodeSystem/observation-category,vital-signs,Vital Signs,,https://snomed.info/sct,9269-2,Glasgow coma score total,,,,,,,,,,, +vital_urineflow,,,http://terminology.hl7.org/CodeSystem/observation-category,vital-signs,Vital Signs,,https://loinc.org,9192-6,Urine output 24 hour,,,,https://snomed.info/sct,258861009,Millilitre/24 hours (qualifier value),,,,, \ No newline at end of file diff --git a/tests/dummy_data/vital_signs_dummy_data.csv b/tests/dummy_data/vital_signs_dummy_data.csv new file mode 100644 index 0000000..a361b93 --- /dev/null +++ b/tests/dummy_data/vital_signs_dummy_data.csv @@ -0,0 +1,4 @@ +usubjid,visitid,daily_date,vital_highesttem_c,vital_hr,vital_rr,vital_systolicbp,vital_diastolicbp,vital_spo2,vital_fio2spo2_02110,vital_fio2spo2_pcnt,vital_capillaryr,vital_avpu,vital_gcs,vital_urineflow +1,10,2020-01-01,36.2,120,30,70,120,5,,75,1,1,1,150 +2,11,2021-02-02,37,100,40,80,130,6,10,85,0,2,1,200 +3,12,2022-03-03,35.5,70,50,90,140,7,,95,0,3,1, \ No newline at end of file diff --git a/tests/test_ingest.py 
b/tests/test_ingest.py index f4d16de..c6b727f 100644 --- a/tests/test_ingest.py +++ b/tests/test_ingest.py @@ -1,8 +1,10 @@ -from fhirflat.ingest import load_data +from fhirflat.ingest import load_data, load_data_one_to_many from fhirflat.resources.encounter import Encounter +from fhirflat.resources.observation import Observation import pandas as pd from pandas.testing import assert_frame_equal import os +from decimal import Decimal ENCOUNTER_SINGLE_ROW_FLAT = { @@ -141,3 +143,92 @@ def test_load_data_one_to_one_multi_row(): check_like=True, ) os.remove("encounter_ingestion_multi.parquet") + + +OBS_FLAT = { + "resourceType": [ + "Observation", + "Observation", + "Observation", + "Observation", + "Observation", + ], + "category.code": [ + "http://terminology.hl7.org/CodeSystem/observation-category|vital-signs", + "http://terminology.hl7.org/CodeSystem/observation-category|vital-signs", + "http://terminology.hl7.org/CodeSystem/observation-category|vital-signs", + "http://terminology.hl7.org/CodeSystem/observation-category|vital-signs", + "http://terminology.hl7.org/CodeSystem/observation-category|vital-signs", + ], + "category.text": [ + "Vital Signs", + "Vital Signs", + "Vital Signs", + "Vital Signs", + "Vital Signs", + ], + "effectiveDateTime": [ + "2020-01-01", + "2021-02-02", + "2022-03-03", + "2020-01-01", + "2021-02-02", + ], + "code.code": [ + "https://loinc.org|8310-5", + "https://loinc.org|8310-5", + "https://loinc.org|8310-5", + "https://loinc.org|8867-4", + "https://loinc.org|8867-4", + ], + "code.text": [ + "Body temperature", + "Body temperature", + "Body temperature", + "Heart rate", + "Heart rate", + ], + "subject": ["1", "2", "3", "1", "2"], + "encounter": ["10", "11", "12", "10", "11"], + "valueQuantity.value": [Decimal("36.2"), 37.0, 35.5, 120.0, 100.0], + "valueQuantity.unit": [ + "DegreesCelsius", + "DegreesCelsius", + "DegreesCelsius", + "Beats/minute (qualifier value)", + "Beats/minute (qualifier value)", + ], + "valueQuantity.code": [ + 
"http://unitsofmeasure|Cel", + "http://unitsofmeasure|Cel", + "http://unitsofmeasure|Cel", + "https://snomed.info/sct|258983007", + "https://snomed.info/sct|258983007", + ], + "valueCodeableConcept.code": [None, None, None, None, None], + "valueCodeableConcept.text": [None, None, None, None, None], + "valueInteger": [None, None, None, None, None], +} + + +def test_load_data_one_to_many_multi_row(): + load_data_one_to_many( + "tests/dummy_data/vital_signs_dummy_data.csv", + "tests/dummy_data/observation_dummy_mapping.csv", + Observation, + "observation_ingestion", + ) + + full_df = pd.read_parquet("observation_ingestion.parquet") + + assert len(full_df) == 33 + + df_head = full_df.head(5) + + assert_frame_equal( + df_head, + pd.DataFrame(OBS_FLAT), + check_dtype=False, + check_like=True, + ) + os.remove("observation_ingestion.parquet") From cfadee9bfec6f4cc28baa5cd40b32c64f62140e2 Mon Sep 17 00:00:00 2001 From: Pip Liggins Date: Wed, 15 May 2024 12:29:59 +0100 Subject: [PATCH 06/21] Update overwritten cleanup() func in remaining classes --- fhirflat/resources/condition.py | 5 ++-- fhirflat/resources/encounter.py | 4 +-- fhirflat/resources/immunization.py | 12 ++++---- fhirflat/resources/location.py | 5 ++-- .../resources/medicationadministration.py | 30 +++++++++++-------- fhirflat/resources/medicationstatement.py | 24 ++++++++------- fhirflat/resources/organization.py | 5 ++-- fhirflat/resources/patient.py | 5 ++-- fhirflat/resources/procedure.py | 5 ++-- fhirflat/resources/researchsubject.py | 12 ++++---- fhirflat/resources/specimen.py | 28 +++++++++-------- 11 files changed, 79 insertions(+), 56 deletions(-) diff --git a/fhirflat/resources/condition.py b/fhirflat/resources/condition.py index 32a3b6a..41fbfd7 100644 --- a/fhirflat/resources/condition.py +++ b/fhirflat/resources/condition.py @@ -42,13 +42,14 @@ def flat_descriptions(cls) -> dict[str, str]: return descrip @classmethod - def cleanup(cls, data: JsonString) -> Condition: + def cleanup(cls, data: 
JsonString | dict, json_data=True) -> Condition: """ Load data into a dictionary-like structure, then apply resource-specific changes and unpack flattened data like codeableConcepts back into structured data. """ - data = orjson.loads(data) + if json_data: + data = orjson.loads(data) data["encounter"] = {"reference": data["encounter"]} data["subject"] = {"reference": data["subject"]} diff --git a/fhirflat/resources/encounter.py b/fhirflat/resources/encounter.py index d0cc054..662aea7 100644 --- a/fhirflat/resources/encounter.py +++ b/fhirflat/resources/encounter.py @@ -24,8 +24,8 @@ class Encounter(_Encounter, FHIRFlatBase): title="List of `Extension` items (represented as `dict` in JSON)", description=( """ - Contains the Global.health 'eventTiming' and 'relativePeriod' extensions, and allows - extensions from other implementations to be included. + Contains the Global.health 'eventTiming' and 'relativePeriod' extensions, + and allows extensions from other implementations to be included. """ ), # if property is element of this resource. diff --git a/fhirflat/resources/immunization.py b/fhirflat/resources/immunization.py index 4bf2dfe..35e5b03 100644 --- a/fhirflat/resources/immunization.py +++ b/fhirflat/resources/immunization.py @@ -65,17 +65,19 @@ def validate_extension_contents(cls, extensions): return extensions @classmethod - def cleanup(cls, data: JsonString) -> Immunization: + def cleanup(cls, data: JsonString | dict, json_data=True) -> Immunization: """ Load data into a dictionary-like structure, then apply resource-specific changes and unpack flattened data like codeableConcepts back into structured data. 
""" - data = orjson.loads(data) + if json_data: + data = orjson.loads(data) - for field in ({"patient", "encounter", "location"} | { - x for x in data.keys() if x.endswith(".reference") - }).intersection(data.keys()): + for field in ( + {"patient", "encounter", "location"} + | {x for x in data.keys() if x.endswith(".reference")} + ).intersection(data.keys()): data[field] = {"reference": data[field]} # add default status back in diff --git a/fhirflat/resources/location.py b/fhirflat/resources/location.py index 29b917e..e999301 100644 --- a/fhirflat/resources/location.py +++ b/fhirflat/resources/location.py @@ -21,13 +21,14 @@ class Location(_Location, FHIRFlatBase): ) @classmethod - def cleanup(cls, data: JsonString) -> Location: + def cleanup(cls, data: JsonString | dict, json_data=True) -> Location: """ Load data into a dictionary-like structure, then apply resource-specific changes and unpack flattened data like codeableConcepts back into structured data. """ - data = orjson.loads(data) + if json_data: + data = orjson.loads(data) for field in { "managingOrganization", diff --git a/fhirflat/resources/medicationadministration.py b/fhirflat/resources/medicationadministration.py index 136a82f..c3ed488 100644 --- a/fhirflat/resources/medicationadministration.py +++ b/fhirflat/resources/medicationadministration.py @@ -26,23 +26,29 @@ class MedicationAdministration(_MedicationAdministration, FHIRFlatBase): flat_defaults: ClassVar[list[str]] = FHIRFlatBase.flat_defaults + ["status"] @classmethod - def cleanup(cls, data: JsonString) -> MedicationAdministration: + def cleanup( + cls, data: JsonString | dict, json_data=True + ) -> MedicationAdministration: """ Load data into a dictionary-like structure, then apply resource-specific changes and unpack flattened data like codeableConcepts back into structured data. 
""" - data = orjson.loads(data) - - for field in ({ - "basedOn", - "partOf", - "subject", - "encounter", - "supportingInformation", - "request", - "eventHistory", - } | {x for x in data.keys() if x.endswith(".reference")}).intersection(data.keys()): + if json_data: + data = orjson.loads(data) + + for field in ( + { + "basedOn", + "partOf", + "subject", + "encounter", + "supportingInformation", + "request", + "eventHistory", + } + | {x for x in data.keys() if x.endswith(".reference")} + ).intersection(data.keys()): data[field] = {"reference": data[field]} # add default status back in diff --git a/fhirflat/resources/medicationstatement.py b/fhirflat/resources/medicationstatement.py index f208c5b..824971e 100644 --- a/fhirflat/resources/medicationstatement.py +++ b/fhirflat/resources/medicationstatement.py @@ -25,21 +25,25 @@ class MedicationStatement(_MedicationStatement, FHIRFlatBase): flat_defaults: ClassVar[list[str]] = FHIRFlatBase.flat_defaults + ["status"] @classmethod - def cleanup(cls, data: JsonString) -> MedicationStatement: + def cleanup(cls, data: JsonString | dict, json_data=True) -> MedicationStatement: """ Load data into a dictionary-like structure, then apply resource-specific changes and unpack flattened data like codeableConcepts back into structured data. 
""" - data = orjson.loads(data) - - for field in ({ - "partOf", - "subject", - "encounter", - "derivedFrom", - "relatedClinicalInformation", - } | {x for x in data.keys() if x.endswith(".reference")}).intersection(data.keys()): + if json_data: + data = orjson.loads(data) + + for field in ( + { + "partOf", + "subject", + "encounter", + "derivedFrom", + "relatedClinicalInformation", + } + | {x for x in data.keys() if x.endswith(".reference")} + ).intersection(data.keys()): data[field] = {"reference": data[field]} # add default status back in diff --git a/fhirflat/resources/organization.py b/fhirflat/resources/organization.py index f8b0aa7..47cc7c8 100644 --- a/fhirflat/resources/organization.py +++ b/fhirflat/resources/organization.py @@ -20,13 +20,14 @@ class Organization(_Organization, FHIRFlatBase): ) @classmethod - def cleanup(cls, data: JsonString) -> Organization: + def cleanup(cls, data: JsonString | dict, json_data=True) -> Organization: """ Load data into a dictionary-like structure, then apply resource-specific changes and unpack flattened data like codeableConcepts back into structured data. 
""" - data = orjson.loads(data) + if json_data: + data = orjson.loads(data) for field in { "partOf", diff --git a/fhirflat/resources/patient.py b/fhirflat/resources/patient.py index db465cb..1a130d5 100644 --- a/fhirflat/resources/patient.py +++ b/fhirflat/resources/patient.py @@ -69,9 +69,10 @@ def flat_descriptions(cls) -> dict[str, str]: return descrip @classmethod - def cleanup(cls, data: JsonString) -> Patient: + def cleanup(cls, data: JsonString | dict, json_data=True) -> Patient: # Load the data and apply resource-specific changes - data = orjson.loads(data) + if json_data: + data = orjson.loads(data) # # Strip time from the birthDate if "birthDate" in data: diff --git a/fhirflat/resources/procedure.py b/fhirflat/resources/procedure.py index 2dc0989..179f028 100644 --- a/fhirflat/resources/procedure.py +++ b/fhirflat/resources/procedure.py @@ -81,13 +81,14 @@ def validate_extension_contents(cls, extensions): return extensions @classmethod - def cleanup(cls, data: JsonString) -> Procedure: + def cleanup(cls, data: JsonString | dict, json_data=True) -> Procedure: """ Load data into a dictionary-like structure, then apply resource-specific changes and unpack flattened data like codeableConcepts back into structured data. 
""" - data = orjson.loads(data) + if json_data: + data = orjson.loads(data) for field in { "partOf", diff --git a/fhirflat/resources/researchsubject.py b/fhirflat/resources/researchsubject.py index c24bad6..d888321 100644 --- a/fhirflat/resources/researchsubject.py +++ b/fhirflat/resources/researchsubject.py @@ -21,17 +21,19 @@ class ResearchSubject(_ResearchSubject, FHIRFlatBase): flat_defaults: ClassVar[list[str]] = FHIRFlatBase.flat_defaults + ["status"] @classmethod - def cleanup(cls, data: JsonString) -> ResearchSubject: + def cleanup(cls, data: JsonString | dict, json_data=True) -> ResearchSubject: """ Load data into a dictionary-like structure, then apply resource-specific changes and unpack flattened data like codeableConcepts back into structured data. """ - data = orjson.loads(data) + if json_data: + data = orjson.loads(data) - for field in ({"study", "subject", "consent"} | { - x for x in data.keys() if x.endswith(".reference") - }).intersection(data.keys()): + for field in ( + {"study", "subject", "consent"} + | {x for x in data.keys() if x.endswith(".reference")} + ).intersection(data.keys()): data[field] = {"reference": data[field]} # add default status back in diff --git a/fhirflat/resources/specimen.py b/fhirflat/resources/specimen.py index 3ef491a..5661274 100644 --- a/fhirflat/resources/specimen.py +++ b/fhirflat/resources/specimen.py @@ -21,23 +21,27 @@ class Specimen(_Specimen, FHIRFlatBase): ) @classmethod - def cleanup(cls, data: JsonString) -> Specimen: + def cleanup(cls, data: JsonString | dict, json_data=True) -> Specimen: """ Load data into a dictionary-like structure, then apply resource-specific changes and unpack flattened data like codeableConcepts back into structured data. 
""" - data = orjson.loads(data) - - for field in ({ - "subject", - "parent", - "request", - "collection.collector", - "collection.procedure", - "container.device", - "container.location", - } | {x for x in data.keys() if x.endswith(".reference")}).intersection(data.keys()): + if json_data: + data = orjson.loads(data) + + for field in ( + { + "subject", + "parent", + "request", + "collection.collector", + "collection.procedure", + "container.device", + "container.location", + } + | {x for x in data.keys() if x.endswith(".reference")} + ).intersection(data.keys()): data[field] = {"reference": data[field]} data = expand_concepts(data, cls) From b591be984642ad58df866767ca9b5bd85f5c202e Mon Sep 17 00:00:00 2001 From: Pip Liggins Date: Wed, 15 May 2024 12:31:40 +0100 Subject: [PATCH 07/21] Start condensing ingestion code --- fhirflat/ingest.py | 265 ++++++++++++++++++------------------- fhirflat/resources/base.py | 12 +- 2 files changed, 131 insertions(+), 146 deletions(-) diff --git a/fhirflat/ingest.py b/fhirflat/ingest.py index 2233835..cce366a 100644 --- a/fhirflat/ingest.py +++ b/fhirflat/ingest.py @@ -6,7 +6,7 @@ mappings. PL: Actually, maybe rather than the mappings it's either a file or a dictionary showing the location of each mapping file (one per resource type). -TODO: Eventually, this ahould link to a google sheet file that contains the mappings +TODO: Eventually, this should link to a google sheet file that contains the mappings """ import pandas as pd @@ -16,19 +16,6 @@ # 1:1 (single row, single resource) mapping: Patient, Encounter # 1:M (single row, multiple resources) mapping: Observation, Condition, Procedure, ... -""" -1. Create one input-data dataframe per resource type, using the column names from -the mapping file - -2. For 1:1 mappings: use an apply function to create a fhir-like (or maybe -fhir-flat-like?) input data dictionary in one column, then a resource object in another. 
-Then follow format similar to fhir_file_to_flat to create the flat representation. - -3. For 1:M mappings: (PL: not sure about this) Group columns by single_resource column -(to be created in the mapping file), explode the dataframe by these groups, then follow -the 1:1 process. -""" - """ TODO * sort out reference formatting @@ -39,126 +26,144 @@ """ -def find_field_value(row, response, map, raw_data=None): - """Returns the data for a given field, given the map.""" - if map == "": +def find_field_value(row, response, mapp, raw_data=None): + """ + Returns the data for a given field, given the mapping. + For one to many resources the raw data is provided to allow for searching for other + fields than in the melted data. + """ + if mapp == "": return response - elif "+" in map: - map = map.split("+") - results = [find_field_value(row, response, m) for m in map] + elif "+" in mapp: + mapp = mapp.split("+") + results = [find_field_value(row, response, m) for m in mapp] results = [x for x in results if x == x] return " ".join(results) else: - col = map.lstrip("<").rstrip(">") + col = mapp.lstrip("<").rstrip(">") try: return row[col] except KeyError: return raw_data.loc[row["index"], col] -def create_dictionary(data: pd.DataFrame, map_file: pd.DataFrame) -> pd.DataFrame: +def create_dict_from_row(row, map_df): """ - Given a data file and a single mapping file for one FHIR resource type, - returns a single column dataframe with the mapped data in a FHIRflat-like - format, ready for further processing. + Iterates through the columns of the row, applying the mapping to each columns + and produces a fhirflat-like dictionary to initialize the resource object. 
""" - data = pd.read_csv(data, header=0) - map_df = pd.read_csv(map_file, header=0) - - filtered_data = data[map_df["redcap_variable"].dropna().unique()].copy() - - # Fills the na redcap variables with the previous value - map_df["redcap_variable"] = map_df["redcap_variable"].ffill() - - # strips the text answers out of the redcap_response column - map_df["redcap_response"] = map_df["redcap_response"].apply( - lambda x: x.split(",")[0] if isinstance(x, str) else x - ) - - # Set multi-index for easier access - map_df.set_index(["redcap_variable", "redcap_response"], inplace=True) - - def create_dict_from_row(row): - """ - Iterates through the columns of the row, applying the mapping to each columns - and produces a fhirflat-like dictionary to initialize the resource object. - """ - - result = {} - for column in row.index: - if column in map_df.index.get_level_values(0): - response = row[column] - if pd.notna(response): # Ensure there is a response to map - try: - # Retrieve the mapping for the given column and response - if pd.isna(map_df.loc[column].index).all(): - mapping = map_df.loc[(column, np.nan)].dropna() - else: - mapping = map_df.loc[(column, str(int(response)))].dropna() - snippet = { - k: ( - v - if "<" not in str(v) - else find_field_value(row, response, v) - ) - for k, v in mapping.items() - } - except KeyError: - # No mapping found for this column and response despite presence - # in mapping file - warnings.warn( - f"No mapping for column {column} response {response}", - UserWarning, + result = {} + for column in row.index: + if column in map_df.index.get_level_values(0): + response = row[column] + if pd.notna(response): # Ensure there is a response to map + try: + # Retrieve the mapping for the given column and response + if pd.isna(map_df.loc[column].index).all(): + mapping = map_df.loc[(column, np.nan)].dropna() + else: + mapping = map_df.loc[(column, str(int(response)))].dropna() + snippet = { + k: ( + v + if "<" not in str(v) + else 
find_field_value(row, response, v) ) - continue - else: + for k, v in mapping.items() + } + except KeyError: + # No mapping found for this column and response despite presence + # in mapping file + warnings.warn( + f"No mapping for column {column} response {response}", + UserWarning, + ) continue else: - raise ValueError(f"Column {column} not found in mapping file") - duplicate_keys = set(result.keys()).intersection(snippet.keys()) - if not duplicate_keys: - result = result | snippet + continue + else: + raise ValueError(f"Column {column} not found in mapping file") + duplicate_keys = set(result.keys()).intersection(snippet.keys()) + if not duplicate_keys: + result = result | snippet + else: + if all( + result[key] == snippet[key] for key in duplicate_keys + ): # Ignore duplicates if they are the same + continue else: - if all( - result[key] == snippet[key] for key in duplicate_keys - ): # Ignore duplicates if they are the same - continue - else: - raise ValueError( - "Duplicate keys in mapping:" - f" {set(result.keys()).intersection(snippet.keys())}" - ) - return result - - # Apply the function across the DataFrame rows - filtered_data["flat_dict"] = filtered_data.apply(create_dict_from_row, axis=1) - return filtered_data - - -def load_data(data, mapping_files, resource_type, file_name): - - df = create_dictionary(data, mapping_files) + raise ValueError( + "Duplicate keys in mapping:" + f" {set(result.keys()).intersection(snippet.keys())}" + ) + return result - resource_type.ingest_to_flat(df, file_name) +def create_dict_from_cell(row, full_df, map_df): + """ + Iterates through the columns of the row, applying the mapping to each columns + and produces a fhirflat-like dictionary to initialize the resource object. 
+ """ -def create_one_to_many_dictionary(data: pd.DataFrame, map_file: pd.DataFrame): + column = row["column"] + response = row["value"] + if pd.notna(response): # Ensure there is a response to map + try: + # Retrieve the mapping for the given column and response + if pd.isna(map_df.loc[column].index).all(): + mapping = map_df.loc[(column, np.nan)].dropna() + else: + mapping = map_df.loc[(column, str(int(response)))].dropna() + snippet = { + k: ( + v + if "<" not in str(v) + else find_field_value(row, response, v, raw_data=full_df) + ) + for k, v in mapping.items() + } + return snippet + except KeyError: + # No mapping found for this column and response despite presence + # in mapping file + warnings.warn( + f"No mapping for column {column} response {response}", + UserWarning, + ) + return None + + +def create_dictionary( + data: pd.DataFrame, map_file: pd.DataFrame, one_to_one=False +) -> pd.DataFrame: """ Given a data file and a single mapping file for one FHIR resource type, returns a single column dataframe with the mapped data in a FHIRflat-like format, ready for further processing. + + Parameters + ---------- + data: pd.DataFrame + The data file containing the clinical data. + map_file: pd.DataFrame + The mapping file containing the mapping of the clinical data to the FHIR + resource. + one_to_one: bool + Whether the resource should be mapped as one-to-one or one-to-many. 
""" + data = pd.read_csv(data, header=0) map_df = pd.read_csv(map_file, header=0) # setup the data ----------------------------------------------------------- relevant_cols = map_df["redcap_variable"].dropna().unique() - filtered_data = data.loc[ - :, data.columns.isin(relevant_cols) - ].reset_index() # .copy() - melted_data = filtered_data.melt(id_vars="index", var_name="column") + if one_to_one: + filtered_data = data[relevant_cols].copy() + else: + filtered_data = data.loc[:, data.columns.isin(relevant_cols)].reset_index() + melted_data = filtered_data.melt(id_vars="index", var_name="column") # set up the mappings ------------------------------------------------------- @@ -173,46 +178,28 @@ def create_one_to_many_dictionary(data: pd.DataFrame, map_file: pd.DataFrame): # Set multi-index for easier access map_df.set_index(["redcap_variable", "redcap_response"], inplace=True) - def create_dict_from_cell(row, full_df=data): - """ - Iterates through the columns of the row, applying the mapping to each columns - and produces a fhirflat-like dictionary to initialize the resource object. 
- """ - - column = row["column"] - response = row["value"] - if pd.notna(response): # Ensure there is a response to map - try: - # Retrieve the mapping for the given column and response - if pd.isna(map_df.loc[column].index).all(): - mapping = map_df.loc[(column, np.nan)].dropna() - else: - mapping = map_df.loc[(column, str(int(response)))].dropna() - snippet = { - k: ( - v - if "<" not in str(v) - else find_field_value(row, response, v, raw_data=full_df) - ) - for k, v in mapping.items() - } - return snippet - except KeyError: - # No mapping found for this column and response despite presence - # in mapping file - warnings.warn( - f"No mapping for column {column} response {response}", - UserWarning, - ) - return None + # Generate the flat_like dictionary + if one_to_one: + filtered_data["flat_dict"] = filtered_data.apply( + create_dict_from_row, args=[map_df], axis=1 + ) + return filtered_data + else: + melted_data["flat_dict"] = melted_data.apply( + create_dict_from_cell, args=[data, map_df], axis=1 + ) + return melted_data["flat_dict"].to_frame() + + +def load_data(data, mapping_files, resource_type, file_name): + + df = create_dictionary(data, mapping_files, one_to_one=True) - # Apply the function across the DataFrame rows - melted_data["flat_dict"] = melted_data.apply(create_dict_from_cell, axis=1) - return melted_data + resource_type.ingest_to_flat(df, file_name) def load_data_one_to_many(data, mapping_files, resource_type, file_name): - df = create_one_to_many_dictionary(data, mapping_files) + df = create_dictionary(data, mapping_files, one_to_one=False) resource_type.ingest_to_flat(df.dropna(), file_name) diff --git a/fhirflat/resources/base.py b/fhirflat/resources/base.py index 516bcc6..2ccc53e 100644 --- a/fhirflat/resources/base.py +++ b/fhirflat/resources/base.py @@ -91,10 +91,7 @@ def ingest_to_flat(cls, data: pd.DataFrame, filename: str): lambda x: cls.cleanup(x, json_data=False) ) - data["fhir_flat"] = data["fhir"].apply(lambda x: x.to_flat()) - - # 
get the flat dataframe out into it's own variable - flat_df = pd.concat(data["fhir_flat"].tolist(), ignore_index=True) + flat_df = data["fhir"].apply(lambda x: x.to_flat()) # Stops parquet conversion from stripping the time from mixed date/datetime # columns @@ -168,10 +165,10 @@ def fhir_file_to_flat(cls, source_file: str, output_name: str | None = None): df.to_parquet(output_name) - def to_flat(self, filename: str | None = None) -> None: + def to_flat(self, filename: str | None = None) -> None | pd.Series: """ Generates a FHIRflat parquet file from the resource. - If no file name is provided, returns the pandas dataframe. + If no file name is provided, returns a pandas Series. Parameters ---------- @@ -196,4 +193,5 @@ def to_flat(self, filename: str | None = None) -> None: if filename: flat_df.to_parquet(filename) else: - return flat_df + assert flat_df.shape[0] == 1 + return flat_df.loc[0] From c14264319b73987d24d57464793903b168617984 Mon Sep 17 00:00:00 2001 From: Pip Liggins Date: Wed, 15 May 2024 13:46:59 +0100 Subject: [PATCH 08/21] Create generic data conversion function for users --- fhirflat/ingest.py | 52 ++++++++++++++++++++++++ tests/dummy_data/combined_dummy_data.csv | 5 +++ tests/test_ingest.py | 37 ++++++++++++++++- 3 files changed, 93 insertions(+), 1 deletion(-) create mode 100644 tests/dummy_data/combined_dummy_data.csv diff --git a/fhirflat/ingest.py b/fhirflat/ingest.py index cce366a..e1cabe9 100644 --- a/fhirflat/ingest.py +++ b/fhirflat/ingest.py @@ -12,6 +12,7 @@ import pandas as pd import numpy as np import warnings +import os # 1:1 (single row, single resource) mapping: Patient, Encounter # 1:M (single row, multiple resources) mapping: Observation, Condition, Procedure, ... 
@@ -203,3 +204,54 @@ def load_data_one_to_many(data, mapping_files, resource_type, file_name): df = create_dictionary(data, mapping_files, one_to_one=False) resource_type.ingest_to_flat(df.dropna(), file_name) + + +def convert_data_to_flat( + data: str, + folder_name: str, + mapping_files_types: tuple[dict, dict] | None = None, + sheet_id: str | None = None, +): + """ + Takes raw clinical data (currently assumed to be a one-row-per-patient format like + RedCap exports) and produces a folder of FHIRflat files, one per resource. Takes + either local mapping files, or a Google Sheet ID containing the mapping files. + + Parameters + ---------- + data: str + The path to the raw clinical data file. + folder_name: str + The name of the folder to store the FHIRflat files. + mapping_files_types: tuple[dict, dict] | None + A tuple containing two dictionaries, one with the mapping files for each + resource type and one with the mapping type (either one-to-one or one-to-many) + for each resource type. + sheet_id: str | None + The Google Sheet ID containing the mapping files. The first sheet must contain + the mapping types - one column listing the resource name, and another describing + whether the mapping is one-to-one or one-to-many. The subsequent sheets must + be named by resource, and contain the mapping for that resource. 
+ """ + + if not mapping_files_types and not sheet_id: + raise TypeError("Either mapping_files_types or sheet_id must be provided") + + if not os.path.exists(folder_name): + os.makedirs(folder_name) + + if mapping_files_types: + mappings, types = mapping_files_types + for resource, map_file in mappings.items(): + t = types[resource.__name__] + if t == "one-to-one": + df = create_dictionary(data, map_file, one_to_one=True) + elif t == "one-to-many": + df = create_dictionary(data, map_file, one_to_one=False) + df = df.dropna().reset_index(drop=True) + else: + raise ValueError(f"Unknown mapping type {t}") + + resource.ingest_to_flat(df, folder_name + "/" + resource.__name__.lower()) + else: + pass diff --git a/tests/dummy_data/combined_dummy_data.csv b/tests/dummy_data/combined_dummy_data.csv new file mode 100644 index 0000000..94449ef --- /dev/null +++ b/tests/dummy_data/combined_dummy_data.csv @@ -0,0 +1,5 @@ +usubjid,visitid,dates_enrolment,dates_adm,dates_admdate,dates_admtime,non_encounter_field,outco_denguediag,outco_denguediag_main,outco_denguediag_class,outco_not_dengue,outco_secondiag_oth,outco_date,outco_outcome,daily_date,vital_highesttem_c,vital_hr,vital_rr,vital_systolicbp,vital_diastolicbp,vital_spo2,vital_fio2spo2_02110,vital_fio2spo2_pcnt,vital_capillaryr,vital_avpu,vital_gcs,vital_urineflow +1,10,2020-05-01,0,,,,,,,cough,,,7,2020-01-01,36.2,120,30,70,120,5,,75,1,1,1,150 +2,11,,1,2021-04-01,18:00,fish,1,,2,,,2021-04-10,1,2021-02-02,37,100,40,80,130,6,10,85,0,2,1,200 +3,12,,1,2021-05-10,17:30,,1,,1,flu,,2021-05-15,4,2022-03-03,35.5,70,50,90,140,7,,95,0,3,1, +4,13,,1,2022-06-15,21:00,dolphin,0,Malaria,,,,2022-06-20,2,,,,,,,,,,,,, \ No newline at end of file diff --git a/tests/test_ingest.py b/tests/test_ingest.py index c6b727f..ac6d11a 100644 --- a/tests/test_ingest.py +++ b/tests/test_ingest.py @@ -1,4 +1,4 @@ -from fhirflat.ingest import load_data, load_data_one_to_many +from fhirflat.ingest import load_data, load_data_one_to_many, convert_data_to_flat 
from fhirflat.resources.encounter import Encounter from fhirflat.resources.observation import Observation import pandas as pd @@ -232,3 +232,38 @@ def test_load_data_one_to_many_multi_row(): check_like=True, ) os.remove("observation_ingestion.parquet") + + +def test_convert_data_to_flat_local_mapping(): + mappings = { + Encounter: "tests/dummy_data/encounter_dummy_mapping.csv", + Observation: "tests/dummy_data/observation_dummy_mapping.csv", + } + resource_types = {"Encounter": "one-to-one", "Observation": "one-to-many"} + + convert_data_to_flat( + "tests/dummy_data/combined_dummy_data.csv", + mapping_files_types=(mappings, resource_types), + folder_name="tests/ingestion_output", + ) + + encounter_df = pd.read_parquet("tests/ingestion_output/encounter.parquet") + obs_df = pd.read_parquet("tests/ingestion_output/observation.parquet") + + assert_frame_equal( + encounter_df, + pd.DataFrame(ENCOUNTER_SINGLE_ROW_MULTI), + check_dtype=False, + check_like=True, + ) + + assert len(obs_df) == 33 + + obs_df_head = obs_df.head(5) + + assert_frame_equal( + obs_df_head, + pd.DataFrame(OBS_FLAT), + check_dtype=False, + check_like=True, + ) From 6f9f4ad021d4711281742ccec5fdaa3bfa65bd4c Mon Sep 17 00:00:00 2001 From: Pip Liggins Date: Wed, 15 May 2024 13:58:23 +0100 Subject: [PATCH 09/21] Remove load_data functions --- fhirflat/ingest.py | 18 ------------------ tests/test_ingest.py | 32 +++++++++++++++++++++----------- 2 files changed, 21 insertions(+), 29 deletions(-) diff --git a/fhirflat/ingest.py b/fhirflat/ingest.py index e1cabe9..2728d0e 100644 --- a/fhirflat/ingest.py +++ b/fhirflat/ingest.py @@ -2,10 +2,6 @@ Stores the main functions for converting clinical data (initally from RedCap-ARCH) to FHIRflat. -Assumes two files are provided: one with the clinical data and one containing the -mappings. PL: Actually, maybe rather than the mappings it's either a file or a -dictionary showing the location of each mapping file (one per resource type). 
- TODO: Eventually, this should link to a google sheet file that contains the mappings """ @@ -192,20 +188,6 @@ def create_dictionary( return melted_data["flat_dict"].to_frame() -def load_data(data, mapping_files, resource_type, file_name): - - df = create_dictionary(data, mapping_files, one_to_one=True) - - resource_type.ingest_to_flat(df, file_name) - - -def load_data_one_to_many(data, mapping_files, resource_type, file_name): - - df = create_dictionary(data, mapping_files, one_to_one=False) - - resource_type.ingest_to_flat(df.dropna(), file_name) - - def convert_data_to_flat( data: str, folder_name: str, diff --git a/tests/test_ingest.py b/tests/test_ingest.py index ac6d11a..c146a45 100644 --- a/tests/test_ingest.py +++ b/tests/test_ingest.py @@ -1,9 +1,13 @@ -from fhirflat.ingest import load_data, load_data_one_to_many, convert_data_to_flat +from fhirflat.ingest import ( + create_dictionary, + convert_data_to_flat, +) from fhirflat.resources.encounter import Encounter from fhirflat.resources.observation import Observation import pandas as pd from pandas.testing import assert_frame_equal import os +import shutil from decimal import Decimal @@ -28,13 +32,14 @@ def test_load_data_one_to_one_single_row(): - load_data( + df = create_dictionary( "tests/dummy_data/encounter_dummy_data_single.csv", "tests/dummy_data/encounter_dummy_mapping.csv", - Encounter, - "encounter_ingestion_single", + one_to_one=True, ) + Encounter.ingest_to_flat(df, "encounter_ingestion_single") + assert_frame_equal( pd.read_parquet("encounter_ingestion_single.parquet"), pd.DataFrame([ENCOUNTER_SINGLE_ROW_FLAT], index=[0]), @@ -129,13 +134,14 @@ def test_load_data_one_to_one_single_row(): def test_load_data_one_to_one_multi_row(): - load_data( + df = create_dictionary( "tests/dummy_data/encounter_dummy_data_multi.csv", "tests/dummy_data/encounter_dummy_mapping.csv", - Encounter, - "encounter_ingestion_multi", + one_to_one=True, ) + Encounter.ingest_to_flat(df, "encounter_ingestion_multi") + 
assert_frame_equal( pd.read_parquet("encounter_ingestion_multi.parquet"), pd.DataFrame(ENCOUNTER_SINGLE_ROW_MULTI), @@ -212,13 +218,14 @@ def test_load_data_one_to_one_multi_row(): def test_load_data_one_to_many_multi_row(): - load_data_one_to_many( + df = create_dictionary( "tests/dummy_data/vital_signs_dummy_data.csv", "tests/dummy_data/observation_dummy_mapping.csv", - Observation, - "observation_ingestion", + one_to_one=False, ) + Observation.ingest_to_flat(df.dropna(), "observation_ingestion") + full_df = pd.read_parquet("observation_ingestion.parquet") assert len(full_df) == 33 @@ -235,6 +242,7 @@ def test_load_data_one_to_many_multi_row(): def test_convert_data_to_flat_local_mapping(): + output_folder = "tests/ingestion_output" mappings = { Encounter: "tests/dummy_data/encounter_dummy_mapping.csv", Observation: "tests/dummy_data/observation_dummy_mapping.csv", @@ -244,7 +252,7 @@ def test_convert_data_to_flat_local_mapping(): convert_data_to_flat( "tests/dummy_data/combined_dummy_data.csv", mapping_files_types=(mappings, resource_types), - folder_name="tests/ingestion_output", + folder_name=output_folder, ) encounter_df = pd.read_parquet("tests/ingestion_output/encounter.parquet") @@ -267,3 +275,5 @@ def test_convert_data_to_flat_local_mapping(): check_dtype=False, check_like=True, ) + + shutil.rmtree(output_folder) From e35200347a903bdf1631cbfb342566bdecf19dcb Mon Sep 17 00:00:00 2001 From: Pip Liggins Date: Wed, 15 May 2024 17:08:54 +0100 Subject: [PATCH 10/21] Make fhirflat installable --- fhirflat/__init__.py | 14 ++++++++++++++ pyproject.toml | 44 ++++++++++++++++++++++++++++++++++++++++++++ pytest.ini | 2 -- requirements.txt | 10 ---------- 4 files changed, 58 insertions(+), 12 deletions(-) create mode 100644 fhirflat/__init__.py create mode 100644 pyproject.toml delete mode 100644 pytest.ini delete mode 100644 requirements.txt diff --git a/fhirflat/__init__.py b/fhirflat/__init__.py new file mode 100644 index 0000000..8e0e52b --- /dev/null +++ 
b/fhirflat/__init__.py @@ -0,0 +1,14 @@ +from .resources.condition import Condition +from .resources.encounter import Encounter +from .resources.immunization import Immunization +from .resources.location import Location +from .resources.medicationadministration import MedicationAdministration +from .resources.medicationstatement import MedicationStatement +from .resources.observation import Observation +from .resources.organization import Organization +from .resources.patient import Patient +from .resources.procedure import Procedure +from .resources.researchsubject import ResearchSubject +from .resources.specimen import Specimen + +from .ingest import convert_data_to_flat diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..5641cd8 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,44 @@ +[build-system] +requires = ['setuptools>=40.8.0'] +build-backend = 'setuptools.build_meta' + +[tool.setuptools] +packages = ["fhirflat"] + +[project] +name = "fhirflat" +version = "0.1.0" +description = "Flattened FHIR resources" +authors = [ + {name = "Pip Liggins", email = "philippa.liggins@dtc.ox.ac.uk"}, + {name = "Abhishek Dasgupta", email = "abhishek.dasgupta@dtc.ox.ac.uk"}, +] +license = {file = "LICENSE"} +requires-python = ">=3.10" +readme = "README.md" +classifiers = ["License :: OSI Approved :: MIT License"] +dependencies = [ + "fhir.resources==7.1.0", + "numpy==1.26.4", + "orjson==3.9.13", + "pandas>=2.2.0", + "pyarrow==15.0.0", + "pydantic==2.6.1", + "pydantic_core==2.16.2", +] + +[project.optional-dependencies] +test = [ + "pytest", + "pytest-cov", + "pytest-unordered" +] + +[project.urls] +Home = "https://github.com/globaldothealth/fhirflat" + +[tool.black] +line-length = 88 + +[tool.pytest.ini_options] +pythonpath = "." diff --git a/pytest.ini b/pytest.ini deleted file mode 100644 index 03f586d..0000000 --- a/pytest.ini +++ /dev/null @@ -1,2 +0,0 @@ -[pytest] -pythonpath = . 
\ No newline at end of file diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index 1755bfb..0000000 --- a/requirements.txt +++ /dev/null @@ -1,10 +0,0 @@ -fhir.resources==7.1.0 -numpy==1.26.4 -orjson==3.9.13 -pandas==2.2.0 -pyarrow==15.0.0 -pydantic==2.6.1 -pydantic_core==2.16.2 -pytest==8.0.0 -pytest-cov -pytest-unordered From 464dd678faa1928c6d90163de38b4d60d18f6365 Mon Sep 17 00:00:00 2001 From: Pip Liggins Date: Wed, 15 May 2024 17:10:32 +0100 Subject: [PATCH 11/21] Allow mappings from google sheets --- fhirflat/ingest.py | 60 ++++++++++++++++++++++++++++++++------------ fhirflat/util.py | 6 +++++ tests/test_ingest.py | 3 +++ 3 files changed, 53 insertions(+), 16 deletions(-) diff --git a/fhirflat/ingest.py b/fhirflat/ingest.py index 2728d0e..5bd0ffc 100644 --- a/fhirflat/ingest.py +++ b/fhirflat/ingest.py @@ -9,6 +9,7 @@ import numpy as np import warnings import os +from fhirflat.util import get_local_resource # 1:1 (single row, single resource) mapping: Patient, Encounter # 1:M (single row, multiple resources) mapping: Observation, Condition, Procedure, ... @@ -132,7 +133,7 @@ def create_dict_from_cell(row, full_df, map_df): def create_dictionary( - data: pd.DataFrame, map_file: pd.DataFrame, one_to_one=False + data: pd.DataFrame, map_file: pd.DataFrame, resource: str, one_to_one=False ) -> pd.DataFrame: """ Given a data file and a single mapping file for one FHIR resource type, @@ -146,6 +147,8 @@ def create_dictionary( map_file: pd.DataFrame The mapping file containing the mapping of the clinical data to the FHIR resource. + resource: str + The name of the resource being mapped. one_to_one: bool Whether the resource should be mapped as one-to-one or one-to-many. 
""" @@ -155,11 +158,14 @@ def create_dictionary( # setup the data ----------------------------------------------------------- relevant_cols = map_df["redcap_variable"].dropna().unique() + filtered_data = data.loc[:, data.columns.isin(relevant_cols)].copy() - if one_to_one: - filtered_data = data[relevant_cols].copy() - else: - filtered_data = data.loc[:, data.columns.isin(relevant_cols)].reset_index() + if filtered_data.empty: + warnings.warn(f"No data found for the {resource} resource.", UserWarning) + return None + + if not one_to_one: + filtered_data = filtered_data.reset_index() melted_data = filtered_data.melt(id_vars="index", var_name="column") # set up the mappings ------------------------------------------------------- @@ -224,16 +230,38 @@ def convert_data_to_flat( if mapping_files_types: mappings, types = mapping_files_types - for resource, map_file in mappings.items(): - t = types[resource.__name__] - if t == "one-to-one": - df = create_dictionary(data, map_file, one_to_one=True) - elif t == "one-to-many": - df = create_dictionary(data, map_file, one_to_one=False) - df = df.dropna().reset_index(drop=True) + else: + sheet_link = ( + f"https://docs.google.com/spreadsheets/d/{sheet_id}/export?format=csv" + ) + + df_types = pd.read_csv(sheet_link, header=0, index_col="Resources") + types = dict( + zip( + df_types.index, + df_types["Resource Type"], + ) + ) + sheet_keys = {r: df_types.loc[r, "Sheet ID"] for r in types.keys()} + mappings = { + get_local_resource(r): sheet_link + f"&gid={i}" + for r, i in sheet_keys.items() + } + + for resource, map_file in mappings.items(): + + t = types[resource.__name__] + if t == "one-to-one": + df = create_dictionary(data, map_file, resource.__name__, one_to_one=True) + if df is None: + continue + elif t == "one-to-many": + df = create_dictionary(data, map_file, resource.__name__, one_to_one=False) + if df is None: + continue else: - raise ValueError(f"Unknown mapping type {t}") + df = df.dropna().reset_index(drop=True) + 
else: + raise ValueError(f"Unknown mapping type {t}") - resource.ingest_to_flat(df, folder_name + "/" + resource.__name__.lower()) - else: - pass + resource.ingest_to_flat(df, folder_name + "/" + resource.__name__.lower()) diff --git a/fhirflat/util.py b/fhirflat/util.py index fe1a8b6..71cdbe8 100644 --- a/fhirflat/util.py +++ b/fhirflat/util.py @@ -6,6 +6,8 @@ from .resources import extensions +import fhirflat + def group_keys(data_keys: list[str]) -> list[dict[str, list[str]]]: """ @@ -65,3 +67,7 @@ def get_local_extension_type(t: str): return getattr(extensions, t.capitalize()) except AttributeError: raise AttributeError(f"Could not find {t} in fhirflat extensions") + + +def get_local_resource(t: str): + return getattr(fhirflat, t) diff --git a/tests/test_ingest.py b/tests/test_ingest.py index c146a45..5f31e66 100644 --- a/tests/test_ingest.py +++ b/tests/test_ingest.py @@ -35,6 +35,7 @@ def test_load_data_one_to_one_single_row(): df = create_dictionary( "tests/dummy_data/encounter_dummy_data_single.csv", "tests/dummy_data/encounter_dummy_mapping.csv", + "Encounter", one_to_one=True, ) @@ -137,6 +138,7 @@ def test_load_data_one_to_one_multi_row(): df = create_dictionary( "tests/dummy_data/encounter_dummy_data_multi.csv", "tests/dummy_data/encounter_dummy_mapping.csv", + "Encounter", one_to_one=True, ) @@ -221,6 +223,7 @@ def test_load_data_one_to_many_multi_row(): df = create_dictionary( "tests/dummy_data/vital_signs_dummy_data.csv", "tests/dummy_data/observation_dummy_mapping.csv", + "Observation", one_to_one=False, ) From a49d78b31a3698ccc28175b3d363fbd26c0d01a3 Mon Sep 17 00:00:00 2001 From: Pip Liggins Date: Wed, 15 May 2024 17:14:09 +0100 Subject: [PATCH 12/21] Update test workflow for package --- .github/workflows/test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index b17ce45..a8ad5e7 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -27,7 +27,7 @@ 
jobs: python-version: "3.11" - name: Install dependencies run: | - python3 -m pip install -r requirements.txt + python3 -m pip install '.[test]' - name: Test with pytest run: | python3 -m pytest --cov From de6c177f391c40070adcb1dc541d48e41588429c Mon Sep 17 00:00:00 2001 From: Pip Liggins Date: Fri, 17 May 2024 16:14:34 +0100 Subject: [PATCH 13/21] Allow lists to be created during ingestion. Backbone elements converted to full FHIR to be written out as *_dense. Add ID to Encounter resource --- fhirflat/flat2fhir.py | 32 +- fhirflat/ingest.py | 22 +- fhirflat/resources/base.py | 44 ++- fhirflat/resources/encounter.py | 16 +- tests/data/encounter_flat.parquet | Bin 26278 -> 26792 bytes tests/dummy_data/combined_dummy_data.csv | 8 +- .../dummy_data/encounter_dummy_data_multi.csv | 8 +- .../encounter_dummy_data_single.csv | 2 +- tests/dummy_data/encounter_dummy_mapping.csv | 50 ++-- tests/dummy_data/vital_signs_dummy_data.csv | 6 +- tests/test_encounter_resource.py | 2 + tests/test_ingest.py | 274 +++++++++++++++--- 12 files changed, 367 insertions(+), 97 deletions(-) diff --git a/fhirflat/flat2fhir.py b/fhirflat/flat2fhir.py index c0d23af..ee26a4f 100644 --- a/fhirflat/flat2fhir.py +++ b/fhirflat/flat2fhir.py @@ -19,16 +19,28 @@ def create_codeable_concept( # for reading in from ingestion pipeline if (name + ".code" and name + ".system") in old_dict: code = old_dict[name + ".code"] - formatted_code = code if isinstance(code, str) else str(int(code)) - new_dict = { - "coding": [ - { - "system": old_dict[name + ".system"], - "code": formatted_code, - "display": old_dict[name + ".text"], - } - ] - } + if isinstance(code, list) and len(code) > 1: + new_dict = {"coding": []} + for system, code, name in zip( + old_dict[name + ".system"], code, old_dict[name + ".text"] + ): + formatted_code = code if isinstance(code, str) else str(int(code)) + display = name + + subdict = {"system": system, "code": code, "display": display} + + new_dict["coding"].append(subdict) + else: + 
formatted_code = code if isinstance(code, str) else str(int(code)) + new_dict = { + "coding": [ + { + "system": old_dict[name + ".system"], + "code": formatted_code, + "display": old_dict[name + ".text"], + } + ] + } return new_dict # From FHIRflat file diff --git a/fhirflat/ingest.py b/fhirflat/ingest.py index 5bd0ffc..9e2c923 100644 --- a/fhirflat/ingest.py +++ b/fhirflat/ingest.py @@ -9,6 +9,7 @@ import numpy as np import warnings import os +from math import isnan from fhirflat.util import get_local_resource # 1:1 (single row, single resource) mapping: Patient, Encounter @@ -16,7 +17,7 @@ """ TODO -* sort out reference formatting +* sort out reference formatting + how to choose ID's e.g. location within encounter etc * cope with 'if' statements - e.g. for date overwriting. * deal with duplicates/how to add multiple values to a single field - list options. * Consider using pandarallel (https://pypi.org/project/pandarallel/) to parallelize @@ -37,6 +38,14 @@ def find_field_value(row, response, mapp, raw_data=None): results = [find_field_value(row, response, m) for m in mapp] results = [x for x in results if x == x] return " ".join(results) + elif "if not" in mapp: + mapp = mapp.replace(" ", "").split("ifnot") + results = [find_field_value(row, response, m) for m in mapp] + x, y = results + if isinstance(y, float): + return x if isnan(y) else None + else: + return x if not y else None else: col = mapp.lstrip("<").rstrip(">") try: @@ -90,11 +99,14 @@ def create_dict_from_row(row, map_df): result[key] == snippet[key] for key in duplicate_keys ): # Ignore duplicates if they are the same continue + elif all(result[key] is None for key in duplicate_keys): + result.update(snippet) else: - raise ValueError( - "Duplicate keys in mapping:" - f" {set(result.keys()).intersection(snippet.keys())}" - ) + for key in duplicate_keys: + if isinstance(result[key], list): + result[key].append(snippet[key]) + else: + result[key] = [result[key], snippet[key]] return result diff --git 
a/fhirflat/resources/base.py b/fhirflat/resources/base.py index 2ccc53e..30f205d 100644 --- a/fhirflat/resources/base.py +++ b/fhirflat/resources/base.py @@ -6,6 +6,8 @@ import orjson from ..fhir2flat import fhir2flat +from ..flat2fhir import expand_concepts + from typing import TypeAlias, ClassVar JsonString: TypeAlias = str @@ -24,6 +26,8 @@ class FHIRFlatBase(DomainResource): flat_defaults: ClassVar[list[str]] = [] + backbone_elements: ClassVar[dict] = {} + @classmethod def attr_lists(cls) -> list[str]: """Attributes which take a list of FHIR types.""" @@ -73,19 +77,51 @@ def from_flat(cls, file: str) -> FHIRFlatBase | list[FHIRFlatBase]: else: return list(df["fhir"]) + @classmethod + def ingest_backbone_elements(cls, mapped_data: pd.Series): + """ + Takes ordered lists of data and forms the correct FHIR format which won't + be flattened after ingestion. + """ + + def fhir_format(row): + for b_e, b_c in cls.backbone_elements.items(): + keys_present = [key for key in row if key.startswith(b_e)] + if keys_present: + condensed_dict = {k: row[k] for k in keys_present} + if all( + not isinstance(v, list) or len(v) == 1 + for v in condensed_dict.values() + ): + continue + else: + backbone_list = [] + for i in range(len(next(iter(condensed_dict.values())))): + first_item = { + k.lstrip(b_e + "."): v[i] + for k, v in condensed_dict.items() + } + backbone_list.append(expand_concepts(first_item, b_c)) + for k_d in condensed_dict: + row.pop(k_d) + row[b_e] = backbone_list + return row + + mapped_data.apply(fhir_format) + return mapped_data + @classmethod def ingest_to_flat(cls, data: pd.DataFrame, filename: str): """ Takes a pandas dataframe and populates the resource with the data. + Creates a FHIRflat parquet file for the resources. 
data: pd.DataFrame Pandas dataframe containing the data - - Returns - ------- - FHIRFlatBase or list[FHIRFlatBase] """ + data["flat_dict"] = cls.ingest_backbone_elements(data["flat_dict"]) + # Creates a columns of FHIR resource instances data["fhir"] = data["flat_dict"].apply( lambda x: cls.cleanup(x, json_data=False) diff --git a/fhirflat/resources/encounter.py b/fhirflat/resources/encounter.py index 662aea7..f65500a 100644 --- a/fhirflat/resources/encounter.py +++ b/fhirflat/resources/encounter.py @@ -1,5 +1,12 @@ from __future__ import annotations from fhir.resources.encounter import Encounter as _Encounter +from fhir.resources.encounter import ( + EncounterAdmission, + EncounterDiagnosis, + EncounterLocation, + EncounterParticipant, + EncounterReason, +) from .base import FHIRFlatBase import orjson @@ -36,7 +43,6 @@ class Encounter(_Encounter, FHIRFlatBase): # attributes to exclude from the flat representation flat_exclusions: ClassVar[set[str]] = FHIRFlatBase.flat_exclusions + ( - "id", "identifier", "participant", # participants other than the patient "appointment", # appointment that scheduled the encounter @@ -49,6 +55,14 @@ class Encounter(_Encounter, FHIRFlatBase): # required attributes that are not present in the FHIRflat representation flat_defaults: ClassVar[list[str]] = FHIRFlatBase.flat_defaults + ["status"] + backbone_elements: ClassVar[dict] = { + "participant": EncounterParticipant, + "reason": EncounterReason, + "diagnosis": EncounterDiagnosis, + "admission": EncounterAdmission, + "location": EncounterLocation, + } + @validator("extension") def validate_extension_contents(cls, extensions): rel_phase_count = sum(isinstance(item, relativePeriod) for item in extensions) diff --git a/tests/data/encounter_flat.parquet b/tests/data/encounter_flat.parquet index cf34fd9d81a5956f83e3edba247ae1b6d7a3252a..a1cdf7a9f8f827680d85e0cfc67d93aefe31d848 100644 GIT binary patch delta 1534 zcmb7?T}&fY6vsOQEfjZIpdYrS2us*e!^gDK4pNt0xGhj-pk=0|pR6H6OG}5g)O4Y= 
zn&3yGBuiFyj}JUxqAx}hqnK_CCO%-|gD)sPnCz1gU(6}cTEoKv$Ng7BaX?%$=ZW7eKpW5_W2USl!Pz=xO*2vo;+|gOpw*~4=0h;vRl3#}4 zSZ`DR8l<*@@NugZz8<&?1w)GbB!(CCaAL5kKa5f*5qR6sU8RjX1i2%@i!`m?d5*e1 z2YdAI@QZ1d+(`px?pA-0qJCb4g82<{QwG9vLA{xwzF2}oa}Uf|R>;?KaA4_BU(Zrs zEyK2@3rbJ7!Kl6$M%u5EKNa9pd!KrDg}RkLy~;?t7S1s&j9P)*TLmxEr#>lCTSeHg zj>10E23swU!%7EF-dl%5=1KKQiTbPr8y$Uc!O_-gs5jpmXH_OmF|o9mlMF-E!6*9a z>oBNlxcgOt=)57stInRg+JXPc@qCZ|QI3OkR1J_LH6a3RY^$Rlf%wP60FP|_wE*I} z8U%I!MQ*SbHC-)QdegH;l4>+=59?JbbDv35kn%~XXnLiqDmv0lLO3QLV`&k_J>AVn97hPb@kEFdtE-;lvt%v{yJO}$DI&u3 zY`(!eQBID#a`2aE{#V}!<(u4PDdH+i(Etm?L?@hxMv$ipkL4eTA5V3{Zomi@Z=pTU zXJb+S+Cqk9Vv%)wJjlum3@gW@ZW}0h(?JU7>ESzmwiV|l7?(Yf^Jl}8*-|_R^P;Kw z;UJ)ib~p&m>MQXm&qxt}HZ;kvC3D&3c$9-e$gHas#}Cpv!Wk(SmXjW3S4fANL7ZJW zlRQ00xl|wvZ;hEW_X99L^#c4IYJhiy4oyD5CcGgy3RE;+*1q5^L5trAH$5i!K`@iq zRgh+^8Y#ls{4&gY+c(0tBIn`dL@vCP^t#HNypmol^F=Abmt#?$ReUb*X0a|WI1-s@ z56UES;c|k>Vouy3=i%1=O;==a#s?bsii4(IX~@J~4L(Kpits+xf}&{zcjBCOOm<>D z=kI*C&)MB{S8?`rfItjx8^QueM(xduUCL;vA%|}l9F~62CmQpULlVU yso>=y{E+bprR|X$KLg?Lu>S)XKI}sP delta 1288 zcmZuvO>7%g5VoCc9H+Eu8pk+JnnDB%l9SqAJ0TVjc)O0*Ht~Ax#P)jE35|Dc$M$;d z$T;pAN<%@khe!>Y8(b<>LB*++IYg)&-~d9j{2VwSRgg*raX=NfN+<}Pdq802oxX48 zdo%OBH`l&5esJ4({o)85wVpBEib2gfLEPfVOC0Q4e=yx#hT^E3_$^Lu$Kl5C1niAo zHGPzZHJh7wKSk~*VZ!a}d}({zXxe!hVs;m?Bas&}AU%2yDxMRdkFA(?^04ihBCaUp zuQGh-nT8>&73Lf<(?Jm~I-VpB3glh^ZaBd7XBqs|N#a78{JaDbMS(W#guf{Zd^27# z-P2%~nkIIu(0!&vMFzKBp_BYAxO^A6D@YBh4(;r)aCZ8hqo8)f22?(aO9Y8j1_@9<*ht0(STdd<)Ek%zcn=p%dV&mQJ{ zq}TOKuUlNc$F8^6qSR||x!2l@cE8O;4%#j~Yc&|grtJgubxm1s*xxST>O5Fr zh`}pCSNC{;4!qFEAs*feOdVrtffhnE!Y5MWJ0Az`9<^xnhO8xcu>|*%_Q8J%7as#x zY6X5?w!x3-kt1>_QqOrKbtP1RSJ~I#@`?@aE{+`kPrRwHDIaX}{BimC}`}ie zJVZ-cLd}QZlb~Zb$Es~5_=+co2gp$gRe*EpFq~K%A7df~fl`}z8lOO&Rh4S0A++g& zpaXs_b3LcUDUN~H({@5_DuE1q&5obT=cZ4VPNQuqLXq#?^C_uM{hxQgkKRj55AfSaUbJ+Dv-3k`cH9Zm-Lfx|QqJ zWWxGVIgE7?KmJ^{`@=FDY>I1au#Iu98wjErLeXLbZ$O5LBlHX^BD93UnEM!<4|MrU zD1^D2#(Tmd8RuKL7~Sgb43F@oSSx@KBO{!Tq9~jfpEs_9QAZoeY9> 
z+Cn2g#x!4m%hLQnt4lbzExF89zJNEg4Sm_u=4b$4a4y1p)^Fw?VmF+{mz5;vM zuqD{Wc)sZ5VQBR^3ln$pE?0^#!?)6mxyhGZE^iy&TwSolx;keI>qKd0AnC+iDEMpD b)yb~?Vlhrae1n5C8;@8FddOhlQNr^NFuAqI diff --git a/tests/dummy_data/combined_dummy_data.csv b/tests/dummy_data/combined_dummy_data.csv index 94449ef..2988531 100644 --- a/tests/dummy_data/combined_dummy_data.csv +++ b/tests/dummy_data/combined_dummy_data.csv @@ -1,5 +1,5 @@ usubjid,visitid,dates_enrolment,dates_adm,dates_admdate,dates_admtime,non_encounter_field,outco_denguediag,outco_denguediag_main,outco_denguediag_class,outco_not_dengue,outco_secondiag_oth,outco_date,outco_outcome,daily_date,vital_highesttem_c,vital_hr,vital_rr,vital_systolicbp,vital_diastolicbp,vital_spo2,vital_fio2spo2_02110,vital_fio2spo2_pcnt,vital_capillaryr,vital_avpu,vital_gcs,vital_urineflow -1,10,2020-05-01,0,,,,,,,cough,,,7,2020-01-01,36.2,120,30,70,120,5,,75,1,1,1,150 -2,11,,1,2021-04-01,18:00,fish,1,,2,,,2021-04-10,1,2021-02-02,37,100,40,80,130,6,10,85,0,2,1,200 -3,12,,1,2021-05-10,17:30,,1,,1,flu,,2021-05-15,4,2022-03-03,35.5,70,50,90,140,7,,95,0,3,1, -4,13,,1,2022-06-15,21:00,dolphin,0,Malaria,,,,2022-06-20,2,,,,,,,,,,,,, \ No newline at end of file +p1,e10,2020-05-01,0,,,,,,,cough,,,7,2020-01-01,36.2,120,30,70,120,5,,75,1,1,1,150 +p2,e11,,1,2021-04-01,18:00,fish,1,,2,,,2021-04-10,1,2021-02-02,37,100,40,80,130,6,10,85,0,2,1,200 +p3,e12,,1,2021-05-10,17:30,,1,,1,flu,,2021-05-15,4,2022-03-03,35.5,70,50,90,140,7,,95,0,3,1, +p4,e13,,1,2022-06-15,21:00,dolphin,0,Malaria,,,,2022-06-20,2,,,,,,,,,,,,, \ No newline at end of file diff --git a/tests/dummy_data/encounter_dummy_data_multi.csv b/tests/dummy_data/encounter_dummy_data_multi.csv index d0ad22e..286a771 100644 --- a/tests/dummy_data/encounter_dummy_data_multi.csv +++ b/tests/dummy_data/encounter_dummy_data_multi.csv @@ -1,5 +1,5 @@ 
usubjid,visitid,dates_enrolment,dates_adm,dates_admdate,dates_admtime,non_encounter_field,outco_denguediag,outco_denguediag_main,outco_denguediag_class,outco_not_dengue,outco_secondiag_oth,outco_date,outco_outcome -1,10,2020-05-01,0,,,,,,,cough,,,7 -2,11,,1,2021-04-01,18:00,fish,1,,2,,,2021-04-10,1 -3,12,,1,2021-05-10,17:30,,1,,1,flu,,2021-05-15,4 -4,13,,1,2022-06-15,21:00,dolphin,0,Malaria,,,,2022-06-20,2 \ No newline at end of file +p1,e10,2020-05-01,0,,,,,,,cough,,,7 +p2,e11,,1,2021-04-01,18:00,fish,1,,2,,,2021-04-10,1 +p3,e12,,1,2021-05-10,17:30,,1,,1,flu,,2021-05-15,4 +p4,e13,,1,2022-06-15,21:00,dolphin,0,Malaria,,,,2022-06-20,2 \ No newline at end of file diff --git a/tests/dummy_data/encounter_dummy_data_single.csv b/tests/dummy_data/encounter_dummy_data_single.csv index b7c635d..4f2122f 100644 --- a/tests/dummy_data/encounter_dummy_data_single.csv +++ b/tests/dummy_data/encounter_dummy_data_single.csv @@ -1,2 +1,2 @@ usubjid,visitid,dates_enrolment,dates_adm,dates_admdate,dates_admtime,non_encounter_field,outco_denguediag,outco_denguediag_main,outco_denguediag_class,outco_not_dengue,outco_secondiag_oth,outco_date,outco_outcome -2,11,,1,2021-04-01,18:00,fish,1,,2,,,2021-04-10,1 \ No newline at end of file +2,11,2021-04-02,1,2021-04-01,18:00,fish,1,,2,,,2021-04-10,1 \ No newline at end of file diff --git a/tests/dummy_data/encounter_dummy_mapping.csv b/tests/dummy_data/encounter_dummy_mapping.csv index 5d88a4e..0c7efec 100644 --- a/tests/dummy_data/encounter_dummy_mapping.csv +++ b/tests/dummy_data/encounter_dummy_mapping.csv @@ -1,25 +1,25 @@ 
-redcap_variable,redcap_response,subject,class.system,class.code,class.text,actualPeriod.start,actualPeriod.end,reason.value.concept.system,reason.value.concept.code,reason.value.concept.text,reason.use.system,reason.use.code,reason.use.text,diagnosis.condition.concept.system,diagnosis.condition.concept.code,diagnosis.condition.concept.text,diagnosis.use.system,diagnosis.use.code,diagnosis.use.text,admission.dischargeDisposition.system,admission.dischargeDisposition.code,admission.dischargeDisposition.text -usubjid,,,,,,,,,,,,,,,,,,,,,, -dates_enrolment,,,,,,,,,,,,,,,,,,,,,, -dates_adm,"1, Yes",,https://snomed.info/sct,32485007,Hospital admission (procedure),,,,,,,,,,,,,,,,, -,"0, No",,https://snomed.info/sct,32485007,Hospital admission (procedure),,,,,,,,,,,,,,,,, -,,,https://snomed.info/sct,371883000,Outpatient procedure (procedure),,,,,,,,,,,,,,,,, -,"99, Unknown",,https://snomed.info/sct,32485007,Hospital admission (procedure),,,,,,,,,,,,,,,,, -dates_admdate,,,,,,+,,,,,,,,,,,,,,,, -dates_admtime,,,,,,+,,,,,,,,,,,,,,,, -outco_denguediag,"1, Yes",,,,,,,https://snomed.info/sct,38362002,Dengue (disorder),https://snomed.info/sct,89100005,Final diagnosis (discharge) (contextual qualifier) (qualifier value),,,,,,,,, -,"0, No",,,,,,,,,,,,,,,,,,,,, -,"99, Unknown",,,,,,,https://snomed.info/sct,261665006,Unknown (qualifier value),,,,,,,,,,,, -outco_denguediag_main,,,,,,,,,,,,,,,,,https://snomed.info/sct,89100005,Final diagnosis (discharge) (contextual qualifier) (qualifier value),,, -outco_denguediag_class,"1, Uncomplicated dengue",,,,,,,,,,,,,https://snomed.info/sct,722862003,Dengue without warning signs (disorder),https://snomed.info/sct,89100005,Final diagnosis (discharge) (contextual qualifier) (qualifier value),,, -,"2, Dengue with warning signs",,,,,,,,,,,,,https://snomed.info/sct,722863008,Dengue with warning signs (disorder),https://snomed.info/sct,89100005,Final diagnosis (discharge) (contextual qualifier) (qualifier value),,, -,"3, Severe 
dengue",,,,,,,,,,,,,https://snomed.info/sct,20927009,Dengue hemorrhagic fever (disorder),https://snomed.info/sct,89100005,Final diagnosis (discharge) (contextual qualifier) (qualifier value),,, -outco_secondiag_oth,,,,,,,,,,,,,,,,,https://snomed.info/sct,85097005,Secondary diagnosis (contextual qualifier) (qualifier value),,, -outco_date,,,,,,,,,,,,,,,,,,,,,, -outco_outcome,"1, Discharged alive",,,,,,,,,,,,,,,,,,,https://snomed.info/sct,371827001,Patient discharged alive (finding) -,"2, Still hospitalised",,,,,,,,,,,,,,,,,,,https://snomed.info/sct,32485007,Hospital admission (procedure) -,"3, Transfer to other facility",,,,,,,,,,,,,,,,,,,https://snomed.info/sct,306685000,Discharge to establishment (procedure) -,"4, Death",,,,,,,,,,,,,,,,,,,https://snomed.info/sct,419099009,Dead (finding) -,"5, Palliative care",,,,,,,,,,,,,,,,,,,https://snomed.info/sct,306237005,Referral to palliative care service (procedure) -,"6, Discharged against medical advice",,,,,,,,,,,,,,,,,,,https://snomed.info/sct,225928004,Patient self-discharge against medical advice (procedure) -,"7, Alive, not admitted",,,,,,,,,,,,,,,,,,,https://snomed.info/sct,371827001,Patient discharged alive (finding) \ No newline at end of file +redcap_variable,redcap_response,id,subject,extension.timingPhase.system,extension.timingPhase.code,extension.timingPhase.text,class.system,class.code,class.text,actualPeriod.start,actualPeriod.end,diagnosis.condition.concept.system,diagnosis.condition.concept.code,diagnosis.condition.concept.text,diagnosis.use.system,diagnosis.use.code,diagnosis.use.text,admission.dischargeDisposition.system,admission.dischargeDisposition.code,admission.dischargeDisposition.text +usubjid,,,,,,,,,,,,,,,,,,,, +visitid,,,,,,,,,,,,,,,,,,,, +dates_enrolment,,,,,,,,,, if not , if not ,,,,,,,,, +dates_adm,"1, Yes",,,https://snomed.info/sct,278307001,On admission (qualifier value),https://snomed.info/sct,32485007,Hospital admission (procedure),,,,,,,,,,, +,"0, 
No",,,https://snomed.info/sct,281379000,Pre-admission (qualifier value),https://snomed.info/sct,371883000,Outpatient procedure (procedure),,,,,,,,,,, +,"99, Unknown",,,,,,https://snomed.info/sct,32485007,Hospital admission (procedure),,,,,,,,,,, +dates_admdate,,,,,,,,,,+,,,,,,,,,, +dates_admtime,,,,,,,,,,+,,,,,,,,,, +outco_denguediag,"1, Yes",,,,,,,,,,,https://snomed.info/sct,38362002,Dengue (disorder),https://snomed.info/sct,89100005,Final diagnosis (discharge) (contextual qualifier) (qualifier value),,, +,"0, No",,,,,,,,,,,,,,,,,,, +,"99, Unknown",,,,,,,,,,,https://snomed.info/sct,261665006,Unknown (qualifier value),,,,,, +outco_date,,,,,,,,,,,,,,,,,,,, +outco_outcome,"1, Discharged alive",,,,,,,,,,,,,,,,,https://snomed.info/sct,371827001,Patient discharged alive (finding) +,"2, Still hospitalised",,,,,,,,,,,,,,,,,https://snomed.info/sct,32485007,Hospital admission (procedure) +,"3, Transfer to other facility",,,,,,,,,,,,,,,,,https://snomed.info/sct,306685000,Discharge to establishment (procedure) +,"4, Death",,,,,,,,,,,,,,,,,https://snomed.info/sct,419099009,Dead (finding) +,"5, Palliative care",,,,,,,,,,,,,,,,,https://snomed.info/sct,306237005,Referral to palliative care service (procedure) +,"6, Discharged against medical advice",,,,,,,,,,,,,,,,,https://snomed.info/sct,225928004,Patient self-discharge against medical advice (procedure) +,"7, Alive, not admitted",,,,,,,,,,,,,,,,,https://snomed.info/sct,371827001,Patient discharged alive (finding) +outco_denguediag_main,,,,,,,,,,,,,,,https://snomed.info/sct,89100005,Final diagnosis (discharge) (contextual qualifier) (qualifier value),,, +outco_denguediag_class,"1, Uncomplicated dengue",,,,,,,,,,,https://snomed.info/sct,722862003,Dengue without warning signs (disorder),https://snomed.info/sct,89100005,Final diagnosis (discharge) (contextual qualifier) (qualifier value),,, +,"2, Dengue with warning signs",,,,,,,,,,,https://snomed.info/sct,722863008,Dengue with warning signs 
(disorder),https://snomed.info/sct,89100005,Final diagnosis (discharge) (contextual qualifier) (qualifier value),,, +,"3, Severe dengue",,,,,,,,,,,https://snomed.info/sct,20927009,Dengue hemorrhagic fever (disorder),https://snomed.info/sct,89100005,Final diagnosis (discharge) (contextual qualifier) (qualifier value),,, +outco_secondiag_oth,,,,,,,,,,,,,,,https://snomed.info/sct,85097005,Secondary diagnosis (contextual qualifier) (qualifier value),,, \ No newline at end of file diff --git a/tests/dummy_data/vital_signs_dummy_data.csv b/tests/dummy_data/vital_signs_dummy_data.csv index a361b93..4f96f3c 100644 --- a/tests/dummy_data/vital_signs_dummy_data.csv +++ b/tests/dummy_data/vital_signs_dummy_data.csv @@ -1,4 +1,4 @@ usubjid,visitid,daily_date,vital_highesttem_c,vital_hr,vital_rr,vital_systolicbp,vital_diastolicbp,vital_spo2,vital_fio2spo2_02110,vital_fio2spo2_pcnt,vital_capillaryr,vital_avpu,vital_gcs,vital_urineflow -1,10,2020-01-01,36.2,120,30,70,120,5,,75,1,1,1,150 -2,11,2021-02-02,37,100,40,80,130,6,10,85,0,2,1,200 -3,12,2022-03-03,35.5,70,50,90,140,7,,95,0,3,1, \ No newline at end of file +p1,e10,2020-01-01,36.2,120,30,70,120,5,,75,1,1,1,150 +p2,e11,2021-02-02,37,100,40,80,130,6,10,85,0,2,1,200 +p3,e12,2022-03-03,35.5,70,50,90,140,7,,95,0,3,1, \ No newline at end of file diff --git a/tests/test_encounter_resource.py b/tests/test_encounter_resource.py index fac7f83..260435d 100644 --- a/tests/test_encounter_resource.py +++ b/tests/test_encounter_resource.py @@ -176,6 +176,7 @@ ENCOUNTER_FLAT = { "resourceType": "Encounter", + "id": "f203", "extension.timingPhase.code": "http://snomed.info/sct|278307001", "extension.timingPhase.text": "on admission", "extension.relativePeriod.relativeStart": 2, @@ -236,6 +237,7 @@ ENCOUNTER_DICT_OUT = { "resourceType": "Encounter", + "id": "f203", "status": "completed", "extension": [ { diff --git a/tests/test_ingest.py b/tests/test_ingest.py index 5f31e66..bfcc941 100644 --- a/tests/test_ingest.py +++ b/tests/test_ingest.py 
@@ -11,23 +11,117 @@ from decimal import Decimal +ENCOUNTER_DICT_OUT = { + "id": 11, + "subject": 2, + "actualPeriod.start": "2021-04-01 18:00", + "actualPeriod.end": "2021-04-10", + "extension.timingPhase.system": "https://snomed.info/sct", + "extension.timingPhase.code": 278307001, + "extension.timingPhase.text": "On admission (qualifier value)", + "class.system": "https://snomed.info/sct", + "class.code": 32485007, + "class.text": "Hospital admission (procedure)", + "diagnosis.condition.concept.system": [ + "https://snomed.info/sct", + "https://snomed.info/sct", + ], + "diagnosis.condition.concept.code": [38362002, 722863008], + "diagnosis.condition.concept.text": [ + "Dengue (disorder)", + "Dengue with warning signs (disorder)", + ], + "diagnosis.use.system": ["https://snomed.info/sct", "https://snomed.info/sct"], + "diagnosis.use.code": [89100005, 89100005], + "diagnosis.use.text": [ + "Final diagnosis (discharge) (contextual qualifier) (qualifier value)", + "Final diagnosis (discharge) (contextual qualifier) (qualifier value)", + ], + "admission.dischargeDisposition.system": "https://snomed.info/sct", + "admission.dischargeDisposition.code": 371827001, + "admission.dischargeDisposition.text": "Patient discharged alive (finding)", +} + + +def test_create_dict_one_to_one_single_row(): + df = create_dictionary( + "tests/dummy_data/encounter_dummy_data_single.csv", + "tests/dummy_data/encounter_dummy_mapping.csv", + "Encounter", + one_to_one=True, + ) + + dict_out = df["flat_dict"][0] + + assert dict_out == ENCOUNTER_DICT_OUT + + ENCOUNTER_SINGLE_ROW_FLAT = { "resourceType": "Encounter", + "id": "11", "class.code": "https://snomed.info/sct|32485007", "class.text": "Hospital admission (procedure)", - "reason.use.code": "https://snomed.info/sct|89100005", - "reason.use.text": "Final diagnosis (discharge) (contextual qualifier) (qualifier value)", # noqa: E501 - "reason.value.concept.code": "https://snomed.info/sct|38362002", - "reason.value.concept.text": "Dengue 
(disorder)", - "diagnosis.condition.concept.code": "https://snomed.info/sct|722863008", - "diagnosis.condition.concept.text": "Dengue with warning signs (disorder)", - "diagnosis.use.code": "https://snomed.info/sct|89100005", - "diagnosis.use.text": "Final diagnosis (discharge) (contextual qualifier) (qualifier value)", # noqa: E501 + "diagnosis_dense": [ + { + "condition": [ + { + "concept": { + "coding": [ + { + "code": "38362002", + "display": "Dengue (disorder)", + "system": "https://snomed.info/sct", + } + ] + } + } + ], + "use": [ + { + "coding": [ + { + "code": "89100005", + "display": "Final diagnosis (discharge) (contextual qualifier) (qualifier value)", # noqa: E501 + "system": "https://snomed.info/sct", + } + ] + } + ], + }, + { + "condition": [ + { + "concept": { + "coding": [ + { + "system": "https://snomed.info/sct", + "code": "722863008", + "display": "Dengue with warning signs (disorder)", + } + ] + } + } + ], + "use": [ + { + "coding": [ + { + "code": "89100005", + "display": "Final diagnosis (discharge) (contextual qualifier) (qualifier value)", # noqa: E501 + "system": "https://snomed.info/sct", + } + ] + } + ], + }, + ], "subject": "2", "actualPeriod.start": "2021-04-01 18:00:00", "actualPeriod.end": "2021-04-10", "admission.dischargeDisposition.code": "https://snomed.info/sct|371827001", "admission.dischargeDisposition.text": "Patient discharged alive (finding)", + "extension.timingPhase.code": ["https://snomed.info/sct|278307001"], + "extension.timingPhase.text": ["On admission (qualifier value)"], } @@ -52,61 +146,149 @@ def test_load_data_one_to_one_single_row(): ENCOUNTER_SINGLE_ROW_MULTI = { "resourceType": ["Encounter", "Encounter", "Encounter", "Encounter"], "class.code": [ - "https://snomed.info/sct|32485007", + "https://snomed.info/sct|371883000", "https://snomed.info/sct|32485007", "https://snomed.info/sct|32485007", "https://snomed.info/sct|32485007", ], "class.text": [ - "Hospital admission (procedure)", + "Outpatient procedure 
(procedure)", "Hospital admission (procedure)", "Hospital admission (procedure)", "Hospital admission (procedure)", ], - "reason.use.code": [ + "diagnosis_dense": [ None, - "https://snomed.info/sct|89100005", - "https://snomed.info/sct|89100005", + [ + { + "condition": [ + { + "concept": { + "coding": [ + { + "code": "38362002", + "display": "Dengue (disorder)", + "system": "https://snomed.info/sct", + } + ] + } + } + ], + "use": [ + { + "coding": [ + { + "code": "89100005", + "display": "Final diagnosis (discharge) (contextual qualifier) (qualifier value)", # noqa: E501 + "system": "https://snomed.info/sct", + } + ] + } + ], + }, + { + "condition": [ + { + "concept": { + "coding": [ + { + "code": "722863008", + "display": "Dengue with warning signs (disorder)", + "system": "https://snomed.info/sct", + } + ] + } + } + ], + "use": [ + { + "coding": [ + { + "code": "89100005", + "display": "Final diagnosis (discharge) (contextual qualifier) (qualifier value)", # noqa: E501 + "system": "https://snomed.info/sct", + } + ] + } + ], + }, + ], + [ + { + "condition": [ + { + "concept": { + "coding": [ + { + "code": "38362002", + "display": "Dengue (disorder)", + "system": "https://snomed.info/sct", + } + ] + } + } + ], + "use": [ + { + "coding": [ + { + "code": "89100005", + "display": "Final diagnosis (discharge) (contextual qualifier) (qualifier value)", # noqa: E501 + "system": "https://snomed.info/sct", + } + ] + } + ], + }, + { + "condition": [ + { + "concept": { + "coding": [ + { + "code": "722862003", + "display": "Dengue without warning signs (disorder)", # noqa: E501 + "system": "https://snomed.info/sct", + } + ] + } + } + ], + "use": [ + { + "coding": [ + { + "code": "89100005", + "display": "Final diagnosis (discharge) (contextual qualifier) (qualifier value)", # noqa: E501 + "system": "https://snomed.info/sct", + } + ] + } + ], + }, + ], None, ], - "reason.use.text": [ - None, - "Final diagnosis (discharge) (contextual qualifier) (qualifier value)", - "Final 
diagnosis (discharge) (contextual qualifier) (qualifier value)", - None, - ], - "reason.value.concept.code": [ - None, - "https://snomed.info/sct|38362002", - "https://snomed.info/sct|38362002", + "diagnosis.condition.concept.text": [ None, - ], - "reason.value.concept.text": [None, "Dengue (disorder)", "Dengue (disorder)", None], - "diagnosis.condition.concept.code": [ None, - "https://snomed.info/sct|722863008", - "https://snomed.info/sct|722862003", None, + "Malaria", ], - "diagnosis.condition.concept.text": [ + "diagnosis.use.code": [ None, - "Dengue with warning signs (disorder)", - "Dengue without warning signs (disorder)", None, - ], - "diagnosis.use.code": [ None, "https://snomed.info/sct|89100005", - "https://snomed.info/sct|89100005", - "https://snomed.info/sct|89100005", ], "diagnosis.use.text": [ None, - "Final diagnosis (discharge) (contextual qualifier) (qualifier value)", - "Final diagnosis (discharge) (contextual qualifier) (qualifier value)", + None, + None, "Final diagnosis (discharge) (contextual qualifier) (qualifier value)", ], - "subject": ["1", "2", "3", "4"], + "subject": ["p1", "p2", "p3", "p4"], + "id": ["e10", "e11", "e12", "e13"], "actualPeriod.start": [ "2020-05-01", "2021-04-01 18:00:00", @@ -131,6 +313,18 @@ def test_load_data_one_to_one_single_row(): "Dead (finding)", "Hospital admission (procedure)", ], + "extension.timingPhase.code": [ + ["https://snomed.info/sct|281379000"], + ["https://snomed.info/sct|278307001"], + ["https://snomed.info/sct|278307001"], + ["https://snomed.info/sct|278307001"], + ], + "extension.timingPhase.text": [ + ["Pre-admission (qualifier value)"], + ["On admission (qualifier value)"], + ["On admission (qualifier value)"], + ["On admission (qualifier value)"], + ], } @@ -196,8 +390,8 @@ def test_load_data_one_to_one_multi_row(): "Heart rate", "Heart rate", ], - "subject": ["1", "2", "3", "1", "2"], - "encounter": ["10", "11", "12", "10", "11"], + "subject": ["p1", "p2", "p3", "p1", "p2"], + "encounter": 
["e10", "e11", "e12", "e10", "e11"], "valueQuantity.value": [Decimal("36.2"), 37.0, 35.5, 120.0, 100.0], "valueQuantity.unit": [ "DegreesCelsius", From 234fd092afec6707e0064ca7de7e5fb42a41831e Mon Sep 17 00:00:00 2001 From: Pip Liggins Date: Mon, 20 May 2024 11:53:14 +0100 Subject: [PATCH 14/21] Improve references --- fhirflat/ingest.py | 23 ++++++----- fhirflat/resources/base.py | 19 ++++++++- fhirflat/resources/patient.py | 4 +- tests/dummy_data/combined_dummy_data.csv | 8 ++-- .../dummy_data/encounter_dummy_data_multi.csv | 8 ++-- tests/dummy_data/encounter_dummy_mapping.csv | 2 +- .../dummy_data/observation_dummy_mapping.csv | 38 +++++++++--------- tests/dummy_data/vital_signs_dummy_data.csv | 6 +-- tests/ingestion_output/encounter.parquet | Bin 0 -> 18587 bytes tests/ingestion_output/observation.parquet | Bin 0 -> 13457 bytes tests/ingestion_output/patient.parquet | Bin 0 -> 2182 bytes tests/test_ingest.py | 18 ++++++--- 12 files changed, 76 insertions(+), 50 deletions(-) create mode 100644 tests/ingestion_output/encounter.parquet create mode 100644 tests/ingestion_output/observation.parquet create mode 100644 tests/ingestion_output/patient.parquet diff --git a/fhirflat/ingest.py b/fhirflat/ingest.py index 9e2c923..ff06aa3 100644 --- a/fhirflat/ingest.py +++ b/fhirflat/ingest.py @@ -1,8 +1,6 @@ """ Stores the main functions for converting clinical data (initally from RedCap-ARCH) to FHIRflat. - -TODO: Eventually, this should link to a google sheet file that contains the mappings """ import pandas as pd @@ -17,9 +15,10 @@ """ TODO -* sort out reference formatting + how to choose ID's e.g. location within encounter etc +* sort out how to choose ID's e.g. location within encounter etc * cope with 'if' statements - e.g. for date overwriting. -* deal with duplicates/how to add multiple values to a single field - list options. +* deal with how to check if lists are appropriate when adding multiple values to a + single field - list options. 
* Consider using pandarallel (https://pypi.org/project/pandarallel/) to parallelize the apply function, particularly for one to many mappings. """ @@ -35,23 +34,25 @@ def find_field_value(row, response, mapp, raw_data=None): return response elif "+" in mapp: mapp = mapp.split("+") - results = [find_field_value(row, response, m) for m in mapp] - results = [x for x in results if x == x] - return " ".join(results) + results = [find_field_value(row, response, m, raw_data) for m in mapp] + results = [str(x) for x in results if x == x] + return " ".join(results) if "/" not in results[0] else "".join(results) elif "if not" in mapp: mapp = mapp.replace(" ", "").split("ifnot") - results = [find_field_value(row, response, m) for m in mapp] + results = [find_field_value(row, response, m, raw_data) for m in mapp] x, y = results if isinstance(y, float): return x if isnan(y) else None else: return x if not y else None - else: + elif "<" in mapp: col = mapp.lstrip("<").rstrip(">") try: return row[col] except KeyError: return raw_data.loc[row["index"], col] + else: + return mapp def create_dict_from_row(row, map_df): @@ -276,4 +277,6 @@ def convert_data_to_flat( else: raise ValueError(f"Unknown mapping type {t}") - resource.ingest_to_flat(df, folder_name + "/" + resource.__name__.lower()) + resource.ingest_to_flat( + df, os.path.join(folder_name, resource.__name__.lower()) + ) diff --git a/fhirflat/resources/base.py b/fhirflat/resources/base.py index 30f205d..4f56e31 100644 --- a/fhirflat/resources/base.py +++ b/fhirflat/resources/base.py @@ -14,6 +14,9 @@ class FHIRFlatBase(DomainResource): + """ + Base class for FHIR resources to add FHIRflat functionality. 
+ """ flat_exclusions: ClassVar[set[str]] = ( "meta", @@ -78,10 +81,21 @@ def from_flat(cls, file: str) -> FHIRFlatBase | list[FHIRFlatBase]: return list(df["fhir"]) @classmethod - def ingest_backbone_elements(cls, mapped_data: pd.Series): + def ingest_backbone_elements(cls, mapped_data: pd.Series) -> pd.Series: """ Takes ordered lists of data and forms the correct FHIR format which won't - be flattened after ingestion. + be flattened after ingestion (*_dense columns). + + Parameters + ---------- + mapped_data: pd.Series + Pandas series of FHIRflat-like dictionaries ready to be converted to FHIR + format. + + Returns + ------- + pd.Series + """ def fhir_format(row): @@ -127,6 +141,7 @@ def ingest_to_flat(cls, data: pd.DataFrame, filename: str): lambda x: cls.cleanup(x, json_data=False) ) + # flattens resources back out flat_df = data["fhir"].apply(lambda x: x.to_flat()) # Stops parquet conversion from stripping the time from mixed date/datetime diff --git a/fhirflat/resources/patient.py b/fhirflat/resources/patient.py index 1a130d5..4585235 100644 --- a/fhirflat/resources/patient.py +++ b/fhirflat/resources/patient.py @@ -74,7 +74,9 @@ def cleanup(cls, data: JsonString | dict, json_data=True) -> Patient: if json_data: data = orjson.loads(data) - # # Strip time from the birthDate + data["id"] = str(data["id"]) + + # Strip time from the birthDate if "birthDate" in data: data["birthDate"] = data["birthDate"].split("T", 1)[0] diff --git a/tests/dummy_data/combined_dummy_data.csv b/tests/dummy_data/combined_dummy_data.csv index 2988531..94449ef 100644 --- a/tests/dummy_data/combined_dummy_data.csv +++ b/tests/dummy_data/combined_dummy_data.csv @@ -1,5 +1,5 @@ 
usubjid,visitid,dates_enrolment,dates_adm,dates_admdate,dates_admtime,non_encounter_field,outco_denguediag,outco_denguediag_main,outco_denguediag_class,outco_not_dengue,outco_secondiag_oth,outco_date,outco_outcome,daily_date,vital_highesttem_c,vital_hr,vital_rr,vital_systolicbp,vital_diastolicbp,vital_spo2,vital_fio2spo2_02110,vital_fio2spo2_pcnt,vital_capillaryr,vital_avpu,vital_gcs,vital_urineflow -p1,e10,2020-05-01,0,,,,,,,cough,,,7,2020-01-01,36.2,120,30,70,120,5,,75,1,1,1,150 -p2,e11,,1,2021-04-01,18:00,fish,1,,2,,,2021-04-10,1,2021-02-02,37,100,40,80,130,6,10,85,0,2,1,200 -p3,e12,,1,2021-05-10,17:30,,1,,1,flu,,2021-05-15,4,2022-03-03,35.5,70,50,90,140,7,,95,0,3,1, -p4,e13,,1,2022-06-15,21:00,dolphin,0,Malaria,,,,2022-06-20,2,,,,,,,,,,,,, \ No newline at end of file +1,10,2020-05-01,0,,,,,,,cough,,,7,2020-01-01,36.2,120,30,70,120,5,,75,1,1,1,150 +2,11,,1,2021-04-01,18:00,fish,1,,2,,,2021-04-10,1,2021-02-02,37,100,40,80,130,6,10,85,0,2,1,200 +3,12,,1,2021-05-10,17:30,,1,,1,flu,,2021-05-15,4,2022-03-03,35.5,70,50,90,140,7,,95,0,3,1, +4,13,,1,2022-06-15,21:00,dolphin,0,Malaria,,,,2022-06-20,2,,,,,,,,,,,,, \ No newline at end of file diff --git a/tests/dummy_data/encounter_dummy_data_multi.csv b/tests/dummy_data/encounter_dummy_data_multi.csv index 286a771..d0ad22e 100644 --- a/tests/dummy_data/encounter_dummy_data_multi.csv +++ b/tests/dummy_data/encounter_dummy_data_multi.csv @@ -1,5 +1,5 @@ usubjid,visitid,dates_enrolment,dates_adm,dates_admdate,dates_admtime,non_encounter_field,outco_denguediag,outco_denguediag_main,outco_denguediag_class,outco_not_dengue,outco_secondiag_oth,outco_date,outco_outcome -p1,e10,2020-05-01,0,,,,,,,cough,,,7 -p2,e11,,1,2021-04-01,18:00,fish,1,,2,,,2021-04-10,1 -p3,e12,,1,2021-05-10,17:30,,1,,1,flu,,2021-05-15,4 -p4,e13,,1,2022-06-15,21:00,dolphin,0,Malaria,,,,2022-06-20,2 \ No newline at end of file +1,10,2020-05-01,0,,,,,,,cough,,,7 +2,11,,1,2021-04-01,18:00,fish,1,,2,,,2021-04-10,1 +3,12,,1,2021-05-10,17:30,,1,,1,flu,,2021-05-15,4 
+4,13,,1,2022-06-15,21:00,dolphin,0,Malaria,,,,2022-06-20,2 \ No newline at end of file diff --git a/tests/dummy_data/encounter_dummy_mapping.csv b/tests/dummy_data/encounter_dummy_mapping.csv index 0c7efec..0aa99b8 100644 --- a/tests/dummy_data/encounter_dummy_mapping.csv +++ b/tests/dummy_data/encounter_dummy_mapping.csv @@ -1,5 +1,5 @@ redcap_variable,redcap_response,id,subject,extension.timingPhase.system,extension.timingPhase.code,extension.timingPhase.text,class.system,class.code,class.text,actualPeriod.start,actualPeriod.end,diagnosis.condition.concept.system,diagnosis.condition.concept.code,diagnosis.condition.concept.text,diagnosis.use.system,diagnosis.use.code,diagnosis.use.text,admission.dischargeDisposition.system,admission.dischargeDisposition.code,admission.dischargeDisposition.text -usubjid,,,,,,,,,,,,,,,,,,,, +usubjid,,,Patient/+,,,,,,,,,,,,,,,,, visitid,,,,,,,,,,,,,,,,,,,, dates_enrolment,,,,,,,,,, if not , if not ,,,,,,,,, dates_adm,"1, Yes",,,https://snomed.info/sct,278307001,On admission (qualifier value),https://snomed.info/sct,32485007,Hospital admission (procedure),,,,,,,,,,, diff --git a/tests/dummy_data/observation_dummy_mapping.csv b/tests/dummy_data/observation_dummy_mapping.csv index b1a9feb..0d3ee82 100644 --- a/tests/dummy_data/observation_dummy_mapping.csv +++ b/tests/dummy_data/observation_dummy_mapping.csv @@ -1,20 +1,20 @@ redcap_variable,redcap_response,single_resource_group,category.system,category.code,category.text,effectiveDateTime,code.system,code.code,code.text,subject,encounter,valueQuantity.value,valueQuantity.system,valueQuantity.code,valueQuantity.unit,valueCodeableConcept.system,valueCodeableConcept.code,valueCodeableConcept.text,valueDateTime,valueInteger -vital_highesttem_c,,,http://terminology.hl7.org/CodeSystem/observation-category,vital-signs,Vital Signs,,https://loinc.org,8310-5,Body temperature,,,,http://unitsofmeasure,Cel,DegreesCelsius,,,,, 
-vital_highesttem_f,,,http://terminology.hl7.org/CodeSystem/observation-category,vital-signs,Vital Signs,,https://loinc.org,8310-5,Body temperature,,,,http://unitsofmeasure,degF,DegreesFarenheit,,,,, -vital_hr,,,http://terminology.hl7.org/CodeSystem/observation-category,vital-signs,Vital Signs,,https://loinc.org,8867-4,Heart rate,,,,https://snomed.info/sct,258983007,Beats/minute (qualifier value),,,,, -vital_rr,,,http://terminology.hl7.org/CodeSystem/observation-category,vital-signs,Vital Signs,,https://loinc.org,9279-1,Respiratory rate,,,,https://snomed.info/sct,258984001,Breaths/minute (qualifier value),,,,, -vital_systolicbp,,,http://terminology.hl7.org/CodeSystem/observation-category,vital-signs,Vital Signs,,https://loinc.org,8480-6,Systolic blood pressure,,,,http://unitsofmeasure,mm[Hg],MilliMetersOfMercury,,,,, -vital_diastolicbp,,,http://terminology.hl7.org/CodeSystem/observation-category,vital-signs,Vital Signs,,https://loinc.org,8462-4,Diastolic blood pressure,,,,http://unitsofmeasure,mm[Hg],MilliMetersOfMercury,,,,, -vital_spo2,,,http://terminology.hl7.org/CodeSystem/observation-category,vital-signs,Vital Signs,,https://loinc.org,59408-5,Oxygen saturation in Arterial blood by Pulse oximetry,,,,http://unitsofmeasure,%,Percent,,,,, -vital_fio2spo2_02110,,,http://terminology.hl7.org/CodeSystem/observation-category,vital-signs,Vital Signs,,https://loinc.org,3150-0,Inhaled oxygen concentration,,,,,,,,,,, -vital_fio2spo2_pcnt,,,http://terminology.hl7.org/CodeSystem/observation-category,vital-signs,Vital Signs,,https://loinc.org,3150-0,Inhaled oxygen concentration,,,,http://unitsofmeasure,%,Percent,,,,, -vital_capillaryr,"1, Yes",,http://terminology.hl7.org/CodeSystem/observation-category,vital-signs,Vital Signs,,https://snomed.info/sct,50427001,Increased capillary filling time (finding),,,,,,,https://snomed.info/sct,373066001,Yes,, -,"0, No",,http://terminology.hl7.org/CodeSystem/observation-category,vital-signs,Vital 
Signs,,https://snomed.info/sct,50427001,Increased capillary filling time (finding),,,,,,,https://snomed.info/sct,373067005,No,, -,"99, Unknown",,http://terminology.hl7.org/CodeSystem/observation-category,vital-signs,Vital Signs,,https://snomed.info/sct,50427001,Increased capillary filling time (finding),,,,,,,https://snomed.info/sct,261665006,Unknown,, -vital_avpu,"1, Alert",,http://terminology.hl7.org/CodeSystem/observation-category,vital-signs,Vital Signs,,https://snomed.info/sct,1.10444E+15,Alert Confusion Voice Pain Unresponsive scale score (observable entity),,,,,,,https://snomed.info/sct,271591004,Fully conscious (finding),, -,"5, Confusion",,http://terminology.hl7.org/CodeSystem/observation-category,vital-signs,Vital Signs,,https://snomed.info/sct,1.10444E+15,Alert Confusion Voice Pain Unresponsive scale score (observable entity),,,,,,,https://snomed.info/sct,40917007,Clouded consciousness (finding),, -,"2, Verbal",,http://terminology.hl7.org/CodeSystem/observation-category,vital-signs,Vital Signs,,https://snomed.info/sct,1.10444E+15,Alert Confusion Voice Pain Unresponsive scale score (observable entity),,,,,,,https://snomed.info/sct,300202002,Responds to voice (finding),, -,"3, Pain",,http://terminology.hl7.org/CodeSystem/observation-category,vital-signs,Vital Signs,,https://snomed.info/sct,1.10444E+15,Alert Confusion Voice Pain Unresponsive scale score (observable entity),,,,,,,https://snomed.info/sct,450847001,Responds to pain (finding),, -,"4, Unresponsive",,http://terminology.hl7.org/CodeSystem/observation-category,vital-signs,Vital Signs,,https://snomed.info/sct,1.10444E+15,Alert Confusion Voice Pain Unresponsive scale score (observable entity),,,,,,,https://snomed.info/sct,422768004,Unresponsive (finding),, -vital_gcs,,,http://terminology.hl7.org/CodeSystem/observation-category,vital-signs,Vital Signs,,https://snomed.info/sct,9269-2,Glasgow coma score total,,,,,,,,,,, 
-vital_urineflow,,,http://terminology.hl7.org/CodeSystem/observation-category,vital-signs,Vital Signs,,https://loinc.org,9192-6,Urine output 24 hour,,,,https://snomed.info/sct,258861009,Millilitre/24 hours (qualifier value),,,,, \ No newline at end of file +vital_highesttem_c,,,http://terminology.hl7.org/CodeSystem/observation-category,vital-signs,Vital Signs,,https://loinc.org,8310-5,Body temperature,Patient/+,Encounter/+,,http://unitsofmeasure,Cel,DegreesCelsius,,,,, +vital_highesttem_f,,,http://terminology.hl7.org/CodeSystem/observation-category,vital-signs,Vital Signs,,https://loinc.org,8310-5,Body temperature,Patient/+,Encounter/+,,http://unitsofmeasure,degF,DegreesFarenheit,,,,, +vital_hr,,,http://terminology.hl7.org/CodeSystem/observation-category,vital-signs,Vital Signs,,https://loinc.org,8867-4,Heart rate,Patient/+,Encounter/+,,https://snomed.info/sct,258983007,Beats/minute (qualifier value),,,,, +vital_rr,,,http://terminology.hl7.org/CodeSystem/observation-category,vital-signs,Vital Signs,,https://loinc.org,9279-1,Respiratory rate,Patient/+,Encounter/+,,https://snomed.info/sct,258984001,Breaths/minute (qualifier value),,,,, +vital_systolicbp,,,http://terminology.hl7.org/CodeSystem/observation-category,vital-signs,Vital Signs,,https://loinc.org,8480-6,Systolic blood pressure,Patient/+,Encounter/+,,http://unitsofmeasure,mm[Hg],MilliMetersOfMercury,,,,, +vital_diastolicbp,,,http://terminology.hl7.org/CodeSystem/observation-category,vital-signs,Vital Signs,,https://loinc.org,8462-4,Diastolic blood pressure,Patient/+,Encounter/+,,http://unitsofmeasure,mm[Hg],MilliMetersOfMercury,,,,, +vital_spo2,,,http://terminology.hl7.org/CodeSystem/observation-category,vital-signs,Vital Signs,,https://loinc.org,59408-5,Oxygen saturation in Arterial blood by Pulse oximetry,Patient/+,Encounter/+,,http://unitsofmeasure,%,Percent,,,,, +vital_fio2spo2_02110,,,http://terminology.hl7.org/CodeSystem/observation-category,vital-signs,Vital Signs,,https://loinc.org,3150-0,Inhaled 
oxygen concentration,Patient/+,Encounter/+,,,,,,,,, +vital_fio2spo2_pcnt,,,http://terminology.hl7.org/CodeSystem/observation-category,vital-signs,Vital Signs,,https://loinc.org,3150-0,Inhaled oxygen concentration,Patient/+,Encounter/+,,http://unitsofmeasure,%,Percent,,,,, +vital_capillaryr,"1, Yes",,http://terminology.hl7.org/CodeSystem/observation-category,vital-signs,Vital Signs,,https://snomed.info/sct,50427001,Increased capillary filling time (finding),Patient/+,Encounter/+,,,,,https://snomed.info/sct,373066001,Yes,, +,"0, No",,http://terminology.hl7.org/CodeSystem/observation-category,vital-signs,Vital Signs,,https://snomed.info/sct,50427001,Increased capillary filling time (finding),Patient/+,Encounter/+,,,,,https://snomed.info/sct,373067005,No,, +,"99, Unknown",,http://terminology.hl7.org/CodeSystem/observation-category,vital-signs,Vital Signs,,https://snomed.info/sct,50427001,Increased capillary filling time (finding),Patient/+,Encounter/+,,,,,https://snomed.info/sct,261665006,Unknown,, +vital_avpu,"1, Alert",,http://terminology.hl7.org/CodeSystem/observation-category,vital-signs,Vital Signs,,https://snomed.info/sct,1.10444E+15,Alert Confusion Voice Pain Unresponsive scale score (observable entity),Patient/+,Encounter/+,,,,,https://snomed.info/sct,271591004,Fully conscious (finding),, +,"5, Confusion",,http://terminology.hl7.org/CodeSystem/observation-category,vital-signs,Vital Signs,,https://snomed.info/sct,1.10444E+15,Alert Confusion Voice Pain Unresponsive scale score (observable entity),Patient/+,Encounter/+,,,,,https://snomed.info/sct,40917007,Clouded consciousness (finding),, +,"2, Verbal",,http://terminology.hl7.org/CodeSystem/observation-category,vital-signs,Vital Signs,,https://snomed.info/sct,1.10444E+15,Alert Confusion Voice Pain Unresponsive scale score (observable entity),Patient/+,Encounter/+,,,,,https://snomed.info/sct,300202002,Responds to voice (finding),, +,"3, 
Pain",,http://terminology.hl7.org/CodeSystem/observation-category,vital-signs,Vital Signs,,https://snomed.info/sct,1.10444E+15,Alert Confusion Voice Pain Unresponsive scale score (observable entity),Patient/+,Encounter/+,,,,,https://snomed.info/sct,450847001,Responds to pain (finding),, +,"4, Unresponsive",,http://terminology.hl7.org/CodeSystem/observation-category,vital-signs,Vital Signs,,https://snomed.info/sct,1.10444E+15,Alert Confusion Voice Pain Unresponsive scale score (observable entity),Patient/+,Encounter/+,,,,,https://snomed.info/sct,422768004,Unresponsive (finding),, +vital_gcs,,,http://terminology.hl7.org/CodeSystem/observation-category,vital-signs,Vital Signs,,https://snomed.info/sct,9269-2,Glasgow coma score total,Patient/+,Encounter/+,,,,,,,,, +vital_urineflow,,,http://terminology.hl7.org/CodeSystem/observation-category,vital-signs,Vital Signs,,https://loinc.org,9192-6,Urine output 24 hour,Patient/+,Encounter/+,,https://snomed.info/sct,258861009,Millilitre/24 hours (qualifier value),,,,, \ No newline at end of file diff --git a/tests/dummy_data/vital_signs_dummy_data.csv b/tests/dummy_data/vital_signs_dummy_data.csv index 4f96f3c..a361b93 100644 --- a/tests/dummy_data/vital_signs_dummy_data.csv +++ b/tests/dummy_data/vital_signs_dummy_data.csv @@ -1,4 +1,4 @@ usubjid,visitid,daily_date,vital_highesttem_c,vital_hr,vital_rr,vital_systolicbp,vital_diastolicbp,vital_spo2,vital_fio2spo2_02110,vital_fio2spo2_pcnt,vital_capillaryr,vital_avpu,vital_gcs,vital_urineflow -p1,e10,2020-01-01,36.2,120,30,70,120,5,,75,1,1,1,150 -p2,e11,2021-02-02,37,100,40,80,130,6,10,85,0,2,1,200 -p3,e12,2022-03-03,35.5,70,50,90,140,7,,95,0,3,1, \ No newline at end of file +1,10,2020-01-01,36.2,120,30,70,120,5,,75,1,1,1,150 +2,11,2021-02-02,37,100,40,80,130,6,10,85,0,2,1,200 +3,12,2022-03-03,35.5,70,50,90,140,7,,95,0,3,1, \ No newline at end of file diff --git a/tests/ingestion_output/encounter.parquet b/tests/ingestion_output/encounter.parquet new file mode 100644 index 
0000000000000000000000000000000000000000..203d4cc86493747d6e62c6c6bb30ff594af23ac9 GIT binary patch literal 18587 zcmeHPZ)hXunIFlH*SpzlHoH5DoZH(fzTK&}URyKzV@u2Cj*)H2a%8QowIy4nw>Q#^ zE!m@yqCb|cG*HS?N(dz^rIfN9p&vphp_EcexPx-shknY3a2(+%AsnS2j#5e};RvN4 z?s?w%qmeWt+gW+zlv@(*%)HP0yuau7{CWS(BrAN1qlf7i=xYhuNmG|7>KXPdMNtb% zzT8miQkACYOSJ12JxX7sN9d8OGt|UcqtV^Igg$3IPWT;lI7TVvr;e-i`LQ51a&gRk zzADwqjcQ(6JE%ys(>3b)XNT);2NPjN?bVIl0$3F7q~PZw{6$f4&(drHkRVL=6@13A zPmQn~x$==;KZswU-5q$HoZs!_SP(yV9fd|6&Y|B?%%?+bh|Z!&fc?k;*q9#=F&{b+ zn%>hA%tHrVNFYc+6aV3hptw-4S8BJt-kMS_N#b--*(!T$`T95gvs@_TXIa)m?cAW; zzCb9*va^8n8a>{D^u1dik>4T5&;{oGGls39b9q^))u!`hQ5u$u zwfY%JmP(RRCj@+Z$n}wv`QN9QPll0zfdH~(?+pk2z@gI@6+Hm^Zlhii>P3L!sZ`5( zNo-Ul&jkJScW+*U*2QwIQmhNIM-WTJTCG@CJQIf4O$#zT*819g+}HZ+v}E$X&-XA{ zm-g$uOn&Bced=W1e46>gS!D9@(OA-0pc8sV1W4kps0@0!>1Qa)2R{Kn=Cl4dV38kZ zjYhX|$w8lvZ291#VYmBCt&!W2@@mBV(c${g!Tk1F=Fb;MHd`vQ#QwH}ejP$U&F6b} z&cTpA*2hkGxq%T;1tJtxY^r&xVht-VrQnfpvT9l z?H0ye29@1n?sM&Ny6t@HO54s$LcZP*Y3{kkg*e_YHTX*A7FucbvFIp}YJKSxMKRNg$SBKl@omQ!v}*8^Ob4To7a>`|7?)zL#= zKU#G(6db8KUc6(jj?SSsDdzR71}~w%GwRELUmi3FV-sMTComA$EI$6Fy#8o z$^85U=HVFfW?MIp1J#n`UpVM@4U)6(jkj~#W55+Xf?V8#+PPI!#GX3 zFFiO?E$ya$T+8Y{W4JGn5sWcmNDHPve(~r`Ti4a^47t8N#JuBSKDvfXe;#C(#D3pF zZ|U>VQs3b9g*g8##4B69L0z4Ni5&ykVgj-a0z-k@UPRXmH1mi6I$KI`*^W?)GfGH##+7rztqty}bYeX%|~ zan_J@kLn7b|CPiL{0?1oJcWP#>|dXzoI?*N<`HW!^4_yzQP@_X?A7jz5MZ5)#k2V` zlol+A9V?$R@Hj73>Lakagi!2g8^#qUEHcl#en%#mM_%S<(^qF~AX-xLfrH-DIVr3? 
z2hiswWxD~D4r*VyDoWLx*T?AjX0cxIG=-`H5InWwwo?1s3F@me6VzzAQO`|ce09_G zRZS6g1)hQ6(D z5C5j{JQ!@V2?1gm`fDBW_q>g$+pYn$Uta7EPTW%d=k1eYbEI}qt4k$9=69T~@4@OW z%=|ipWZL1hH^7oYH+@y7@cb>z=1`b}DTEDLa5k#9xs9tHR^Ix_ISN_%);}1mywf*h zhZ^KE^ZrZ7f&s(?$#ur*d^mO_7aRszm|Pq4QX7 zWa5n0fN$zm4s7Y%Z;VXXo^t7Z?00&)1PrY2KgewIZxoX7@jZit z&?TcHO+#(8Suy|8>H3wE`EH7NYZW;d1Tah1POxrvlY5#fg{70;$=W)}dOiMd*!5_b z`S%R-MjD-TnhdmZ_Sj#7LCQS>hYM0kps!K%`5Vs8Q}P>B;wl6%X@Ik7?KbFin+|Uw z$3_x&Kq@%wCJSvmEJscHfo@MPHL;J@OVy+u^qJvU9EzEndybXbD2{TnatxymQ+FI| zf!RL$=wpDg(Q-^I-3vJ@003EcG(AkgU3*J*n79+g^{pC24cuvq!NYovs1ZiCa%QjV z3lsA*ZE|e;A6KA<4e(^Zkl;Ra4s0X0m?^9bvB|!!-p7dr*XY>QvAUz#&^=9C^$GpGH~mpR49si*p2llkj7C;T!W(4DSh zH0J+Ljnq$mm@n0F&67D{Ii*J!%wPAW-SsKw8f8XZzwut7sIgn_k&2**Le2f)@~h+U zl3m)rpD)Xel2RML?fLqv$Y{$0bF5aff}^*P>IQ@z+dXI#G<9KkGwH9e8?L9}5}s_S!7~s98H+6njTh zpAB}L_P27q<8`)mjf-E_eO;sf8`xJloc|sx)e&&RXg9jvb`qm&hd;eTn;N>tJzl)^ z*xT_*lA0tvab+!Yf;iyQs9p*>P-qeZ#-M-9VE-AC1+1fW>Owk@ST#Pru*RzgYG;u+ zVyBb9Yya6yFRlG$pBy@C60OncvL}JgcyXeB8PZFbg@6?fbr$;E<EYpQ!AW7PZMh}tBoV1o16dJO$sY*hGIfu3<1W`lfx@M!C=Psnp%fZS|Fk`@g2W_{YDU;(2}* z{>48p^L&7hZ1Yi`-{$AG$+JkLqrHg_7mW5qb4k62U!sMy$9cZVLqGT%mCUWrK^xq| zlU0z9%*IP{BO6NxHhjsw++w<(Qyc{{Ew6>=p}ju)HT@lvpm z%dDeJL(J^g5_5bp?@tx-$|^kLo7(+eel9)}pI@!T=ayPxEZsys%Nv}C?VG-EP0VnO z4WBx8cEcF6v6qxERkb~nW;Zjbf*4yc{IhkhxedNFcm;p&s($Rn zC1g}R`9bp<3v#*GemF6Q@z&;R)3;y9`F&5dL$QEKyGg)bMA zTVk}%2SO&in^2PM=Ef3h7l)huD{SlINZxXsM|YIb9(lJl)LppK|{_T=#H<}z|C@00g(#o(^6k=zkt(Sxin zy#UwEoIhC+W2BAbhduYHI>@)cPmM$=RmuC3W$=}fU59IbWDoSSYTFl>kC`K4H07d| z&$2NNY)XQ^0x*xrtxaE{kcW9L1=smlxFu}BJOFK`yl-xTU!fkEYj6$tVl#fdR;NX6 zp5$^Hk)U~Q%K4UF$wF+a?VK#eqM$2PFy^~OwuW<%DXyTFq~x5UVg&!yVNO=oV}R4Q zen{PfS`|w-w0HoYaWJQes}K{d>^(Basq?v2e3tfsmsHNE1!;DG^G{z~Y<8nyTNkhm z@=A?O1$>i%^#d1y_)N+PldrAn+DX=3h@C>#7sT9XZNS_E{?gV;y>qOsY#%@9aWlv{ z&5Q%*R#O*iRyAff{oLc11u8@f;xf13HnZq%MjM|VZCFT6$>S4KK%+~|VnGRec z#)fI_aF`w4@?T^uC6qMgYX6)t#a@qmpO_j*Zl|GM%`NS~x);`*;RCE=u-2>znWY1m zGlF;@S0X{UKFIm=q=ty4^nqHFaQzzJPu>eO6FYnn*Qp?@uYr9qkCHqmI(FiX2 
zS6JM+;J%3G7d2Vf=FH<(C%U=_zxfDUH~CGTkFCbwK0svget8@AQjzrt?0*8dL0Vje zy%QhDy-*alQOl?#QidOOf3&g_@OndVx3CNyVe3RTVc19AU(J!Ld-&UtOD3NovdI;% zcX>tedQtuAI@y^au=Z9?8xL|Pn^EnHp**(}e2nqQ91ogTC%3%b-D8d~q5P~dev}wJ zPoS&zBYa|iq$Ag09dBO)un#{MLOfmrPjelg;>K0ugKSOFkvxywoAoERV4n)}**YJ> z{;l~Ljh`gpQ}u_64_&pein@q?qSVB1e5LC3`i%anf7L)~`Nfq@5qP!$FyL@oZ{ehBAz3_2Vyfc;7Q zuw*lK8x((-!Q`H&vJWIQ~W(ea{841d(fQpYDoz#N^fqaZ)O2K~dVKE92A zNId2Jus0_CB6*ZoaC}s6SC7c}WW5i2?_Iut@*y0L)I*#2A^5bq1G-;sfxNe-Ys1HW4T?&6^8p&fDfGTzSNZv@V!jGpMO!m ziJw`-bI^iRFwJ-KB;VTkr}DLfUEz%BID14siFo{Ly8eFg(~j>b;+f2QA+=#ho!?y? zU(E`)G!^&O{GHkk}{v*C3qx#XqSJEit3+ z@%X!h&l+?h__g>aq^kG{wODl3rA16T=Ti4EU?k+6H=Nh|+vIn5d@Z1S0didRzZzd~ z_zQp9^(;p2%kT^J4EBTh19d+L_bm3W8~u;AUQtj!jCjrbnCclB4^c!a=$9tS@8h}) z&!sncDAgq7!5GpB1vPuS{Y~V$&sm9j=nA~5@k$=N^b+OvRD|j)4XHkruT-YUfxSmN ckoTxZ_3*ti{I?6A`ya{!|L>6%_}>!!ANkO&s{jB1 literal 0 HcmV?d00001 diff --git a/tests/ingestion_output/observation.parquet b/tests/ingestion_output/observation.parquet new file mode 100644 index 0000000000000000000000000000000000000000..91e7524a1538ddde4a3279bd8ecf1fb80423ec90 GIT binary patch literal 13457 zcmds8YiuLebskc$eQ4M6MnhAvydKl5iPkHL!x>&17iYlw6#zI+MT2N~xcSJ3Sw;P%+R@`b^viZEAizT(zcxhkK1$kbRN{R+t zKpS1+Mo)&a^MvSts1F5+k}2{WLRH@xYpFVTp()ow9^@XDB~2d_Ww9bEI#K+0haKM^ zX1_Pg{@RYJKOtBrCZ6Sj_gQEXmyEeRV9CyRbUQmMmdbm5+)8=6c&PU)?;V@tXEyfr z^Xz-)h}{lkWGTiry}1TngpT%<{~Unva$atp^T7AQ#ZCC}!q;qvq7S}ckXPv0E`vM= zv&>#6mQ4yk?$Wa1efINL;%{D=fR$&hy`H1EMu&>!UH7rzFbi{5$vRN@V4A`tR(!LNOja{4?0kdJvR^o*vc6|t}& zDMi(-<@J{WoZq{|aUSIg#dsDx+|6sAz~$W)B)k}!_a^@uN}iB+ei2l-OfPq-@NyGj zFHvKz)7k8AOJD#2;aro3_l=CEhlbBn6A!z*zUA$mV)5V`uYLP3aP`<9(Jhq+&!QI& zaK%3vbNq0aeeVnGD-WaIe|uDxEAww7^jjm%mHAKm-%J3suv%y^5D2ScP0;H#5#g?g zLQQ9&F5-xUs8uDXKwuo1eC-iv!O*G7Ql81ls#;*G5Xv=}iVM9g3EIXqH5Y*Lt;0r1 zR2a>+`xUZtFp|RXHHgQOAYWM1a}8#*E^8vA9)j8FwZ=5ei7RD677L7emY>e6N*-cV zZDg5xYJ|D_{d`RnH0a9?nY@swx^7J)`A;9X!<<5HR7g0<2&(pyseD20LFutke)w%+P@Ah!l!KFO- z@LB}&@s5#RXJ?HYh@WlaKPxsF5eZ(gJO0JS{__m`>I_QINv@Z;r5tJcnKgQjgz>o_ z+IDR+SgJ 
zw2!^-v{PQkrCGBC(mYxoh3A~@5;bfa8Dn1KUl0BHm4IXKjFnyKO! z_64~vZq)_ilMAFq1pLJ2_|(R}ah3hn?6d_BOQC*Yr$2>Eah!FF33_*5^D_MeU-7fCLk8+<>D3U4u*ksj6>@ptgv>g zxGvW6^;%;L+BcyHb~{5|crqwUdQEhjF+yX)RZL)K>AAL=7AI%N%o)zh_eKD;C)Y;c zdBrwz@%f=K>dlo8-udR0zjv^&xZCz{satV%MKVnLjbX>TL+l6h?9Z-E-w#tuh5q3X z{a#CJ{_V5zu_lXOSkvyPxm-PmMp`efBmR^~mCj796PhVJ# zmMCYX^7KmS*++5=Ma>*N4~fB|hvPyPihH8DKU-2piq1!@QOyk2<#+r8b%}d8vBCcu zcKl?Beb3AOD~G!LEI2IH`yY1tN1)zw#9sf(cIOFGy_ik)%IaJ4af^!}D`;^XGNFJn zLowbZPap(Z^J4{6jQJtw#fm?`1^t*Xfj;MGuEP))2g}i#n1-_HJ1sHNNB6tPo0lnJo+!ls=UtNMTp>62l{+e7R}OYEO6PA{GmPD{=I zqmBM?OY^0>UmgMxGNOj^Y^Z4E(Yh=*AT3a|yrkB(b^`J!bYj+|Dg}+vRc0TPkEzG0 zFJ6C~@>ekjY2_*wsviO-&Z&XKf%ZV9e){F#q*UuY|J5PKukGxIzsJ6L1C{!l5Ll{f zf}U(1z|UYN2cNsY58^rKFSaeqd8XOZE;(+@TC~Er^RvV73{pd5wo!_GJ3LLFYhE4@ zgNehIS`ur-4S#BLylZ3sjA!qDWx7qeT^hi|N|LEh9}Q5>Q7vB(t+w!)bAa*8+%xzDossz#J64kr>anLAgK%5+!BONsbYV^8 zzI1aA&85>oDMxr}dg`Ooy8inAeb>W2pv#z3y%qUgIRCT%R}O&eNgv7B4^QcgbV{d` z2b{yRpL?47yPgcPpIct_pt2sv&pAh!J>z)yDja90=A5HdK`98D^N${W(FM0^;^A|7 zRjyYQ&2^o5`bF0sv_Z*L6O@wZnqypwP!XX)fs7bxnhy7eP~}KjtwPb~BV`fGo|m76 zib3!>LR~cJc9VeSjkQ#sEbQ#*q(VES_p49)J-z|i-4q92FlkRB}eXD9ud02?Fc(SGh|eyy$^t|zk! z$e^j>ZWAQDn_Z{D+q@6$m%5{)hwo?9bn6DQpJ!Ls$#U(!CnL3f{sRW}ASsA>sUpZY zT{%uff|nF-86*ube zBvkMbJ_r@v?kdnQ8Y9UFl*GnrHCIW!Kb6&dI!iwtDzq3Ej_hxfu6GIjr~G;bvK(d_*vo~FfJd9 z*0ZtbQ76_yDp1XMLwil#7gC-BF&W~-beOC`yb_XxN^-XlA#0;&K@On7WmLT$CFffWtP!2;NGOjL&Rk4BDn#{` z?$f5NoHSm*<2Pl!H(mv7dS&Z#!JAEGD`;OrIx8n&Ex;D4Xww61L59(C#+RrTDqv~p zgq)1YXy>P5-^6EEzz0_}sl#rGzjUr~pU)AWO~{>N9=si(9|Smv7pCuKtJ&1yUfSD? 
z8ED%*W0Y_i;#)cI>GYYyYTlQ~wR|hdk+lKc==l}7GnVXUDpgn;!;W&PB$r7gz$b3D z;#2YV0mh8B&nO$7&N&u(JSwC*;*S+RC0Q3z7=sL(IT2iyB>K812Y!*4h;3Q;Pv4jm zkFNvl&zngdda}u=2jV&SO(L+SWXrjgq#48e*SS!MYK2rXQAkDWg_z8Vw*v-^!G5y` zJ_&vp)qcx;;UEujR!9R4uOIgjj>JOmMmcBfD~H8&pI=$X1#6o&?8(A5?m6Dd-&!_& zg7MZuw;lWGvDTtJ{z2KdX)jyo|EZ{yizSzH-oS1)9p*Y?)1 zO|KpasZbsB#rW<`&)P0uUx)m#J2%S8d8HLAq;<*PfN@fhgaR=R-Za37z%OE;@5S1x4 znW|zvhFvT7@Es$Mg+a@N-sYyyly8R+1>xaCiljt@g#9I(;z)fJ$9uO!rhX0lz4{HK zE=_%O?rBZl(OsB-E#rps7!C|42x(NJDG#aN#-~Lr#vB{KMAmWsQ44>bfLs@vHdArXWW8l4wH>th*7< z(b2Uo{q(Ibk_xx-AU?7U<3oE*e&+fTqcE(RtS>Q2qUspQjgY6mfJ~z}|JqJ-zNt?# z-k6Vr4EuuILe3HOgo8MKe{-qJzKIfy^*5HDOeOZ+nIl7MM8o+@t@+2Z9}kW{+(vUo zNCWH#-J6qthQc?1$)!tk*J*k9C~mjJ*RIdGslF5q!$<@8jXm;`qfD z{&l#N=#DSQFHBq$?=bkjvnKYhB7S!UA2|AW>$S||c#sU@PzmotR?HBv)pnWz`ZVHQ z@7WyT;Zz_=luGK9lc@@|7wV!upRZQu$;A$1UfeLomCn>h@Q-GW`K7+`B>d}1_|HiH E3p#rX_W%F@ literal 0 HcmV?d00001 diff --git a/tests/ingestion_output/patient.parquet b/tests/ingestion_output/patient.parquet new file mode 100644 index 0000000000000000000000000000000000000000..ae07ae6f80193efccb9217faef14dbbeaf63feb2 GIT binary patch literal 2182 zcmb_eO>g2x7#>KJrELz;Dn?M{fUGXcfuz_#c3HKR1`>!V$p#W^aJ7p36)@Pw^#=}# z@&~G_haP(9sp_TI{)YaF{(%0FzB48?Y|>NKlE?FY%=13;%#7_b+~t^cCe3`_W|9ng ziO^@eFAzd!N?;mJz#!%&lP)kRW{cTiHg*os?u$hzbKimNPjX3UQ;9?h<$q7?Fq_#5 z+StlwHa)|ihMs0z%w2;?rc>#k6X{CryboWk}v;GAk?-+B;B@1^|6wrc>FSY~f?8@|>xpe}jd{r_z5W zw-L$~G8?Yq=!&2DWBryj9o@LT)TUNwJ3jk1_uVZ!0Uz{OPjQ9@Tgb7FVjJLaLdycn z4-_u|vq<|>7rJHIHAt7cdk;p8cS*sG1?(!ym1$V|<*F~Hbs50c0VFnbOl_*i3_{zT zF9TC`Y-j;tW*dQ`D}e&(x2)kny6Ftb*xp5B?+ttRfXpfj_TfGyNGp6EMW4lt=yq2C zf{WR*rvA8M?0bG4s=j9^!PKLoSf=VJ-kfOS&K1v_Mx^%4@O*RX5HDBRFYXsvAjmGN zKM3;sJc*&qChm6MNpIuBkoht7CAlJeh%xTKO|EY-F5}t|H!vRJ`jGl+wUu~8oJY%e zBdW(Xv=oKL1&kvMLEO@)W5u z5ysP@;Iu}nty_W_4F$V0Ql%cr33ch(Z`ZM@mAWI%>3|PM3wx&3g#+QR;|ulHjouU^ zf^*X6bP^x%RbQ95u+PV}<^Hl}IFLAt$#!bJ}t!Pcc z37fi9aRGlKp9`cmaO5IGtQL)ajp7P+)l_V8tk-Egko(;cU=#=E!1vfUWvMb&`L3(l 
z8pRXEZnJ$cG_OA3uFWC85rBU8yg&zAyirTye87u6T94}WA^-o$b84V5=*T_b>WHBt zoz7&K8_^9k3AAJDN9xd1B>O304S}t{k^`6A* z_?VX>#W>o>*0ovT)aG@y4_pNMbS_JcqFNFNN6_!u9l-w5oC(z%<_G=OOrO*B;pKko zMv=pi}O z$k79%R_jt{kF@lBP&3X4T`)d5gv>V`Tlw{ZYno-f_#) z_$sH_r{fV>}%@z*Gma~1FV&P G1Nj$bgJu!{ literal 0 HcmV?d00001 diff --git a/tests/test_ingest.py b/tests/test_ingest.py index bfcc941..f304bf6 100644 --- a/tests/test_ingest.py +++ b/tests/test_ingest.py @@ -13,7 +13,7 @@ ENCOUNTER_DICT_OUT = { "id": 11, - "subject": 2, + "subject": "Patient/2", "actualPeriod.start": "2021-04-01 18:00", "actualPeriod.end": "2021-04-10", "extension.timingPhase.system": "https://snomed.info/sct", @@ -115,7 +115,7 @@ def test_create_dict_one_to_one_single_row(): ], }, ], - "subject": "2", + "subject": "Patient/2", "actualPeriod.start": "2021-04-01 18:00:00", "actualPeriod.end": "2021-04-10", "admission.dischargeDisposition.code": "https://snomed.info/sct|371827001", @@ -287,8 +287,8 @@ def test_load_data_one_to_one_single_row(): None, "Final diagnosis (discharge) (contextual qualifier) (qualifier value)", ], - "subject": ["p1", "p2", "p3", "p4"], - "id": ["e10", "e11", "e12", "e13"], + "subject": ["Patient/1", "Patient/2", "Patient/3", "Patient/4"], + "id": ["10", "11", "12", "13"], "actualPeriod.start": [ "2020-05-01", "2021-04-01 18:00:00", @@ -390,8 +390,14 @@ def test_load_data_one_to_one_multi_row(): "Heart rate", "Heart rate", ], - "subject": ["p1", "p2", "p3", "p1", "p2"], - "encounter": ["e10", "e11", "e12", "e10", "e11"], + "subject": ["Patient/1", "Patient/2", "Patient/3", "Patient/1", "Patient/2"], + "encounter": [ + "Encounter/10", + "Encounter/11", + "Encounter/12", + "Encounter/10", + "Encounter/11", + ], "valueQuantity.value": [Decimal("36.2"), 37.0, 35.5, 120.0, 100.0], "valueQuantity.unit": [ "DegreesCelsius", From c524efe8d78335e3d41f3ddfde6f3016bd76fe08 Mon Sep 17 00:00:00 2001 From: Pip Liggins Date: Mon, 20 May 2024 13:06:02 +0100 Subject: [PATCH 15/21] Add race extension --- 
fhirflat/flat2fhir.py | 2 +- fhirflat/resources/extension_types.py | 4 +++ fhirflat/resources/extension_validators.py | 5 +++ fhirflat/resources/extensions.py | 36 ++++++++++++++++++++++ fhirflat/resources/patient.py | 34 ++++++++++---------- 5 files changed, 63 insertions(+), 18 deletions(-) diff --git a/fhirflat/flat2fhir.py b/fhirflat/flat2fhir.py index ee26a4f..925a4f1 100644 --- a/fhirflat/flat2fhir.py +++ b/fhirflat/flat2fhir.py @@ -17,7 +17,7 @@ def create_codeable_concept( """Re-creates a codeableConcept structure from the FHIRflat representation.""" # for reading in from ingestion pipeline - if (name + ".code" and name + ".system") in old_dict: + if name + ".code" in old_dict and name + ".system" in old_dict: code = old_dict[name + ".code"] if isinstance(code, list) and len(code) > 1: new_dict = {"coding": []} diff --git a/fhirflat/resources/extension_types.py b/fhirflat/resources/extension_types.py index 3eea251..ba077be 100644 --- a/fhirflat/resources/extension_types.py +++ b/fhirflat/resources/extension_types.py @@ -51,5 +51,9 @@ class birthSexType(AbstractType): __resource_type__ = "birthSex" +class raceType(AbstractType): + __resource_type__ = "Race" + + class dateTimeExtensionType(AbstractType): __resource_type__ = "dateTimeExtension" diff --git a/fhirflat/resources/extension_validators.py b/fhirflat/resources/extension_validators.py index 7e96460..0658ce8 100644 --- a/fhirflat/resources/extension_validators.py +++ b/fhirflat/resources/extension_validators.py @@ -65,6 +65,7 @@ def __init__(self): "Duration": (None, ".extensions"), "Age": (None, ".extensions"), "birthSex": (None, ".extensions"), + "Race": (None, ".extensions"), "dateTimeExtension": (None, ".extensions"), } @@ -230,5 +231,9 @@ def birthsex_validator(v: Union[StrBytes, dict, Path, FHIRAbstractModel]): return Validators().fhir_model_validator("birthSex", v) +def race_validator(v: Union[StrBytes, dict, Path, FHIRAbstractModel]): + return Validators().fhir_model_validator("Race", v) + + 
def datetimeextension_validator(v: Union[StrBytes, dict, Path, FHIRAbstractModel]): return Validators().fhir_model_validator("dateTimeExtension", v) diff --git a/fhirflat/resources/extensions.py b/fhirflat/resources/extensions.py index dad428b..006c8c5 100644 --- a/fhirflat/resources/extensions.py +++ b/fhirflat/resources/extensions.py @@ -417,6 +417,42 @@ def elements_sequence(cls): ] +class Race(_DataType): + """ + An ISARIC extension collecting data on the race of a patient. + """ + + resource_type = Field("Race", const=True) + + url = Field("race", const=True, alias="url") + + valueCodeableConcept: fhirtypes.CodeableConceptType = Field( + None, + alias="valueCodeableConcept", + title="Value of extension", + description=( + "Value of extension - must be one of a constrained set of the data " + "types (see [Extensibility](extensibility.html) for a list)." + ), + # if property is element of this resource. + element_property=True, + element_required=True, + ) + + @classmethod + def elements_sequence(cls): + """returning all elements names from + ``Extension`` according specification, + with preserving original sequence order. 
+ """ + return [ + "id", + "extension", + "url", + "valueCodeableConcept", + ] + + # ------------------- extension types ------------------------------ diff --git a/fhirflat/resources/patient.py b/fhirflat/resources/patient.py index 4585235..ac06f5b 100644 --- a/fhirflat/resources/patient.py +++ b/fhirflat/resources/patient.py @@ -1,10 +1,7 @@ from fhir.resources.patient import Patient from .base import FHIRFlatBase -from .extension_types import ( - ageType, - birthSexType, -) -from .extensions import Age, birthSex +from .extension_types import ageType, birthSexType, raceType +from .extensions import Age, birthSex, Race import orjson from ..flat2fhir import expand_concepts @@ -16,18 +13,20 @@ class Patient(Patient, FHIRFlatBase): - extension: list[Union[ageType, birthSexType, fhirtypes.ExtensionType]] = Field( - None, - alias="extension", - title="Additional content defined by implementations", - description=( - """ + extension: list[Union[ageType, birthSexType, raceType, fhirtypes.ExtensionType]] = ( + Field( + None, + alias="extension", + title="Additional content defined by implementations", + description=( + """ Contains the G.H 'age' and 'birthSex' extensions, and allows extensions from other implementations to be included.""" - ), - # if property is element of this resource. - element_property=True, - union_mode="smart", + ), + # if property is element of this resource. 
+ element_property=True, + union_mode="smart", + ) ) # attributes to exclude from the flat representation @@ -47,9 +46,10 @@ class Patient(Patient, FHIRFlatBase): def validate_extension_contents(cls, extensions): age_count = sum(isinstance(item, Age) for item in extensions) birthsex_count = sum(isinstance(item, birthSex) for item in extensions) + race_count = sum(isinstance(item, Race) for item in extensions) - if age_count > 1 or birthsex_count > 1: - raise ValueError("Age and birthSex can only appear once.") + if age_count > 1 or birthsex_count > 1 or race_count > 1: + raise ValueError("Age, birthSex and Race can only appear once.") return extensions From 9fa116fdcbea583eb27a400d0de9fb4b92d0dc30 Mon Sep 17 00:00:00 2001 From: Pip Liggins Date: Mon, 20 May 2024 14:51:27 +0100 Subject: [PATCH 16/21] Misc fixes, add presenceAbsence and prespecifiedQuery extensions --- fhirflat/ingest.py | 33 ++++---- fhirflat/resources/base.py | 4 +- fhirflat/resources/condition.py | 41 +++++++++- fhirflat/resources/extension_types.py | 8 ++ fhirflat/resources/extension_validators.py | 10 +++ fhirflat/resources/extensions.py | 72 ++++++++++++++++++ tests/dummy_data/encounter_dummy_mapping.csv | 2 +- .../dummy_data/observation_dummy_mapping.csv | 2 +- tests/ingestion_output/encounter.parquet | Bin 18587 -> 0 bytes tests/ingestion_output/observation.parquet | Bin 13457 -> 0 bytes tests/ingestion_output/patient.parquet | Bin 2182 -> 0 bytes 11 files changed, 153 insertions(+), 19 deletions(-) delete mode 100644 tests/ingestion_output/encounter.parquet delete mode 100644 tests/ingestion_output/observation.parquet delete mode 100644 tests/ingestion_output/patient.parquet diff --git a/fhirflat/ingest.py b/fhirflat/ingest.py index ff06aa3..d939fa1 100644 --- a/fhirflat/ingest.py +++ b/fhirflat/ingest.py @@ -35,7 +35,7 @@ def find_field_value(row, response, mapp, raw_data=None): elif "+" in mapp: mapp = mapp.split("+") results = [find_field_value(row, response, m, raw_data) for m in mapp] - 
results = [str(x) for x in results if x == x] + results = [str(x) for x in results if not (isinstance(x, float) and isnan(x))] return " ".join(results) if "/" not in results[0] else "".join(results) elif "if not" in mapp: mapp = mapp.replace(" ", "").split("ifnot") @@ -55,10 +55,11 @@ def find_field_value(row, response, mapp, raw_data=None): return mapp -def create_dict_from_row(row, map_df): +def create_dict_wide(row: pd.Series, map_df: pd.DataFrame) -> pd.Series: """ - Iterates through the columns of the row, applying the mapping to each columns - and produces a fhirflat-like dictionary to initialize the resource object. + Takes a wide-format dataframe and iterates through the columns of the row, + applying the mapping to each column and produces a fhirflat-like dictionary to + initialize the resource object for each row. """ result = {} @@ -111,10 +112,12 @@ def create_dict_from_row(row, map_df): return result -def create_dict_from_cell(row, full_df, map_df): +def create_dict_long( + row: pd.Series, full_df: pd.DataFrame, map_df: pd.DataFrame +) -> pd.Series: """ - Iterates through the columns of the row, applying the mapping to each columns - and produces a fhirflat-like dictionary to initialize the resource object. + Takes a long-format dataframe and a mapping file, and produces a fhirflat-like + dictionary for each row in the dataframe. 
""" column = row["column"] @@ -170,7 +173,7 @@ def create_dictionary( map_df = pd.read_csv(map_file, header=0) # setup the data ----------------------------------------------------------- - relevant_cols = map_df["redcap_variable"].dropna().unique() + relevant_cols = map_df["raw_variable"].dropna().unique() filtered_data = data.loc[:, data.columns.isin(relevant_cols)].copy() if filtered_data.empty: @@ -183,26 +186,26 @@ def create_dictionary( # set up the mappings ------------------------------------------------------- - # Fills the na redcap variables with the previous value - map_df["redcap_variable"] = map_df["redcap_variable"].ffill() + # Fills the na input variables with the previous value + map_df["raw_variable"] = map_df["raw_variable"].ffill() - # strips the text answers out of the redcap_response column - map_df["redcap_response"] = map_df["redcap_response"].apply( + # strips the text answers out of the response column + map_df["raw_response"] = map_df["raw_response"].apply( lambda x: x.split(",")[0] if isinstance(x, str) else x ) # Set multi-index for easier access - map_df.set_index(["redcap_variable", "redcap_response"], inplace=True) + map_df.set_index(["raw_variable", "raw_response"], inplace=True) # Generate the flat_like dictionary if one_to_one: filtered_data["flat_dict"] = filtered_data.apply( - create_dict_from_row, args=[map_df], axis=1 + create_dict_wide, args=[map_df], axis=1 ) return filtered_data else: melted_data["flat_dict"] = melted_data.apply( - create_dict_from_cell, args=[data, map_df], axis=1 + create_dict_long, args=[data, map_df], axis=1 ) return melted_data["flat_dict"].to_frame() diff --git a/fhirflat/resources/base.py b/fhirflat/resources/base.py index 4f56e31..d5bb8a2 100644 --- a/fhirflat/resources/base.py +++ b/fhirflat/resources/base.py @@ -83,9 +83,11 @@ def from_flat(cls, file: str) -> FHIRFlatBase | list[FHIRFlatBase]: @classmethod def ingest_backbone_elements(cls, mapped_data: pd.Series) -> pd.Series: """ - Takes ordered 
lists of data and forms the correct FHIR format which won't + Unflattens ordered lists of data and forms the correct FHIR format which won't be flattened after ingestion (*_dense columns). + Extends the flat2fhir.expand_concepts function specifically for data ingestion. + Parameters ---------- mapped_data: pd.Series diff --git a/fhirflat/resources/condition.py b/fhirflat/resources/condition.py index 41fbfd7..b805bf1 100644 --- a/fhirflat/resources/condition.py +++ b/fhirflat/resources/condition.py @@ -1,10 +1,14 @@ from __future__ import annotations from fhir.resources.condition import Condition as _Condition from .base import FHIRFlatBase +from .extension_types import presenceAbsenceType, prespecifiedQueryType, timingPhaseType +from .extensions import presenceAbsence, prespecifiedQuery, timingPhase import orjson from ..flat2fhir import expand_concepts -from typing import TypeAlias, ClassVar +from typing import TypeAlias, ClassVar, Union +from fhir.resources import fhirtypes +from pydantic.v1 import Field, validator JsonString: TypeAlias = str @@ -12,6 +16,27 @@ class Condition(_Condition, FHIRFlatBase): + extension: list[ + Union[ + presenceAbsenceType, + prespecifiedQueryType, + timingPhaseType, + fhirtypes.ExtensionType, + ] + ] = Field( + None, + alias="extension", + title="Additional content defined by implementations", + description=( + """ + Contains the G.H 'age' and 'birthSex' extensions, + and allows extensions from other implementations to be included.""" + ), + # if property is element of this resource. 
+ element_property=True, + union_mode="smart", + ) + # attributes to exclude from the flat representation flat_exclusions: ClassVar[set[str]] = FHIRFlatBase.flat_exclusions + ( "id", @@ -25,6 +50,20 @@ class Condition(_Condition, FHIRFlatBase): # required attributes that are not present in the FHIRflat representation flat_defaults: ClassVar[list[str]] = FHIRFlatBase.flat_defaults + ["clinicalStatus"] + @validator("extension") + def validate_extension_contents(cls, extensions): + present_count = sum(isinstance(item, presenceAbsence) for item in extensions) + query_count = sum(isinstance(item, prespecifiedQuery) for item in extensions) + timing_count = sum(isinstance(item, timingPhase) for item in extensions) + + if present_count > 1 or query_count > 1 or timing_count > 1: + raise ValueError( + "presenceAbsence, prespecifiedQuery and timingPhase can only appear" + " once." + ) + + return extensions + @classmethod def flat_descriptions(cls) -> dict[str, str]: """ diff --git a/fhirflat/resources/extension_types.py b/fhirflat/resources/extension_types.py index ba077be..55f61ea 100644 --- a/fhirflat/resources/extension_types.py +++ b/fhirflat/resources/extension_types.py @@ -55,5 +55,13 @@ class raceType(AbstractType): __resource_type__ = "Race" +class presenceAbsenceType(AbstractType): + __resource_type__ = "presenceAbsence" + + +class prespecifiedQueryType(AbstractType): + __resource_type__ = "prespecifiedQuery" + + class dateTimeExtensionType(AbstractType): __resource_type__ = "dateTimeExtension" diff --git a/fhirflat/resources/extension_validators.py b/fhirflat/resources/extension_validators.py index 0658ce8..4d98c6c 100644 --- a/fhirflat/resources/extension_validators.py +++ b/fhirflat/resources/extension_validators.py @@ -66,6 +66,8 @@ def __init__(self): "Age": (None, ".extensions"), "birthSex": (None, ".extensions"), "Race": (None, ".extensions"), + "presenceAbsence": (None, ".extensions"), + "prespecifiedQuery": (None, ".extensions"), "dateTimeExtension": (None, 
".extensions"), } @@ -235,5 +237,13 @@ def race_validator(v: Union[StrBytes, dict, Path, FHIRAbstractModel]): return Validators().fhir_model_validator("Race", v) +def presenceabsence_validator(v: Union[StrBytes, dict, Path, FHIRAbstractModel]): + return Validators().fhir_model_validator("presenceAbsence", v) + + +def prespecifiedquery_validator(v: Union[StrBytes, dict, Path, FHIRAbstractModel]): + return Validators().fhir_model_validator("prespecifiedQuery", v) + + def datetimeextension_validator(v: Union[StrBytes, dict, Path, FHIRAbstractModel]): return Validators().fhir_model_validator("dateTimeExtension", v) diff --git a/fhirflat/resources/extensions.py b/fhirflat/resources/extensions.py index 006c8c5..18c9f3e 100644 --- a/fhirflat/resources/extensions.py +++ b/fhirflat/resources/extensions.py @@ -453,6 +453,78 @@ def elements_sequence(cls): ] +class presenceAbsence(_DataType): + """ + An ISARIC extension to indicate if a clinical finding is present, absent or unknown. + """ + + resource_type = Field("presenceAbsence", const=True) + + url = Field("presenceAbsence", const=True, alias="url") + + valueCodeableConcept: fhirtypes.CodeableConceptType = Field( + None, + alias="valueCodeableConcept", + title="Value of extension", + description=( + "Value of extension - must be one of a constrained set of the data " + "types (see [Extensibility](extensibility.html) for a list)." + ), + # if property is element of this resource. + element_property=True, + element_required=True, + ) + + @classmethod + def elements_sequence(cls): + """returning all elements names from + ``Extension`` according specification, + with preserving original sequence order. + """ + return [ + "id", + "extension", + "url", + "valueCodeableConcept", + ] + + +class prespecifiedQuery(_DataType): + """ + An ISARIC extension to indicate if a finding is the result of a prespecified query. 
+ """ + + resource_type = Field("prespecifiedQuery", const=True) + + url = Field("prespecifiedQuery", const=True, alias="url") + + valueBoolean: bool = Field( + None, + alias="valueBoolean", + title="Value of extension", + description=( + "Value of extension - must be one of a constrained set of the data " + "types (see [Extensibility](extensibility.html) for a list)." + ), + # if property is element of this resource. + element_property=True, + elementRequired=True, + ) + + @classmethod + def elements_sequence(cls): + """returning all elements names from + ``Extension`` according specification, + with preserving original sequence order. + """ + return [ + "id", + "extension", + "url", + "valueBoolean", + ] + + # ------------------- extension types ------------------------------ diff --git a/tests/dummy_data/encounter_dummy_mapping.csv b/tests/dummy_data/encounter_dummy_mapping.csv index 0aa99b8..42026f2 100644 --- a/tests/dummy_data/encounter_dummy_mapping.csv +++ b/tests/dummy_data/encounter_dummy_mapping.csv @@ -1,4 +1,4 @@ -redcap_variable,redcap_response,id,subject,extension.timingPhase.system,extension.timingPhase.code,extension.timingPhase.text,class.system,class.code,class.text,actualPeriod.start,actualPeriod.end,diagnosis.condition.concept.system,diagnosis.condition.concept.code,diagnosis.condition.concept.text,diagnosis.use.system,diagnosis.use.code,diagnosis.use.text,admission.dischargeDisposition.system,admission.dischargeDisposition.code,admission.dischargeDisposition.text +raw_variable,raw_response,id,subject,extension.timingPhase.system,extension.timingPhase.code,extension.timingPhase.text,class.system,class.code,class.text,actualPeriod.start,actualPeriod.end,diagnosis.condition.concept.system,diagnosis.condition.concept.code,diagnosis.condition.concept.text,diagnosis.use.system,diagnosis.use.code,diagnosis.use.text,admission.dischargeDisposition.system,admission.dischargeDisposition.code,admission.dischargeDisposition.text 
usubjid,,,Patient/+,,,,,,,,,,,,,,,,, visitid,,,,,,,,,,,,,,,,,,,, dates_enrolment,,,,,,,,,, if not , if not ,,,,,,,,, diff --git a/tests/dummy_data/observation_dummy_mapping.csv b/tests/dummy_data/observation_dummy_mapping.csv index 0d3ee82..742d63d 100644 --- a/tests/dummy_data/observation_dummy_mapping.csv +++ b/tests/dummy_data/observation_dummy_mapping.csv @@ -1,4 +1,4 @@ -redcap_variable,redcap_response,single_resource_group,category.system,category.code,category.text,effectiveDateTime,code.system,code.code,code.text,subject,encounter,valueQuantity.value,valueQuantity.system,valueQuantity.code,valueQuantity.unit,valueCodeableConcept.system,valueCodeableConcept.code,valueCodeableConcept.text,valueDateTime,valueInteger +raw_variable,raw_response,single_resource_group,category.system,category.code,category.text,effectiveDateTime,code.system,code.code,code.text,subject,encounter,valueQuantity.value,valueQuantity.system,valueQuantity.code,valueQuantity.unit,valueCodeableConcept.system,valueCodeableConcept.code,valueCodeableConcept.text,valueDateTime,valueInteger vital_highesttem_c,,,http://terminology.hl7.org/CodeSystem/observation-category,vital-signs,Vital Signs,,https://loinc.org,8310-5,Body temperature,Patient/+,Encounter/+,,http://unitsofmeasure,Cel,DegreesCelsius,,,,, vital_highesttem_f,,,http://terminology.hl7.org/CodeSystem/observation-category,vital-signs,Vital Signs,,https://loinc.org,8310-5,Body temperature,Patient/+,Encounter/+,,http://unitsofmeasure,degF,DegreesFarenheit,,,,, vital_hr,,,http://terminology.hl7.org/CodeSystem/observation-category,vital-signs,Vital Signs,,https://loinc.org,8867-4,Heart rate,Patient/+,Encounter/+,,https://snomed.info/sct,258983007,Beats/minute (qualifier value),,,,, diff --git a/tests/ingestion_output/encounter.parquet b/tests/ingestion_output/encounter.parquet deleted file mode 100644 index 203d4cc86493747d6e62c6c6bb30ff594af23ac9..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 
18587 zcmeHPZ)hXunIFlH*SpzlHoH5DoZH(fzTK&}URyKzV@u2Cj*)H2a%8QowIy4nw>Q#^ zE!m@yqCb|cG*HS?N(dz^rIfN9p&vphp_EcexPx-shknY3a2(+%AsnS2j#5e};RvN4 z?s?w%qmeWt+gW+zlv@(*%)HP0yuau7{CWS(BrAN1qlf7i=xYhuNmG|7>KXPdMNtb% zzT8miQkACYOSJ12JxX7sN9d8OGt|UcqtV^Igg$3IPWT;lI7TVvr;e-i`LQ51a&gRk zzADwqjcQ(6JE%ys(>3b)XNT);2NPjN?bVIl0$3F7q~PZw{6$f4&(drHkRVL=6@13A zPmQn~x$==;KZswU-5q$HoZs!_SP(yV9fd|6&Y|B?%%?+bh|Z!&fc?k;*q9#=F&{b+ zn%>hA%tHrVNFYc+6aV3hptw-4S8BJt-kMS_N#b--*(!T$`T95gvs@_TXIa)m?cAW; zzCb9*va^8n8a>{D^u1dik>4T5&;{oGGls39b9q^))u!`hQ5u$u zwfY%JmP(RRCj@+Z$n}wv`QN9QPll0zfdH~(?+pk2z@gI@6+Hm^Zlhii>P3L!sZ`5( zNo-Ul&jkJScW+*U*2QwIQmhNIM-WTJTCG@CJQIf4O$#zT*819g+}HZ+v}E$X&-XA{ zm-g$uOn&Bced=W1e46>gS!D9@(OA-0pc8sV1W4kps0@0!>1Qa)2R{Kn=Cl4dV38kZ zjYhX|$w8lvZ291#VYmBCt&!W2@@mBV(c${g!Tk1F=Fb;MHd`vQ#QwH}ejP$U&F6b} z&cTpA*2hkGxq%T;1tJtxY^r&xVht-VrQnfpvT9l z?H0ye29@1n?sM&Ny6t@HO54s$LcZP*Y3{kkg*e_YHTX*A7FucbvFIp}YJKSxMKRNg$SBKl@omQ!v}*8^Ob4To7a>`|7?)zL#= zKU#G(6db8KUc6(jj?SSsDdzR71}~w%GwRELUmi3FV-sMTComA$EI$6Fy#8o z$^85U=HVFfW?MIp1J#n`UpVM@4U)6(jkj~#W55+Xf?V8#+PPI!#GX3 zFFiO?E$ya$T+8Y{W4JGn5sWcmNDHPve(~r`Ti4a^47t8N#JuBSKDvfXe;#C(#D3pF zZ|U>VQs3b9g*g8##4B69L0z4Ni5&ykVgj-a0z-k@UPRXmH1mi6I$KI`*^W?)GfGH##+7rztqty}bYeX%|~ zan_J@kLn7b|CPiL{0?1oJcWP#>|dXzoI?*N<`HW!^4_yzQP@_X?A7jz5MZ5)#k2V` zlol+A9V?$R@Hj73>Lakagi!2g8^#qUEHcl#en%#mM_%S<(^qF~AX-xLfrH-DIVr3? 
z2hiswWxD~D4r*VyDoWLx*T?AjX0cxIG=-`H5InWwwo?1s3F@me6VzzAQO`|ce09_G zRZS6g1)hQ6(D z5C5j{JQ!@V2?1gm`fDBW_q>g$+pYn$Uta7EPTW%d=k1eYbEI}qt4k$9=69T~@4@OW z%=|ipWZL1hH^7oYH+@y7@cb>z=1`b}DTEDLa5k#9xs9tHR^Ix_ISN_%);}1mywf*h zhZ^KE^ZrZ7f&s(?$#ur*d^mO_7aRszm|Pq4QX7 zWa5n0fN$zm4s7Y%Z;VXXo^t7Z?00&)1PrY2KgewIZxoX7@jZit z&?TcHO+#(8Suy|8>H3wE`EH7NYZW;d1Tah1POxrvlY5#fg{70;$=W)}dOiMd*!5_b z`S%R-MjD-TnhdmZ_Sj#7LCQS>hYM0kps!K%`5Vs8Q}P>B;wl6%X@Ik7?KbFin+|Uw z$3_x&Kq@%wCJSvmEJscHfo@MPHL;J@OVy+u^qJvU9EzEndybXbD2{TnatxymQ+FI| zf!RL$=wpDg(Q-^I-3vJ@003EcG(AkgU3*J*n79+g^{pC24cuvq!NYovs1ZiCa%QjV z3lsA*ZE|e;A6KA<4e(^Zkl;Ra4s0X0m?^9bvB|!!-p7dr*XY>QvAUz#&^=9C^$GpGH~mpR49si*p2llkj7C;T!W(4DSh zH0J+Ljnq$mm@n0F&67D{Ii*J!%wPAW-SsKw8f8XZzwut7sIgn_k&2**Le2f)@~h+U zl3m)rpD)Xel2RML?fLqv$Y{$0bF5aff}^*P>IQ@z+dXI#G<9KkGwH9e8?L9}5}s_S!7~s98H+6njTh zpAB}L_P27q<8`)mjf-E_eO;sf8`xJloc|sx)e&&RXg9jvb`qm&hd;eTn;N>tJzl)^ z*xT_*lA0tvab+!Yf;iyQs9p*>P-qeZ#-M-9VE-AC1+1fW>Owk@ST#Pru*RzgYG;u+ zVyBb9Yya6yFRlG$pBy@C60OncvL}JgcyXeB8PZFbg@6?fbr$;E<EYpQ!AW7PZMh}tBoV1o16dJO$sY*hGIfu3<1W`lfx@M!C=Psnp%fZS|Fk`@g2W_{YDU;(2}* z{>48p^L&7hZ1Yi`-{$AG$+JkLqrHg_7mW5qb4k62U!sMy$9cZVLqGT%mCUWrK^xq| zlU0z9%*IP{BO6NxHhjsw++w<(Qyc{{Ew6>=p}ju)HT@lvpm z%dDeJL(J^g5_5bp?@tx-$|^kLo7(+eel9)}pI@!T=ayPxEZsys%Nv}C?VG-EP0VnO z4WBx8cEcF6v6qxERkb~nW;Zjbf*4yc{IhkhxedNFcm;p&s($Rn zC1g}R`9bp<3v#*GemF6Q@z&;R)3;y9`F&5dL$QEKyGg)bMA zTVk}%2SO&in^2PM=Ef3h7l)huD{SlINZxXsM|YIb9(lJl)LppK|{_T=#H<}z|C@00g(#o(^6k=zkt(Sxin zy#UwEoIhC+W2BAbhduYHI>@)cPmM$=RmuC3W$=}fU59IbWDoSSYTFl>kC`K4H07d| z&$2NNY)XQ^0x*xrtxaE{kcW9L1=smlxFu}BJOFK`yl-xTU!fkEYj6$tVl#fdR;NX6 zp5$^Hk)U~Q%K4UF$wF+a?VK#eqM$2PFy^~OwuW<%DXyTFq~x5UVg&!yVNO=oV}R4Q zen{PfS`|w-w0HoYaWJQes}K{d>^(Basq?v2e3tfsmsHNE1!;DG^G{z~Y<8nyTNkhm z@=A?O1$>i%^#d1y_)N+PldrAn+DX=3h@C>#7sT9XZNS_E{?gV;y>qOsY#%@9aWlv{ z&5Q%*R#O*iRyAff{oLc11u8@f;xf13HnZq%MjM|VZCFT6$>S4KK%+~|VnGRec z#)fI_aF`w4@?T^uC6qMgYX6)t#a@qmpO_j*Zl|GM%`NS~x);`*;RCE=u-2>znWY1m zGlF;@S0X{UKFIm=q=ty4^nqHFaQzzJPu>eO6FYnn*Qp?@uYr9qkCHqmI(FiX2 
zS6JM+;J%3G7d2Vf=FH<(C%U=_zxfDUH~CGTkFCbwK0svget8@AQjzrt?0*8dL0Vje zy%QhDy-*alQOl?#QidOOf3&g_@OndVx3CNyVe3RTVc19AU(J!Ld-&UtOD3NovdI;% zcX>tedQtuAI@y^au=Z9?8xL|Pn^EnHp**(}e2nqQ91ogTC%3%b-D8d~q5P~dev}wJ zPoS&zBYa|iq$Ag09dBO)un#{MLOfmrPjelg;>K0ugKSOFkvxywoAoERV4n)}**YJ> z{;l~Ljh`gpQ}u_64_&pein@q?qSVB1e5LC3`i%anf7L)~`Nfq@5qP!$FyL@oZ{ehBAz3_2Vyfc;7Q zuw*lK8x((-!Q`H&vJWIQ~W(ea{841d(fQpYDoz#N^fqaZ)O2K~dVKE92A zNId2Jus0_CB6*ZoaC}s6SC7c}WW5i2?_Iut@*y0L)I*#2A^5bq1G-;sfxNe-Ys1HW4T?&6^8p&fDfGTzSNZv@V!jGpMO!m ziJw`-bI^iRFwJ-KB;VTkr}DLfUEz%BID14siFo{Ly8eFg(~j>b;+f2QA+=#ho!?y? zU(E`)G!^&O{GHkk}{v*C3qx#XqSJEit3+ z@%X!h&l+?h__g>aq^kG{wODl3rA16T=Ti4EU?k+6H=Nh|+vIn5d@Z1S0didRzZzd~ z_zQp9^(;p2%kT^J4EBTh19d+L_bm3W8~u;AUQtj!jCjrbnCclB4^c!a=$9tS@8h}) z&!sncDAgq7!5GpB1vPuS{Y~V$&sm9j=nA~5@k$=N^b+OvRD|j)4XHkruT-YUfxSmN ckoTxZ_3*ti{I?6A`ya{!|L>6%_}>!!ANkO&s{jB1 diff --git a/tests/ingestion_output/observation.parquet b/tests/ingestion_output/observation.parquet deleted file mode 100644 index 91e7524a1538ddde4a3279bd8ecf1fb80423ec90..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 13457 zcmds8YiuLebskc$eQ4M6MnhAvydKl5iPkHL!x>&17iYlw6#zI+MT2N~xcSJ3Sw;P%+R@`b^viZEAizT(zcxhkK1$kbRN{R+t zKpS1+Mo)&a^MvSts1F5+k}2{WLRH@xYpFVTp()ow9^@XDB~2d_Ww9bEI#K+0haKM^ zX1_Pg{@RYJKOtBrCZ6Sj_gQEXmyEeRV9CyRbUQmMmdbm5+)8=6c&PU)?;V@tXEyfr z^Xz-)h}{lkWGTiry}1TngpT%<{~Unva$atp^T7AQ#ZCC}!q;qvq7S}ckXPv0E`vM= zv&>#6mQ4yk?$Wa1efINL;%{D=fR$&hy`H1EMu&>!UH7rzFbi{5$vRN@V4A`tR(!LNOja{4?0kdJvR^o*vc6|t}& zDMi(-<@J{WoZq{|aUSIg#dsDx+|6sAz~$W)B)k}!_a^@uN}iB+ei2l-OfPq-@NyGj zFHvKz)7k8AOJD#2;aro3_l=CEhlbBn6A!z*zUA$mV)5V`uYLP3aP`<9(Jhq+&!QI& zaK%3vbNq0aeeVnGD-WaIe|uDxEAww7^jjm%mHAKm-%J3suv%y^5D2ScP0;H#5#g?g zLQQ9&F5-xUs8uDXKwuo1eC-iv!O*G7Ql81ls#;*G5Xv=}iVM9g3EIXqH5Y*Lt;0r1 zR2a>+`xUZtFp|RXHHgQOAYWM1a}8#*E^8vA9)j8FwZ=5ei7RD677L7emY>e6N*-cV zZDg5xYJ|D_{d`RnH0a9?nY@swx^7J)`A;9X!<<5HR7g0<2&(pyseD20LFutke)w%+P@Ah!l!KFO- z@LB}&@s5#RXJ?HYh@WlaKPxsF5eZ(gJO0JS{__m`>I_QINv@Z;r5tJcnKgQjgz>o_ z+IDR+SgJ 
zw2!^-v{PQkrCGBC(mYxoh3A~@5;bfa8Dn1KUl0BHm4IXKjFnyKO! z_64~vZq)_ilMAFq1pLJ2_|(R}ah3hn?6d_BOQC*Yr$2>Eah!FF33_*5^D_MeU-7fCLk8+<>D3U4u*ksj6>@ptgv>g zxGvW6^;%;L+BcyHb~{5|crqwUdQEhjF+yX)RZL)K>AAL=7AI%N%o)zh_eKD;C)Y;c zdBrwz@%f=K>dlo8-udR0zjv^&xZCz{satV%MKVnLjbX>TL+l6h?9Z-E-w#tuh5q3X z{a#CJ{_V5zu_lXOSkvyPxm-PmMp`efBmR^~mCj796PhVJ# zmMCYX^7KmS*++5=Ma>*N4~fB|hvPyPihH8DKU-2piq1!@QOyk2<#+r8b%}d8vBCcu zcKl?Beb3AOD~G!LEI2IH`yY1tN1)zw#9sf(cIOFGy_ik)%IaJ4af^!}D`;^XGNFJn zLowbZPap(Z^J4{6jQJtw#fm?`1^t*Xfj;MGuEP))2g}i#n1-_HJ1sHNNB6tPo0lnJo+!ls=UtNMTp>62l{+e7R}OYEO6PA{GmPD{=I zqmBM?OY^0>UmgMxGNOj^Y^Z4E(Yh=*AT3a|yrkB(b^`J!bYj+|Dg}+vRc0TPkEzG0 zFJ6C~@>ekjY2_*wsviO-&Z&XKf%ZV9e){F#q*UuY|J5PKukGxIzsJ6L1C{!l5Ll{f zf}U(1z|UYN2cNsY58^rKFSaeqd8XOZE;(+@TC~Er^RvV73{pd5wo!_GJ3LLFYhE4@ zgNehIS`ur-4S#BLylZ3sjA!qDWx7qeT^hi|N|LEh9}Q5>Q7vB(t+w!)bAa*8+%xzDossz#J64kr>anLAgK%5+!BONsbYV^8 zzI1aA&85>oDMxr}dg`Ooy8inAeb>W2pv#z3y%qUgIRCT%R}O&eNgv7B4^QcgbV{d` z2b{yRpL?47yPgcPpIct_pt2sv&pAh!J>z)yDja90=A5HdK`98D^N${W(FM0^;^A|7 zRjyYQ&2^o5`bF0sv_Z*L6O@wZnqypwP!XX)fs7bxnhy7eP~}KjtwPb~BV`fGo|m76 zib3!>LR~cJc9VeSjkQ#sEbQ#*q(VES_p49)J-z|i-4q92FlkRB}eXD9ud02?Fc(SGh|eyy$^t|zk! z$e^j>ZWAQDn_Z{D+q@6$m%5{)hwo?9bn6DQpJ!Ls$#U(!CnL3f{sRW}ASsA>sUpZY zT{%uff|nF-86*ube zBvkMbJ_r@v?kdnQ8Y9UFl*GnrHCIW!Kb6&dI!iwtDzq3Ej_hxfu6GIjr~G;bvK(d_*vo~FfJd9 z*0ZtbQ76_yDp1XMLwil#7gC-BF&W~-beOC`yb_XxN^-XlA#0;&K@On7WmLT$CFffWtP!2;NGOjL&Rk4BDn#{` z?$f5NoHSm*<2Pl!H(mv7dS&Z#!JAEGD`;OrIx8n&Ex;D4Xww61L59(C#+RrTDqv~p zgq)1YXy>P5-^6EEzz0_}sl#rGzjUr~pU)AWO~{>N9=si(9|Smv7pCuKtJ&1yUfSD? 
z8ED%*W0Y_i;#)cI>GYYyYTlQ~wR|hdk+lKc==l}7GnVXUDpgn;!;W&PB$r7gz$b3D z;#2YV0mh8B&nO$7&N&u(JSwC*;*S+RC0Q3z7=sL(IT2iyB>K812Y!*4h;3Q;Pv4jm zkFNvl&zngdda}u=2jV&SO(L+SWXrjgq#48e*SS!MYK2rXQAkDWg_z8Vw*v-^!G5y` zJ_&vp)qcx;;UEujR!9R4uOIgjj>JOmMmcBfD~H8&pI=$X1#6o&?8(A5?m6Dd-&!_& zg7MZuw;lWGvDTtJ{z2KdX)jyo|EZ{yizSzH-oS1)9p*Y?)1 zO|KpasZbsB#rW<`&)P0uUx)m#J2%S8d8HLAq;<*PfN@fhgaR=R-Za37z%OE;@5S1x4 znW|zvhFvT7@Es$Mg+a@N-sYyyly8R+1>xaCiljt@g#9I(;z)fJ$9uO!rhX0lz4{HK zE=_%O?rBZl(OsB-E#rps7!C|42x(NJDG#aN#-~Lr#vB{KMAmWsQ44>bfLs@vHdArXWW8l4wH>th*7< z(b2Uo{q(Ibk_xx-AU?7U<3oE*e&+fTqcE(RtS>Q2qUspQjgY6mfJ~z}|JqJ-zNt?# z-k6Vr4EuuILe3HOgo8MKe{-qJzKIfy^*5HDOeOZ+nIl7MM8o+@t@+2Z9}kW{+(vUo zNCWH#-J6qthQc?1$)!tk*J*k9C~mjJ*RIdGslF5q!$<@8jXm;`qfD z{&l#N=#DSQFHBq$?=bkjvnKYhB7S!UA2|AW>$S||c#sU@PzmotR?HBv)pnWz`ZVHQ z@7WyT;Zz_=luGK9lc@@|7wV!upRZQu$;A$1UfeLomCn>h@Q-GW`K7+`B>d}1_|HiH E3p#rX_W%F@ diff --git a/tests/ingestion_output/patient.parquet b/tests/ingestion_output/patient.parquet deleted file mode 100644 index ae07ae6f80193efccb9217faef14dbbeaf63feb2..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 2182 zcmb_eO>g2x7#>KJrELz;Dn?M{fUGXcfuz_#c3HKR1`>!V$p#W^aJ7p36)@Pw^#=}# z@&~G_haP(9sp_TI{)YaF{(%0FzB48?Y|>NKlE?FY%=13;%#7_b+~t^cCe3`_W|9ng ziO^@eFAzd!N?;mJz#!%&lP)kRW{cTiHg*os?u$hzbKimNPjX3UQ;9?h<$q7?Fq_#5 z+StlwHa)|ihMs0z%w2;?rc>#k6X{CryboWk}v;GAk?-+B;B@1^|6wrc>FSY~f?8@|>xpe}jd{r_z5W zw-L$~G8?Yq=!&2DWBryj9o@LT)TUNwJ3jk1_uVZ!0Uz{OPjQ9@Tgb7FVjJLaLdycn z4-_u|vq<|>7rJHIHAt7cdk;p8cS*sG1?(!ym1$V|<*F~Hbs50c0VFnbOl_*i3_{zT zF9TC`Y-j;tW*dQ`D}e&(x2)kny6Ftb*xp5B?+ttRfXpfj_TfGyNGp6EMW4lt=yq2C zf{WR*rvA8M?0bG4s=j9^!PKLoSf=VJ-kfOS&K1v_Mx^%4@O*RX5HDBRFYXsvAjmGN zKM3;sJc*&qChm6MNpIuBkoht7CAlJeh%xTKO|EY-F5}t|H!vRJ`jGl+wUu~8oJY%e zBdW(Xv=oKL1&kvMLEO@)W5u z5ysP@;Iu}nty_W_4F$V0Ql%cr33ch(Z`ZM@mAWI%>3|PM3wx&3g#+QR;|ulHjouU^ zf^*X6bP^x%RbQ95u+PV}<^Hl}IFLAt$#!bJ}t!Pcc z37fi9aRGlKp9`cmaO5IGtQL)ajp7P+)l_V8tk-Egko(;cU=#=E!1vfUWvMb&`L3(l 
z8pRXEZnJ$cG_OA3uFWC85rBU8yg&zAyirTye87u6T94}WA^-o$b84V5=*T_b>WHBt zoz7&K8_^9k3AAJDN9xd1B>O304S}t{k^`6A* z_?VX>#W>o>*0ovT)aG@y4_pNMbS_JcqFNFNN6_!u9l-w5oC(z%<_G=OOrO*B;pKko zMv=pi}O z$k79%R_jt{kF@lBP&3X4T`)d5gv>V`Tlw{ZYno-f_#) z_$sH_r{fV>}%@z*Gma~1FV&P G1Nj$bgJu!{ From 60e943b1e2633ffb3aa23d264851aebcef151245 Mon Sep 17 00:00:00 2001 From: Pip Liggins Date: Wed, 22 May 2024 14:54:13 +0100 Subject: [PATCH 17/21] Misc updates, now passes private checks on dengue data subset --- .gitignore | 5 +- fhirflat/flat2fhir.py | 141 ++++++++++-------- fhirflat/ingest.py | 58 ++++++- fhirflat/resources/base.py | 16 +- tests/data/condition_flat.parquet | Bin 12276 -> 14975 bytes tests/dummy_data/combined_dummy_data.csv | 2 +- .../dummy_data/encounter_dummy_data_multi.csv | 2 +- .../encounter_dummy_data_single.csv | 2 +- tests/dummy_data/encounter_dummy_mapping.csv | 2 +- .../dummy_data/observation_dummy_mapping.csv | 38 ++--- tests/dummy_data/vital_signs_dummy_data.csv | 2 +- tests/test_condition_resource.py | 34 +++++ 12 files changed, 211 insertions(+), 91 deletions(-) diff --git a/.gitignore b/.gitignore index ad56c74..c150b65 100644 --- a/.gitignore +++ b/.gitignore @@ -68,4 +68,7 @@ coverage.xml .pytest_cache/ cover/ -.DS_Store \ No newline at end of file +.DS_Store + +# tests on private data +tests/tests_private/ \ No newline at end of file diff --git a/fhirflat/flat2fhir.py b/fhirflat/flat2fhir.py index 925a4f1..a9be8c0 100644 --- a/fhirflat/flat2fhir.py +++ b/fhirflat/flat2fhir.py @@ -1,5 +1,9 @@ # Converts FHIRflat files into FHIR resources -from .util import group_keys, get_fhirtype, get_local_extension_type +from .util import ( + group_keys, + get_fhirtype, + get_local_extension_type, +) from fhir.resources.quantity import Quantity from fhir.resources.codeableconcept import CodeableConcept from fhir.resources.period import Period @@ -9,6 +13,7 @@ from fhir.resources.backbonetype import BackboneType as _BackboneType from pydantic.v1.error_wrappers import ValidationError 
+from pydantic.v1 import BaseModel def create_codeable_concept( @@ -18,33 +23,23 @@ def create_codeable_concept( # for reading in from ingestion pipeline if name + ".code" in old_dict and name + ".system" in old_dict: - code = old_dict[name + ".code"] - if isinstance(code, list) and len(code) > 1: - new_dict = {"coding": []} - for system, code, name in zip( - old_dict[name + ".system"], code, old_dict[name + ".text"] - ): - formatted_code = code if isinstance(code, str) else str(int(code)) - display = name - - subdict = {"system": system, "code": code, "display": display} - - new_dict["coding"].append(subdict) + raw_codes = old_dict.get(name + ".code") + if not isinstance(raw_codes, list): + formatted_code = ( + raw_codes if isinstance(raw_codes, str) else str(int(raw_codes)) + ) + codes = [old_dict[name + ".system"] + "|" + formatted_code] else: - formatted_code = code if isinstance(code, str) else str(int(code)) - new_dict = { - "coding": [ - { - "system": old_dict[name + ".system"], - "code": formatted_code, - "display": old_dict[name + ".text"], - } - ] - } - return new_dict - - # From FHIRflat file - codes = old_dict.get(name + ".code") + formatted_code = [ + c if isinstance(c, str) else str(int(c)) for c in raw_codes + ] + codes = [ + [s + "|" + c] + for s, c in zip(old_dict[name + ".system"], formatted_code) + ] + else: + # From FHIRflat file + codes = old_dict.get(name + ".code") if codes is None: return { @@ -164,12 +159,53 @@ def set_datatypes(k, v_dict, klass) -> dict: } data_type = prop[value_type[0]]["type"] - data_class = get_fhirtype(data_type) - return {"url": k, f"{value_type[0]}": set_datatypes(k, v_dict, data_class)} + try: + data_class = get_fhirtype(data_type) + return {"url": k, f"{value_type[0]}": set_datatypes(k, v_dict, data_class)} + except AttributeError: + # datatype should be a primitive + return {"url": k, f"{value_type[0]}": v_dict[k]} return {s.split(".", 1)[1]: v_dict[s] for s in v_dict} +def find_data_class(data_class: 
list[BaseModel] | BaseModel, k: str) -> BaseModel: + """ + Finds the type class for item k within the data class. + + Parameters + ---------- + data_class: list[BaseModel] or BaseModel + The data class to search within. If a list, the function will search for the + a class with a matching title to k. + k: str + The property to search for within the data class + """ + + if isinstance(data_class, list): + title_matches = [k.lower() == c.schema()["title"].lower() for c in data_class] + result = [x for x, y in zip(data_class, title_matches) if y] + if len(result) == 1: + return get_fhirtype(k) + else: + raise ValueError(f"Couldn't find a matching class for {k} in {data_class}") + + else: + k_schema = data_class.schema()["properties"].get(k) + + base_class = ( + k_schema.get("items").get("type") + if k_schema.get("items") is not None + else k_schema.get("type") + ) + + if base_class is None: + assert k_schema.get("type") == "array" + + base_class = [opt.get("type") for opt in k_schema["items"]["anyOf"]] + return get_fhirtype(base_class) + + def expand_concepts(data: dict, data_class: type[_DomainResource]) -> dict: """ Combines columns containing flattened FHIR concepts back into @@ -180,36 +216,7 @@ def expand_concepts(data: dict, data_class: type[_DomainResource]) -> dict: for k in groups.keys(): - if isinstance(data_class, list): - title_matches = [ - k.lower() == c.schema()["title"].lower() for c in data_class - ] - result = [x for x, y in zip(data_class, title_matches) if y] - if len(result) == 1: - group_classes[k] = k - continue - else: - raise ValueError( - f"Couldn't find a matching class for {k} in {data_class}" - ) - - else: - k_schema = data_class.schema()["properties"].get(k) - - group_classes[k] = ( - k_schema.get("items").get("type") - if k_schema.get("items") is not None - else k_schema.get("type") - ) - - if group_classes[k] is None: - assert k_schema.get("type") == "array" - - group_classes[k] = [ - opt.get("type") for opt in k_schema["items"]["anyOf"] 
- ] - - group_classes = {k: get_fhirtype(v) for k, v in group_classes.items()} + group_classes[k] = find_data_class(data_class, k) expanded = {} keys_to_replace = [] @@ -228,6 +235,22 @@ def expand_concepts(data: dict, data_class: type[_DomainResource]) -> dict: # coming back out of nested recursion expanded[k] = {s.split(".", 1)[1]: v_dict[s] for s in v_dict} + elif any(isinstance(v, dict) for v in v_dict.values()) and isinstance( + group_classes[k], list + ): + # extensions, where some classes are just values and others have codes etc + non_dict_items = { + k: v for k, v in v_dict.items() if not isinstance(v, dict) + } + stripped_dict = { + s.split(".", 1)[1]: non_dict_items[s] for s in non_dict_items.keys() + } + for k1, v1 in stripped_dict.items(): + klass = find_data_class(group_classes[k], k1) + v_dict[k + "." + k1] = set_datatypes(k1, {k1: v1}, klass) + + expanded[k] = {s.split(".", 1)[1]: v_dict[s] for s in v_dict} + else: expanded[k] = set_datatypes(k, v_dict, group_classes[k]) diff --git a/fhirflat/ingest.py b/fhirflat/ingest.py index d939fa1..ecb64b6 100644 --- a/fhirflat/ingest.py +++ b/fhirflat/ingest.py @@ -50,7 +50,10 @@ def find_field_value(row, response, mapp, raw_data=None): try: return row[col] except KeyError: - return raw_data.loc[row["index"], col] + try: + return raw_data.loc[row["index"], col] + except KeyError: + raise KeyError(f"Column {col} not found in data") else: return mapp @@ -149,7 +152,11 @@ def create_dict_long( def create_dictionary( - data: pd.DataFrame, map_file: pd.DataFrame, resource: str, one_to_one=False + data: pd.DataFrame, + map_file: pd.DataFrame, + resource: str, + one_to_one=False, + subject_id="subjid", ) -> pd.DataFrame: """ Given a data file and a single mapping file for one FHIR resource type, @@ -167,6 +174,8 @@ def create_dictionary( The name of the resource being mapped. one_to_one: bool Whether the resource should be mapped as one-to-one or one-to-many. 
+ subject_id: str + The name of the column containing the subject ID in the data file. """ data = pd.read_csv(data, header=0) @@ -180,6 +189,32 @@ def create_dictionary( warnings.warn(f"No data found for the {resource} resource.", UserWarning) return None + if one_to_one: + + def condense(x): + """ + In case where data is actually multi-row per subject, condenses the relevant + data into a single row for 1:1 mapping. + """ + + # Check if the column contains nan values + if x.isnull().any(): + # If the column contains a single non-nan value, return it + non_nan_values = x.dropna() + if non_nan_values.nunique() == 1: + return non_nan_values + elif non_nan_values.empty: + return np.nan + else: + raise ValueError("Multiple values found in one-to-one mapping") + else: + if len(x) == 1: + return x + else: + raise ValueError("Multiple values found in one-to-one mapping") + + filtered_data = filtered_data.groupby(subject_id, as_index=False).agg(condense) + if not one_to_one: filtered_data = filtered_data.reset_index() melted_data = filtered_data.melt(id_vars="index", var_name="column") @@ -215,6 +250,7 @@ def convert_data_to_flat( folder_name: str, mapping_files_types: tuple[dict, dict] | None = None, sheet_id: str | None = None, + subject_id="subjid", ): """ Takes raw clinical data (currently assumed to be a one-row-per-patient format like @@ -236,6 +272,8 @@ def convert_data_to_flat( the mapping types - one column listing the resource name, and another describing whether the mapping is one-to-one or one-to-many. The subsequent sheets must be named by resource, and contain the mapping for that resource. + subject_id: str + The name of the column containing the subject ID in the data file. 
""" if not mapping_files_types and not sheet_id: @@ -268,11 +306,23 @@ def convert_data_to_flat( t = types[resource.__name__] if t == "one-to-one": - df = create_dictionary(data, map_file, resource.__name__, one_to_one=True) + df = create_dictionary( + data, + map_file, + resource.__name__, + one_to_one=True, + subject_id=subject_id, + ) if df is None: continue elif t == "one-to-many": - df = create_dictionary(data, map_file, resource.__name__, one_to_one=False) + df = create_dictionary( + data, + map_file, + resource.__name__, + one_to_one=False, + subject_id=subject_id, + ) if df is None: continue else: diff --git a/fhirflat/resources/base.py b/fhirflat/resources/base.py index d5bb8a2..f0dc723 100644 --- a/fhirflat/resources/base.py +++ b/fhirflat/resources/base.py @@ -100,7 +100,7 @@ def ingest_backbone_elements(cls, mapped_data: pd.Series) -> pd.Series: """ - def fhir_format(row): + def fhir_format(row: pd.Series) -> pd.Series: for b_e, b_c in cls.backbone_elements.items(): keys_present = [key for key in row if key.startswith(b_e)] if keys_present: @@ -123,8 +123,8 @@ def fhir_format(row): row[b_e] = backbone_list return row - mapped_data.apply(fhir_format) - return mapped_data + condensed_mapped_data = mapped_data.apply(fhir_format) + return condensed_mapped_data @classmethod def ingest_to_flat(cls, data: pd.DataFrame, filename: str): @@ -152,6 +152,16 @@ def ingest_to_flat(cls, data: pd.DataFrame, filename: str): x for x in flat_df.columns if "date" in x.lower() or "period" in x.lower() ]: flat_df[date_cols] = flat_df[date_cols].astype(str) + flat_df[date_cols] = flat_df[date_cols].replace("nan", None) + + for coding_column in [ + x + for x in flat_df.columns + if x.lower().endswith(".code") or x.lower().endswith(".text") + ]: + flat_df[coding_column] = flat_df[coding_column].apply( + lambda x: [x] if isinstance(x, str) else x + ) flat_df.to_parquet(f"{filename}.parquet") diff --git a/tests/data/condition_flat.parquet b/tests/data/condition_flat.parquet index 
af35f9b0ff6c517bd7d31fb30765dd19ff0d2f4f..f213484721ce977ee26bc82d686915dabc7ac32c 100644 GIT binary patch literal 14975 zcmdU0TWs6r6{Zx`Nm?h(vQU##H}#x^^%}>z*m9&SeW2t>mTD`uZ0pt)hIp|}o4QmN z%W_*_7=~dOilG>aVc5eE48<@E!_bGWD~2N2!ybmBk3*mOPz*(}hdmU-Fcd{m?3|w> zMN-tojg@9fLXrH>fBx@W|8ssjD{RtBJ7^C*w?x}%>O4iczITD5sC=bVFA5WExgyCE z!RZN~$2&_?^m*Dj4Rq(|5qe~No|?K~Mdz}73rtT>*x_oo+3XbascoDd9rIHo=f+%* z=ec}MEQrP0TCE}#w`sf6?)=E+{HKk{FsF{X&e${vMA3E%F6ZEjqTo3~$5jC@|M)3* zw^OT?Z@S&pVyPesGg5J@NCaFj{sJEJU}!+k?RT6 z!}mutJp|5hC84<{)x;UTB!~`Ks@6`5GEods$sg>_x9!ZE4(7cRsFZ%|^#^3Bs&RTk zRn^(KM*(zWrzDFLh|R(S9&L}hboVe#Z|(O_Yf`$}Qr5(S8WHRboAdWJ=EGCWU1Dt= z9|KQIxc{)xAKI4Cr|8R{hH7G^AQek;X}dYIBj1=QRkqz> zQ2Sc53cl#(OGQDdNu}a-9(;4VRB66ku9P^i#&xvIb+A%FbrMHAAmS}2IR@@s&8Nzt z`O^tI`jg{??S!4V`?#j*z!?>~r9U~GZ#bO4b}*lhGG9K5{`5exTiVh|UsPT1v8j_F z(~6>UQ1k?S1>?GvdF)dsTSG4O2B^>7LOqZ6=I7_V;C`yt=+fv!zd5eH6tHk-a{2SO+R~rK zoMPz+cIR7m=eu_16Po$*0y;w9#Rh<5sVmq%R9&5(cmyCv#C@@1kyRcc11{wY==8bd zY z;~>16g`ObMkRi-C_*S_BBHa@((1-L-o*Ad@tzz&@saO?j z3$W>~O9hdDdClhhg^l^;cbI=qjrZVbp&S#)KUsprw=M@>5oG(x1oH3GZIGiCkuOyQ zQ9ztJj2~ZP-n|AG+e}O>EiqoCFR6+=<2en>hSlQ3tkHRD>a;fKYTufm{?4sAp$ni^ zOZ5sbt~bjHRK?5g%`iWlK`(1_(KECJI%vA_VTm$lT}F3HH`<)8p{!AWw4LeZXS15j zfiqRySEO3AXLh;gWj^rq$}ZZ113fb4uHKZ$pYBaoPr(r-bJpKvb{HN zC7X2)dDZ58-Nt<4XWsUsYVB9Zbu7$K z+6(5PE8v--mEFv6N@M0hmafkKJn6JEr=0)21V1ktn|6(q^F<+Fb^YzaOOr~?%$MYP zp;(=~Ir02tM;$%+!t}(XLZOt{A|ZTfvY0Q3Fr`?RWuSml30c@QzEc+3tU*5ugp>86 zgthP_P#5cka#I^9k&Xl37sOg#$k*~f{?a7Yivs+V3FmICN3I7ZUwNes#)x9l^ph!M zW`JZxa9{XP?pLM(W=rNt!_R~|*Y6lMt&Tsec)q2g7Y5W(cMMn)_Q0SHn=l;BLLIAX zZH8)QqXjyKO{*V^2DC*~5xZbehlheeJ;3j@0BVMub>Ps@)>efx$&PEQL#DOw)ACwt z*D0}K=$KdSc~)%!r7yh+-?4b4@dOvtp`EC!wlJEhKmG7g8qS&fDYApugpVtgWaME? z`s!O<%I`=hh7;ya7#+i=g<&TdbhqhPT7Pcbxw3imSQl=e0$VL?95pR`pJ8k`jqO~# zW7xF%v4o4Zh;?*y3=ai^at3Olf%brD-TTu#x?e}d(pidi?io_CHa~|rjlre8Hb)n! 
zpDJR$R;rL1OP07wzS6`JtK7_2Dy0T8gEM*j5g@+x`e!^do=K>5)a^<)=$rB3808ve zlgT^jo66y+>tD~f@zdB|Wmy*g;paJ)onu4WY=mXE+3+@b7YZ518~F0JHXg${_zWMy zg^b5ow!y+Y_(mq7Eb@Y9pxg%f(2ZC@uIHkuxr{Hd&n>2ETya^tBZat_9NfyM{d>90 z&K4I<`Eu!lt>s8?D<*rYv2c8s^Bs7(v|pu>`DScB7O4S0{|=YlKpu4=eNbHrvr?`g z%Um?M#T6p8+#1kD{a!BGKt7EutdY$mwuD8wLB_&yFISL(e^QP~jcwrP)$i`^#Bha1Ufexj{GH zhL{R^#7sy=9I--B$`?|*LYR!#av%@z@MJZ89R!!cXfIX2tduPT_rz49EJQczI^KnR zI#ta@g7qxOq419H=N9Gld?qO`5#4MMnXGVfQTb4a)HEAphJ;rDpKHn58*ecwi;JsW z@eW0=KJ)=$pD*t9(Q9G{tpU$_< zG{w*kS4i&gUVY3Ql=;~tr};q2L;Qp*rfU46tj{U?*+N;)&MJPyrBj}4ItjMBt>vk$ zwFc%6UCdw(*{Ju?riIPKXiE4h`(!u25*L%O_a(}lZ?4WS#$|psBxijx=IFhR zOiG5F*wx<#oonTXu$S#~%HC(OH^%olUqgxQfnqlZ_DOk?uz%)K$uia>UH0&OkX06L zV=hQ-};kEqc6P<6h0rru22G zg?}cz`n~nR*u6(@5sq(Vd}>c~fHtpY4ntd+CaJ5f;-<_MSAB`Dxks)G3t7uvLx~%& zfN_zXg?P>+_jw6=4&1Awvcwl6O%dyN=ySw$q{$b8vr2x+K^`SJmh>~r>l^ju@LVHS zB>hPnBlxSb&mfOjb7?mQbnzgzZ!DXIf1wubGTcIo2TfiGdcP_*fQt?61$pfC_E{K@ zvI}tAfIsko=QtaNn{8|h=df*jZWTyFWa=G!PLO_a4yo7iqBNCr*PvB}hY-6%URLo+ z`bqN0Ch;rd4MN`pb$cXO&1G^j7YQ^n-T)4feiRm7RQgeT#Uf;Qb9lq;Mw;6Z(!@gZ ziw}5FK0JJd4u!PR`?1k=pue`K;#p9*>>;g!hfKvu>;f*ZsEdCkp5SFTi}QUe`_O;G z7SgN%fV5-JshYoO5p-#a^Lpq%R#O9;JVWcwGlP?!BYoSy)E;g%xl; zoxejc!l$ioVoK08*xw81ATp1kxPtNr=G+h}@DytKD~T^%{Kt_K@$oh6O?F7Y#Bn{} zI)oaD?^e7ZrS7~+iFf^c(jOvG3g`Q^`Cak@;t#=DNUDZ&AfLc`*8(cuzWE{A(HjGk za0BNDO#14VAACrEunXqV%S-fYnIE#c{$0xtEX24wxC|G)zDdyN^li=$9;6o_#`X5? 
zh;M`v)tw2cKF)^raDG{vufz{AiY|T>e$hTodJd q<;A*KyUv%(*U2I7MC*`tLOH&L-{ON`OZ^Iesn4E)-!q2)dip;D@h>v~ delta 2283 zcmZuzS!^3s6phE3?1^J%b(V%~)1-Dhy%7F{aXM(OF8;_KC?lALplwa z{4wgoQh2YLqtBHve-=X>ty;eS5p_}qn+lG;T+VEiK}(enDk(2@QvunkcKW`YxhRKa zN(;}I*r~HM@J>lPeW{u`Q4Oml+U3F->T*52spzE7)iJ-+E4zQ6i4Kq#oTdKihGR7?^atI{-7fgFrUM?Azeatnh9ect z^i>seTm^4c7*+~jr&d|`76Vr`%!3}#6n22DRspY+SHgP5Aa#LASe3cpU3nv{$Qt1@xe{*3XJDq%1r9}JMOi0RUmQt= zC+Aahkr^0Pbmq&Lr|O}WGP`S0U6oslE-8~!34d-CG*Tv~1o+MF0#xlqzCz06P#%yn zIh?minH-`qCWmQ8@-)f3l-Yygs$6PcY^c4itfiQu+A9s!d3j28eZf>PH5#0)zgG#v z4L`ykt;g2RH5EzL&NfmK`1_eR05w*Du5l8kS~F{rwjv2!Y<>vI_G$Qd-vZojzgBXm zjU^+EobW}*3sQU92LE)lmvbhBkbo~_4cpe3&MFC5*z;>Ib_FGC_ZY9FGQ=CQ0iWCA z_qh`uYhn(ZT@5RmT4459=Lf8P9GqbtFZ;dr@lmU1F5oqABRY2`=ryElvGkZN8JG%q z7vnzOFdp)b#16+$Y%eTa>sFSeJo7=Xb0J_I0lC@^Kj1~a)SN8As|OO{WP*#pC#n|M zWb5E}O*Pz7zuNNury&Cu@Qrg%uLc%AxH)}K5 zC&Eb&rrMaXO-znOGWOY^*PikF>;^Kor33@EGa+3#<}jhyXws7j^NEE}%=WC!WLtC& z>C=u06pIew(xim(P|`gf)*AVM&z=o>jq_rR3ETWYOrN%;I3$c=UG#6^mcnU-%=p5? zf{A}wK+S_#UuHjXoF=z^5=6j`+rqPCds(&@ z&nza4CjR6un#4c?F@@f!k59}8tRB6OUrgYg&HA!J)F)!ckQtGc!z7dK(I-CRt}L`3 zSR)VKGBI1ChGAs!OOV!7DC#B;`V5T}t8)ELW>dKB2mi8ur=84om1kn=6T8#9sZqlAQWVPUYkEGt&S zI*cc(ICd@T-B}Gk3BtQrTn+XJHWv}X+QVw}BOy)tLo9?S25dgzHkTR^hkH94(s?p| q-ZK*M2?^qha35g`MHZmpP^Ux&!9z7Lb*Ny+RamsymX}BV;rs{Y=D`{O diff --git a/tests/dummy_data/combined_dummy_data.csv b/tests/dummy_data/combined_dummy_data.csv index 94449ef..eb4d59d 100644 --- a/tests/dummy_data/combined_dummy_data.csv +++ b/tests/dummy_data/combined_dummy_data.csv @@ -1,4 +1,4 @@ -usubjid,visitid,dates_enrolment,dates_adm,dates_admdate,dates_admtime,non_encounter_field,outco_denguediag,outco_denguediag_main,outco_denguediag_class,outco_not_dengue,outco_secondiag_oth,outco_date,outco_outcome,daily_date,vital_highesttem_c,vital_hr,vital_rr,vital_systolicbp,vital_diastolicbp,vital_spo2,vital_fio2spo2_02110,vital_fio2spo2_pcnt,vital_capillaryr,vital_avpu,vital_gcs,vital_urineflow 
+subjid,visitid,dates_enrolment,dates_adm,dates_admdate,dates_admtime,non_encounter_field,outco_denguediag,outco_denguediag_main,outco_denguediag_class,outco_not_dengue,outco_secondiag_oth,outco_date,outco_outcome,daily_date,vital_highesttem_c,vital_hr,vital_rr,vital_systolicbp,vital_diastolicbp,vital_spo2,vital_fio2spo2_02110,vital_fio2spo2_pcnt,vital_capillaryr,vital_avpu,vital_gcs,vital_urineflow 1,10,2020-05-01,0,,,,,,,cough,,,7,2020-01-01,36.2,120,30,70,120,5,,75,1,1,1,150 2,11,,1,2021-04-01,18:00,fish,1,,2,,,2021-04-10,1,2021-02-02,37,100,40,80,130,6,10,85,0,2,1,200 3,12,,1,2021-05-10,17:30,,1,,1,flu,,2021-05-15,4,2022-03-03,35.5,70,50,90,140,7,,95,0,3,1, diff --git a/tests/dummy_data/encounter_dummy_data_multi.csv b/tests/dummy_data/encounter_dummy_data_multi.csv index d0ad22e..eedb6fc 100644 --- a/tests/dummy_data/encounter_dummy_data_multi.csv +++ b/tests/dummy_data/encounter_dummy_data_multi.csv @@ -1,4 +1,4 @@ -usubjid,visitid,dates_enrolment,dates_adm,dates_admdate,dates_admtime,non_encounter_field,outco_denguediag,outco_denguediag_main,outco_denguediag_class,outco_not_dengue,outco_secondiag_oth,outco_date,outco_outcome +subjid,visitid,dates_enrolment,dates_adm,dates_admdate,dates_admtime,non_encounter_field,outco_denguediag,outco_denguediag_main,outco_denguediag_class,outco_not_dengue,outco_secondiag_oth,outco_date,outco_outcome 1,10,2020-05-01,0,,,,,,,cough,,,7 2,11,,1,2021-04-01,18:00,fish,1,,2,,,2021-04-10,1 3,12,,1,2021-05-10,17:30,,1,,1,flu,,2021-05-15,4 diff --git a/tests/dummy_data/encounter_dummy_data_single.csv b/tests/dummy_data/encounter_dummy_data_single.csv index 4f2122f..d0c908d 100644 --- a/tests/dummy_data/encounter_dummy_data_single.csv +++ b/tests/dummy_data/encounter_dummy_data_single.csv @@ -1,2 +1,2 @@ -usubjid,visitid,dates_enrolment,dates_adm,dates_admdate,dates_admtime,non_encounter_field,outco_denguediag,outco_denguediag_main,outco_denguediag_class,outco_not_dengue,outco_secondiag_oth,outco_date,outco_outcome 
+subjid,visitid,dates_enrolment,dates_adm,dates_admdate,dates_admtime,non_encounter_field,outco_denguediag,outco_denguediag_main,outco_denguediag_class,outco_not_dengue,outco_secondiag_oth,outco_date,outco_outcome 2,11,2021-04-02,1,2021-04-01,18:00,fish,1,,2,,,2021-04-10,1 \ No newline at end of file diff --git a/tests/dummy_data/encounter_dummy_mapping.csv b/tests/dummy_data/encounter_dummy_mapping.csv index 42026f2..ae99d6b 100644 --- a/tests/dummy_data/encounter_dummy_mapping.csv +++ b/tests/dummy_data/encounter_dummy_mapping.csv @@ -1,5 +1,5 @@ raw_variable,raw_response,id,subject,extension.timingPhase.system,extension.timingPhase.code,extension.timingPhase.text,class.system,class.code,class.text,actualPeriod.start,actualPeriod.end,diagnosis.condition.concept.system,diagnosis.condition.concept.code,diagnosis.condition.concept.text,diagnosis.use.system,diagnosis.use.code,diagnosis.use.text,admission.dischargeDisposition.system,admission.dischargeDisposition.code,admission.dischargeDisposition.text -usubjid,,,Patient/+,,,,,,,,,,,,,,,,, +subjid,,,Patient/+,,,,,,,,,,,,,,,,, visitid,,,,,,,,,,,,,,,,,,,, dates_enrolment,,,,,,,,,, if not , if not ,,,,,,,,, dates_adm,"1, Yes",,,https://snomed.info/sct,278307001,On admission (qualifier value),https://snomed.info/sct,32485007,Hospital admission (procedure),,,,,,,,,,, diff --git a/tests/dummy_data/observation_dummy_mapping.csv b/tests/dummy_data/observation_dummy_mapping.csv index 742d63d..6e39236 100644 --- a/tests/dummy_data/observation_dummy_mapping.csv +++ b/tests/dummy_data/observation_dummy_mapping.csv @@ -1,20 +1,20 @@ raw_variable,raw_response,single_resource_group,category.system,category.code,category.text,effectiveDateTime,code.system,code.code,code.text,subject,encounter,valueQuantity.value,valueQuantity.system,valueQuantity.code,valueQuantity.unit,valueCodeableConcept.system,valueCodeableConcept.code,valueCodeableConcept.text,valueDateTime,valueInteger 
-vital_highesttem_c,,,http://terminology.hl7.org/CodeSystem/observation-category,vital-signs,Vital Signs,,https://loinc.org,8310-5,Body temperature,Patient/+,Encounter/+,,http://unitsofmeasure,Cel,DegreesCelsius,,,,, -vital_highesttem_f,,,http://terminology.hl7.org/CodeSystem/observation-category,vital-signs,Vital Signs,,https://loinc.org,8310-5,Body temperature,Patient/+,Encounter/+,,http://unitsofmeasure,degF,DegreesFarenheit,,,,, -vital_hr,,,http://terminology.hl7.org/CodeSystem/observation-category,vital-signs,Vital Signs,,https://loinc.org,8867-4,Heart rate,Patient/+,Encounter/+,,https://snomed.info/sct,258983007,Beats/minute (qualifier value),,,,, -vital_rr,,,http://terminology.hl7.org/CodeSystem/observation-category,vital-signs,Vital Signs,,https://loinc.org,9279-1,Respiratory rate,Patient/+,Encounter/+,,https://snomed.info/sct,258984001,Breaths/minute (qualifier value),,,,, -vital_systolicbp,,,http://terminology.hl7.org/CodeSystem/observation-category,vital-signs,Vital Signs,,https://loinc.org,8480-6,Systolic blood pressure,Patient/+,Encounter/+,,http://unitsofmeasure,mm[Hg],MilliMetersOfMercury,,,,, -vital_diastolicbp,,,http://terminology.hl7.org/CodeSystem/observation-category,vital-signs,Vital Signs,,https://loinc.org,8462-4,Diastolic blood pressure,Patient/+,Encounter/+,,http://unitsofmeasure,mm[Hg],MilliMetersOfMercury,,,,, -vital_spo2,,,http://terminology.hl7.org/CodeSystem/observation-category,vital-signs,Vital Signs,,https://loinc.org,59408-5,Oxygen saturation in Arterial blood by Pulse oximetry,Patient/+,Encounter/+,,http://unitsofmeasure,%,Percent,,,,, -vital_fio2spo2_02110,,,http://terminology.hl7.org/CodeSystem/observation-category,vital-signs,Vital Signs,,https://loinc.org,3150-0,Inhaled oxygen concentration,Patient/+,Encounter/+,,,,,,,,, -vital_fio2spo2_pcnt,,,http://terminology.hl7.org/CodeSystem/observation-category,vital-signs,Vital Signs,,https://loinc.org,3150-0,Inhaled oxygen 
concentration,Patient/+,Encounter/+,,http://unitsofmeasure,%,Percent,,,,, -vital_capillaryr,"1, Yes",,http://terminology.hl7.org/CodeSystem/observation-category,vital-signs,Vital Signs,,https://snomed.info/sct,50427001,Increased capillary filling time (finding),Patient/+,Encounter/+,,,,,https://snomed.info/sct,373066001,Yes,, -,"0, No",,http://terminology.hl7.org/CodeSystem/observation-category,vital-signs,Vital Signs,,https://snomed.info/sct,50427001,Increased capillary filling time (finding),Patient/+,Encounter/+,,,,,https://snomed.info/sct,373067005,No,, -,"99, Unknown",,http://terminology.hl7.org/CodeSystem/observation-category,vital-signs,Vital Signs,,https://snomed.info/sct,50427001,Increased capillary filling time (finding),Patient/+,Encounter/+,,,,,https://snomed.info/sct,261665006,Unknown,, -vital_avpu,"1, Alert",,http://terminology.hl7.org/CodeSystem/observation-category,vital-signs,Vital Signs,,https://snomed.info/sct,1.10444E+15,Alert Confusion Voice Pain Unresponsive scale score (observable entity),Patient/+,Encounter/+,,,,,https://snomed.info/sct,271591004,Fully conscious (finding),, -,"5, Confusion",,http://terminology.hl7.org/CodeSystem/observation-category,vital-signs,Vital Signs,,https://snomed.info/sct,1.10444E+15,Alert Confusion Voice Pain Unresponsive scale score (observable entity),Patient/+,Encounter/+,,,,,https://snomed.info/sct,40917007,Clouded consciousness (finding),, -,"2, Verbal",,http://terminology.hl7.org/CodeSystem/observation-category,vital-signs,Vital Signs,,https://snomed.info/sct,1.10444E+15,Alert Confusion Voice Pain Unresponsive scale score (observable entity),Patient/+,Encounter/+,,,,,https://snomed.info/sct,300202002,Responds to voice (finding),, -,"3, Pain",,http://terminology.hl7.org/CodeSystem/observation-category,vital-signs,Vital Signs,,https://snomed.info/sct,1.10444E+15,Alert Confusion Voice Pain Unresponsive scale score (observable entity),Patient/+,Encounter/+,,,,,https://snomed.info/sct,450847001,Responds to pain 
(finding),, -,"4, Unresponsive",,http://terminology.hl7.org/CodeSystem/observation-category,vital-signs,Vital Signs,,https://snomed.info/sct,1.10444E+15,Alert Confusion Voice Pain Unresponsive scale score (observable entity),Patient/+,Encounter/+,,,,,https://snomed.info/sct,422768004,Unresponsive (finding),, -vital_gcs,,,http://terminology.hl7.org/CodeSystem/observation-category,vital-signs,Vital Signs,,https://snomed.info/sct,9269-2,Glasgow coma score total,Patient/+,Encounter/+,,,,,,,,, -vital_urineflow,,,http://terminology.hl7.org/CodeSystem/observation-category,vital-signs,Vital Signs,,https://loinc.org,9192-6,Urine output 24 hour,Patient/+,Encounter/+,,https://snomed.info/sct,258861009,Millilitre/24 hours (qualifier value),,,,, \ No newline at end of file +vital_highesttem_c,,,http://terminology.hl7.org/CodeSystem/observation-category,vital-signs,Vital Signs,,https://loinc.org,8310-5,Body temperature,Patient/+,Encounter/+,,http://unitsofmeasure,Cel,DegreesCelsius,,,,, +vital_highesttem_f,,,http://terminology.hl7.org/CodeSystem/observation-category,vital-signs,Vital Signs,,https://loinc.org,8310-5,Body temperature,Patient/+,Encounter/+,,http://unitsofmeasure,degF,DegreesFarenheit,,,,, +vital_hr,,,http://terminology.hl7.org/CodeSystem/observation-category,vital-signs,Vital Signs,,https://loinc.org,8867-4,Heart rate,Patient/+,Encounter/+,,https://snomed.info/sct,258983007,Beats/minute (qualifier value),,,,, +vital_rr,,,http://terminology.hl7.org/CodeSystem/observation-category,vital-signs,Vital Signs,,https://loinc.org,9279-1,Respiratory rate,Patient/+,Encounter/+,,https://snomed.info/sct,258984001,Breaths/minute (qualifier value),,,,, +vital_systolicbp,,,http://terminology.hl7.org/CodeSystem/observation-category,vital-signs,Vital Signs,,https://loinc.org,8480-6,Systolic blood pressure,Patient/+,Encounter/+,,http://unitsofmeasure,mm[Hg],MilliMetersOfMercury,,,,, +vital_diastolicbp,,,http://terminology.hl7.org/CodeSystem/observation-category,vital-signs,Vital 
Signs,,https://loinc.org,8462-4,Diastolic blood pressure,Patient/+,Encounter/+,,http://unitsofmeasure,mm[Hg],MilliMetersOfMercury,,,,, +vital_spo2,,,http://terminology.hl7.org/CodeSystem/observation-category,vital-signs,Vital Signs,,https://loinc.org,59408-5,Oxygen saturation in Arterial blood by Pulse oximetry,Patient/+,Encounter/+,,http://unitsofmeasure,%,Percent,,,,, +vital_fio2spo2_02110,,,http://terminology.hl7.org/CodeSystem/observation-category,vital-signs,Vital Signs,,https://loinc.org,3150-0,Inhaled oxygen concentration,Patient/+,Encounter/+,,,,,,,,, +vital_fio2spo2_pcnt,,,http://terminology.hl7.org/CodeSystem/observation-category,vital-signs,Vital Signs,,https://loinc.org,3150-0,Inhaled oxygen concentration,Patient/+,Encounter/+,,http://unitsofmeasure,%,Percent,,,,, +vital_capillaryr,"1, Yes",,http://terminology.hl7.org/CodeSystem/observation-category,vital-signs,Vital Signs,,https://snomed.info/sct,50427001,Increased capillary filling time (finding),Patient/+,Encounter/+,,,,,https://snomed.info/sct,373066001,Yes,, +,"0, No",,http://terminology.hl7.org/CodeSystem/observation-category,vital-signs,Vital Signs,,https://snomed.info/sct,50427001,Increased capillary filling time (finding),Patient/+,Encounter/+,,,,,https://snomed.info/sct,373067005,No,, +,"99, Unknown",,http://terminology.hl7.org/CodeSystem/observation-category,vital-signs,Vital Signs,,https://snomed.info/sct,50427001,Increased capillary filling time (finding),Patient/+,Encounter/+,,,,,https://snomed.info/sct,261665006,Unknown,, +vital_avpu,"1, Alert",,http://terminology.hl7.org/CodeSystem/observation-category,vital-signs,Vital Signs,,https://snomed.info/sct,1.10444E+15,Alert Confusion Voice Pain Unresponsive scale score (observable entity),Patient/+,Encounter/+,,,,,https://snomed.info/sct,271591004,Fully conscious (finding),, +,"5, Confusion",,http://terminology.hl7.org/CodeSystem/observation-category,vital-signs,Vital Signs,,https://snomed.info/sct,1.10444E+15,Alert Confusion Voice Pain 
Unresponsive scale score (observable entity),Patient/+,Encounter/+,,,,,https://snomed.info/sct,40917007,Clouded consciousness (finding),, +,"2, Verbal",,http://terminology.hl7.org/CodeSystem/observation-category,vital-signs,Vital Signs,,https://snomed.info/sct,1.10444E+15,Alert Confusion Voice Pain Unresponsive scale score (observable entity),Patient/+,Encounter/+,,,,,https://snomed.info/sct,300202002,Responds to voice (finding),, +,"3, Pain",,http://terminology.hl7.org/CodeSystem/observation-category,vital-signs,Vital Signs,,https://snomed.info/sct,1.10444E+15,Alert Confusion Voice Pain Unresponsive scale score (observable entity),Patient/+,Encounter/+,,,,,https://snomed.info/sct,450847001,Responds to pain (finding),, +,"4, Unresponsive",,http://terminology.hl7.org/CodeSystem/observation-category,vital-signs,Vital Signs,,https://snomed.info/sct,1.10444E+15,Alert Confusion Voice Pain Unresponsive scale score (observable entity),Patient/+,Encounter/+,,,,,https://snomed.info/sct,422768004,Unresponsive (finding),, +vital_gcs,,,http://terminology.hl7.org/CodeSystem/observation-category,vital-signs,Vital Signs,,https://snomed.info/sct,9269-2,Glasgow coma score total,Patient/+,Encounter/+,,,,,,,,, +vital_urineflow,,,http://terminology.hl7.org/CodeSystem/observation-category,vital-signs,Vital Signs,,https://loinc.org,9192-6,Urine output 24 hour,Patient/+,Encounter/+,,https://snomed.info/sct,258861009,Millilitre/24 hours (qualifier value),,,,, \ No newline at end of file diff --git a/tests/dummy_data/vital_signs_dummy_data.csv b/tests/dummy_data/vital_signs_dummy_data.csv index a361b93..b03903b 100644 --- a/tests/dummy_data/vital_signs_dummy_data.csv +++ b/tests/dummy_data/vital_signs_dummy_data.csv @@ -1,4 +1,4 @@ -usubjid,visitid,daily_date,vital_highesttem_c,vital_hr,vital_rr,vital_systolicbp,vital_diastolicbp,vital_spo2,vital_fio2spo2_02110,vital_fio2spo2_pcnt,vital_capillaryr,vital_avpu,vital_gcs,vital_urineflow 
+subjid,visitid,daily_date,vital_highesttem_c,vital_hr,vital_rr,vital_systolicbp,vital_diastolicbp,vital_spo2,vital_fio2spo2_02110,vital_fio2spo2_pcnt,vital_capillaryr,vital_avpu,vital_gcs,vital_urineflow 1,10,2020-01-01,36.2,120,30,70,120,5,,75,1,1,1,150 2,11,2021-02-02,37,100,40,80,130,6,10,85,0,2,1,200 3,12,2022-03-03,35.5,70,50,90,140,7,,95,0,3,1, \ No newline at end of file diff --git a/tests/test_condition_resource.py b/tests/test_condition_resource.py index 958d224..208370f 100644 --- a/tests/test_condition_resource.py +++ b/tests/test_condition_resource.py @@ -6,6 +6,21 @@ CONDITION_DICT_INPUT = { "id": "c201", + "extension": [ + { + "url": "presenceAbsence", + "valueCodeableConcept": { + "coding": [ + { + "system": "http://snomed.info/sct", + "code": "410605003", + "display": "Present", + } + ] + }, + }, + {"url": "prespecifiedQuery", "valueBoolean": True}, + ], "identifier": [{"value": "12345"}], "clinicalStatus": { "coding": [ @@ -90,6 +105,9 @@ CONDITION_FLAT = { "resourceType": ["Condition"], + "extension.presenceAbsence.code": ["http://snomed.info/sct|410605003"], + "extension.presenceAbsence.text": ["Present"], + "extension.prespecifiedQuery": [True], "category.code": [ [ "http://snomed.info/sct|55607006", @@ -111,6 +129,21 @@ } CONDITION_DICT_OUT = { + "extension": [ + {"url": "prespecifiedQuery", "valueBoolean": True}, + { + "url": "presenceAbsence", + "valueCodeableConcept": { + "coding": [ + { + "system": "http://snomed.info/sct", + "code": "410605003", + "display": "Present", + } + ] + }, + }, + ], "clinicalStatus": { "coding": [ { @@ -177,6 +210,7 @@ def test_condition_to_flat(): assert_frame_equal( pd.read_parquet("test_condition.parquet"), pd.DataFrame(CONDITION_FLAT), + check_like=True, ) os.remove("test_condition.parquet") From 9bf774c24d9c7e4380a38c42d5df9cc717d0d1b0 Mon Sep 17 00:00:00 2001 From: Pip Liggins Date: Wed, 22 May 2024 16:14:40 +0100 Subject: [PATCH 18/21] Fix some typehinting errors --- fhirflat/fhir2flat.py | 4 +- 
fhirflat/ingest.py | 33 ++++++------ fhirflat/resources/base.py | 4 +- fhirflat/resources/condition.py | 6 +-- fhirflat/resources/encounter.py | 4 +- fhirflat/resources/extension_types.py | 2 +- fhirflat/resources/extensions.py | 50 +++++++++---------- fhirflat/resources/immunization.py | 4 +- fhirflat/resources/location.py | 4 +- .../resources/medicationadministration.py | 4 +- fhirflat/resources/medicationstatement.py | 4 +- fhirflat/resources/observation.py | 4 +- fhirflat/resources/organization.py | 4 +- fhirflat/resources/patient.py | 4 +- fhirflat/resources/procedure.py | 4 +- fhirflat/resources/researchsubject.py | 4 +- fhirflat/resources/specimen.py | 4 +- fhirflat/util.py | 2 +- tests/test_ingest.py | 4 ++ 19 files changed, 79 insertions(+), 70 deletions(-) diff --git a/fhirflat/fhir2flat.py b/fhirflat/fhir2flat.py index a278d4c..31df281 100644 --- a/fhirflat/fhir2flat.py +++ b/fhirflat/fhir2flat.py @@ -208,7 +208,9 @@ def flattenExtensions(df: pd.DataFrame, extension: str) -> pd.DataFrame: def expand_and_redefine(df, extension): - def redefine(row: pd.Series, extension: str) -> pd.Series: + def redefine( + row: pd.Series | pd.DataFrame, extension: str + ) -> pd.Series | pd.DataFrame: """Expands out simple extensions and leaves complex ones as is. 
To be dealt with later in the pipeline.""" diff --git a/fhirflat/ingest.py b/fhirflat/ingest.py index ecb64b6..253a356 100644 --- a/fhirflat/ingest.py +++ b/fhirflat/ingest.py @@ -50,15 +50,18 @@ def find_field_value(row, response, mapp, raw_data=None): try: return row[col] except KeyError: - try: - return raw_data.loc[row["index"], col] - except KeyError: - raise KeyError(f"Column {col} not found in data") + if raw_data is not None: + try: + return raw_data.loc[row["index"], col] + except KeyError: + raise KeyError(f"Column {col} not found in data") + else: + raise KeyError(f"Column {col} not found in the filtered data") else: return mapp -def create_dict_wide(row: pd.Series, map_df: pd.DataFrame) -> pd.Series: +def create_dict_wide(row: pd.Series, map_df: pd.DataFrame) -> dict: """ Takes a wide-format dataframe and iterates through the columns of the row, applying the mapping to each column and produces a fhirflat-like dictionary to @@ -117,7 +120,7 @@ def create_dict_wide(row: pd.Series, map_df: pd.DataFrame) -> pd.Series: def create_dict_long( row: pd.Series, full_df: pd.DataFrame, map_df: pd.DataFrame -) -> pd.Series: +) -> dict | None: """ Takes a long-format dataframe and a mapping file, and produces a fhirflat-like dictionary for each row in the dataframe. @@ -152,12 +155,12 @@ def create_dict_long( def create_dictionary( - data: pd.DataFrame, - map_file: pd.DataFrame, + data: str, + map_file: str, resource: str, one_to_one=False, subject_id="subjid", -) -> pd.DataFrame: +) -> pd.DataFrame | None: """ Given a data file and a single mapping file for one FHIR resource type, returns a single column dataframe with the mapped data in a FHIRflat-like @@ -165,11 +168,11 @@ def create_dictionary( Parameters ---------- - data: pd.DataFrame - The data file containing the clinical data. + data: str + The path to the data file containing the clinical data. map_file: pd.DataFrame - The mapping file containing the mapping of the clinical data to the FHIR - resource. 
+ The path to the mapping file containing the mapping of the clinical data to the + FHIR resource. resource: str The name of the resource being mapped. one_to_one: bool @@ -178,8 +181,8 @@ def create_dictionary( The name of the column containing the subject ID in the data file. """ - data = pd.read_csv(data, header=0) - map_df = pd.read_csv(map_file, header=0) + data: pd.DataFrame = pd.read_csv(data, header=0) + map_df: pd.DataFrame = pd.read_csv(map_file, header=0) # setup the data ----------------------------------------------------------- relevant_cols = map_df["raw_variable"].dropna().unique() diff --git a/fhirflat/resources/base.py b/fhirflat/resources/base.py index f0dc723..97483e7 100644 --- a/fhirflat/resources/base.py +++ b/fhirflat/resources/base.py @@ -18,14 +18,14 @@ class FHIRFlatBase(DomainResource): Base class for FHIR resources to add FHIRflat functionality. """ - flat_exclusions: ClassVar[set[str]] = ( + flat_exclusions: ClassVar[set[str]] = { "meta", "implicitRules", "language", "text", "contained", "modifierExtension", - ) + } flat_defaults: ClassVar[list[str]] = [] diff --git a/fhirflat/resources/condition.py b/fhirflat/resources/condition.py index b805bf1..a02ba52 100644 --- a/fhirflat/resources/condition.py +++ b/fhirflat/resources/condition.py @@ -38,14 +38,14 @@ class Condition(_Condition, FHIRFlatBase): ) # attributes to exclude from the flat representation - flat_exclusions: ClassVar[set[str]] = FHIRFlatBase.flat_exclusions + ( + flat_exclusions: ClassVar[set[str]] = FHIRFlatBase.flat_exclusions | { "id", "identifier", "verificationStatus", "evidence", "note", "participant", - ) + } # required attributes that are not present in the FHIRflat representation flat_defaults: ClassVar[list[str]] = FHIRFlatBase.flat_defaults + ["clinicalStatus"] @@ -88,7 +88,7 @@ def cleanup(cls, data: JsonString | dict, json_data=True) -> Condition: like codeableConcepts back into structured data. 
""" if json_data: - data = orjson.loads(data) + data: dict = orjson.loads(data) data["encounter"] = {"reference": data["encounter"]} data["subject"] = {"reference": data["subject"]} diff --git a/fhirflat/resources/encounter.py b/fhirflat/resources/encounter.py index f65500a..bd3c9cd 100644 --- a/fhirflat/resources/encounter.py +++ b/fhirflat/resources/encounter.py @@ -42,7 +42,7 @@ class Encounter(_Encounter, FHIRFlatBase): ) # attributes to exclude from the flat representation - flat_exclusions: ClassVar[set[str]] = FHIRFlatBase.flat_exclusions + ( + flat_exclusions: ClassVar[set[str]] = FHIRFlatBase.flat_exclusions | { "identifier", "participant", # participants other than the patient "appointment", # appointment that scheduled the encounter @@ -50,7 +50,7 @@ class Encounter(_Encounter, FHIRFlatBase): "dietPreference", "specialArrangement", # if translator, streatcher, wheelchair etc. needed "specialCourtesy", # contains ID information, VIP, board member, etc. - ) + } # required attributes that are not present in the FHIRflat representation flat_defaults: ClassVar[list[str]] = FHIRFlatBase.flat_defaults + ["status"] diff --git a/fhirflat/resources/extension_types.py b/fhirflat/resources/extension_types.py index 55f61ea..a8ffcdb 100644 --- a/fhirflat/resources/extension_types.py +++ b/fhirflat/resources/extension_types.py @@ -4,7 +4,7 @@ from typing import TYPE_CHECKING if TYPE_CHECKING: - from pydantic.v1.types import CallableGenerator + from pydantic.v1.typing import CallableGenerator class AbstractType(_AbstractType): diff --git a/fhirflat/resources/extensions.py b/fhirflat/resources/extensions.py index 18c9f3e..5529f93 100644 --- a/fhirflat/resources/extensions.py +++ b/fhirflat/resources/extensions.py @@ -23,9 +23,9 @@ class timingPhase(_DataType): with an appropriate SNOMED (or similar) code. 
""" - resource_type = Field("timingPhase", const=True) + resource_type: str = Field(default="timingPhase", const=True) - url = Field("timingPhase", const=True, alias="url") + url: str = Field("timingPhase", const=True, alias="url") valueCodeableConcept: fhirtypes.CodeableConceptType = Field( None, @@ -61,9 +61,9 @@ class relativeDay(_DataType): both the relative start and end dates instead. """ - resource_type = Field("relativeDay", const=True) + resource_type: str = Field(default="relativeDay", const=True) - url = Field("relativeDay", const=True, alias="url") + url: str = Field("relativeDay", const=True, alias="url") valueInteger: fhirtypes.Integer = Field( None, @@ -97,9 +97,9 @@ class relativeStart(_DataType): An ISARIC extension for use inside the complex `relativePeriod` extension. """ - resource_type = Field("relativeStart", const=True) + resource_type: str = Field(default="relativeStart", const=True) - url = Field("relativeStart", const=True, alias="url") + url: str = Field("relativeStart", const=True, alias="url") valueInteger: fhirtypes.Integer = Field( None, @@ -133,9 +133,9 @@ class relativeEnd(_DataType): An ISARIC extension for use inside the complex `relativePeriod` extension. """ - resource_type = Field("relativeEnd", const=True) + resource_type: str = Field(default="relativeEnd", const=True) - url = Field("relativeEnd", const=True, alias="url") + url: str = Field("relativeEnd", const=True, alias="url") valueInteger: fhirtypes.Integer = Field( None, @@ -177,9 +177,9 @@ class relativePeriod(_DataType): relativeEnd is 5. """ - resource_type = Field("relativePeriod", const=True) + resource_type: str = Field(default="relativePeriod", const=True) - url = Field("relativePeriod", const=True, alias="url") + url: str = Field("relativePeriod", const=True, alias="url") extension: list[Union[et.relativeStartType, et.relativeEndType]] = Field( None, @@ -224,9 +224,9 @@ class approximateDate(_DataType): approximateDate extension with a valueString of "3 months". 
""" - resource_type = Field("approximateDate", const=True) + resource_type: str = Field(default="approximateDate", const=True) - url = Field("approximateDate", const=True, alias="url") + url: str = Field("approximateDate", const=True, alias="url") valueDate: fhirtypes.Date = Field( None, @@ -314,9 +314,9 @@ class Duration(_DataType): duration is not an option in the base FHIR specification. """ - resource_type = Field("Duration", const=True) + resource_type: str = Field(default="Duration", const=True) - url = Field("duration", const=True, alias="url") + url: str = Field("duration", const=True, alias="url") valueQuantity: fhirtypes.QuantityType = Field( None, @@ -350,9 +350,9 @@ class Age(_DataType): An ISARIC extension collecting data on the age of a patient. """ - resource_type = Field("Age", const=True) + resource_type: str = Field(default="Age", const=True) - url = Field("age", const=True, alias="url") + url: str = Field("age", const=True, alias="url") valueQuantity: fhirtypes.QuantityType = Field( None, @@ -386,9 +386,9 @@ class birthSex(_DataType): An ISARIC extension collecting data on the birth sex of a patient. """ - resource_type = Field("birthSex", const=True) + resource_type: str = Field(default="birthSex", const=True) - url = Field("birthSex", const=True, alias="url") + url: str = Field("birthSex", const=True, alias="url") valueCodeableConcept: fhirtypes.CodeableConceptType = Field( None, @@ -422,9 +422,9 @@ class Race(_DataType): An ISARIC extension collecting data on the race of a patient. """ - resource_type = Field("Race", const=True) + resource_type: str = Field(default="Race", const=True) - url = Field("race", const=True, alias="url") + url: str = Field("race", const=True, alias="url") valueCodeableConcept: fhirtypes.CodeableConceptType = Field( None, @@ -458,9 +458,9 @@ class presenceAbsence(_DataType): An ISARIC extension to indicate if a clinical finding is present, absent or unknown. 
""" - resource_type = Field("presenceAbsence", const=True) + resource_type: str = Field(default="presenceAbsence", const=True) - url = Field("presenceAbsence", const=True, alias="url") + url: str = Field("presenceAbsence", const=True, alias="url") valueCodeableConcept: fhirtypes.CodeableConceptType = Field( None, @@ -494,9 +494,9 @@ class prespecifiedQuery(_DataType): An ISARIC extension to indicate if a finding is the result of a prespecified query. """ - resource_type = Field("prespecifiedQuery", const=True) + resource_type: str = Field(default="prespecifiedQuery", const=True) - url = Field("prespecifiedQuery", const=True, alias="url") + url: str = Field("prespecifiedQuery", const=True, alias="url") valueBoolean: bool = Field( None, @@ -535,7 +535,7 @@ class dateTimeExtension(_FHIRPrimitiveExtension): to the current date. """ - resource_type = Field("dateTimeExtension", const=True) + resource_type: str = Field(default="dateTimeExtension", const=True) extension: list[ Union[et.approximateDateType, et.relativeDayType, fhirtypes.ExtensionType] diff --git a/fhirflat/resources/immunization.py b/fhirflat/resources/immunization.py index 35e5b03..686d91e 100644 --- a/fhirflat/resources/immunization.py +++ b/fhirflat/resources/immunization.py @@ -37,7 +37,7 @@ class Immunization(_Immunization, FHIRFlatBase): ) # attributes to exclude from the flat representation - flat_exclusions: ClassVar[set[str]] = FHIRFlatBase.flat_exclusions + ( + flat_exclusions: ClassVar[set[str]] = FHIRFlatBase.flat_exclusions | { "id", "identifier", "basedOn", @@ -50,7 +50,7 @@ class Immunization(_Immunization, FHIRFlatBase): "informationSource", "performer", "note", - ) + } # required attributes that are not present in the FHIRflat representation flat_defaults: ClassVar[list[str]] = FHIRFlatBase.flat_defaults + ["status"] diff --git a/fhirflat/resources/location.py b/fhirflat/resources/location.py index e999301..b1711a1 100644 --- a/fhirflat/resources/location.py +++ 
b/fhirflat/resources/location.py @@ -12,13 +12,13 @@ class Location(_Location, FHIRFlatBase): # attributes to exclude from the flat representation - flat_exclusions: ClassVar[set[str]] = FHIRFlatBase.flat_exclusions + ( + flat_exclusions: ClassVar[set[str]] = FHIRFlatBase.flat_exclusions | { "id", "identifier", "status", "contact", # phone numbers, addresses, "hoursOfOperation", - ) + } @classmethod def cleanup(cls, data: JsonString | dict, json_data=True) -> Location: diff --git a/fhirflat/resources/medicationadministration.py b/fhirflat/resources/medicationadministration.py index c3ed488..2f57337 100644 --- a/fhirflat/resources/medicationadministration.py +++ b/fhirflat/resources/medicationadministration.py @@ -14,13 +14,13 @@ class MedicationAdministration(_MedicationAdministration, FHIRFlatBase): # attributes to exclude from the flat representation - flat_exclusions: ClassVar[set[str]] = FHIRFlatBase.flat_exclusions + ( + flat_exclusions: ClassVar[set[str]] = FHIRFlatBase.flat_exclusions | { "id", "identifier", "basedOn", "performer", "note", - ) + } # required attributes that are not present in the FHIRflat representation flat_defaults: ClassVar[list[str]] = FHIRFlatBase.flat_defaults + ["status"] diff --git a/fhirflat/resources/medicationstatement.py b/fhirflat/resources/medicationstatement.py index 824971e..458f498 100644 --- a/fhirflat/resources/medicationstatement.py +++ b/fhirflat/resources/medicationstatement.py @@ -14,12 +14,12 @@ class MedicationStatement(_MedicationStatement, FHIRFlatBase): # attributes to exclude from the flat representation - flat_exclusions: ClassVar[set[str]] = FHIRFlatBase.flat_exclusions + ( + flat_exclusions: ClassVar[set[str]] = FHIRFlatBase.flat_exclusions | { "id", "identifier", "informationSource", "note", - ) + } # required attributes that are not present in the FHIRflat representation flat_defaults: ClassVar[list[str]] = FHIRFlatBase.flat_defaults + ["status"] diff --git a/fhirflat/resources/observation.py 
b/fhirflat/resources/observation.py index 956c49d..e937481 100644 --- a/fhirflat/resources/observation.py +++ b/fhirflat/resources/observation.py @@ -67,7 +67,7 @@ class Observation(_Observation, FHIRFlatBase): ) # attributes to exclude from the flat representation - flat_exclusions: ClassVar[set[str]] = FHIRFlatBase.flat_exclusions + ( + flat_exclusions: ClassVar[set[str]] = FHIRFlatBase.flat_exclusions | { "id", "identifier", "instantiatesCanonical", @@ -77,7 +77,7 @@ class Observation(_Observation, FHIRFlatBase): "referenceRange", "issued", "note", - ) + } # required attributes that are not present in the FHIRflat representation flat_defaults: ClassVar[list[str]] = FHIRFlatBase.flat_defaults + ["status"] diff --git a/fhirflat/resources/organization.py b/fhirflat/resources/organization.py index 47cc7c8..8239079 100644 --- a/fhirflat/resources/organization.py +++ b/fhirflat/resources/organization.py @@ -12,12 +12,12 @@ class Organization(_Organization, FHIRFlatBase): # attributes to exclude from the flat representation - flat_exclusions: ClassVar[set[str]] = FHIRFlatBase.flat_exclusions + ( + flat_exclusions: ClassVar[set[str]] = FHIRFlatBase.flat_exclusions | { "id", "identifier", "active", "contact", # phone numbers, addresses - ) + } @classmethod def cleanup(cls, data: JsonString | dict, json_data=True) -> Organization: diff --git a/fhirflat/resources/patient.py b/fhirflat/resources/patient.py index ac06f5b..dc44784 100644 --- a/fhirflat/resources/patient.py +++ b/fhirflat/resources/patient.py @@ -30,7 +30,7 @@ class Patient(Patient, FHIRFlatBase): ) # attributes to exclude from the flat representation - flat_exclusions: ClassVar[set[str]] = FHIRFlatBase.flat_exclusions + ( + flat_exclusions: ClassVar[set[str]] = FHIRFlatBase.flat_exclusions | { "identifier", "active", "name", @@ -40,7 +40,7 @@ class Patient(Patient, FHIRFlatBase): "contact", "communication", "link", - ) + } @validator("extension") def validate_extension_contents(cls, extensions): diff --git 
a/fhirflat/resources/procedure.py b/fhirflat/resources/procedure.py index 179f028..7c36c75 100644 --- a/fhirflat/resources/procedure.py +++ b/fhirflat/resources/procedure.py @@ -48,7 +48,7 @@ class Procedure(_Procedure, FHIRFlatBase): ) # attributes to exclude from the flat representation - flat_exclusions: ClassVar[set[str]] = FHIRFlatBase.flat_exclusions + ( + flat_exclusions: ClassVar[set[str]] = FHIRFlatBase.flat_exclusions | { "id", "identifier", "instantiatesCanonical", @@ -62,7 +62,7 @@ class Procedure(_Procedure, FHIRFlatBase): "reason", "note", "supportingInfo", - ) + } # required attributes that are not present in the FHIRflat representation flat_defaults: ClassVar[list[str]] = FHIRFlatBase.flat_defaults + ["status"] diff --git a/fhirflat/resources/researchsubject.py b/fhirflat/resources/researchsubject.py index d888321..17bef29 100644 --- a/fhirflat/resources/researchsubject.py +++ b/fhirflat/resources/researchsubject.py @@ -12,10 +12,10 @@ class ResearchSubject(_ResearchSubject, FHIRFlatBase): # attributes to exclude from the flat representation - flat_exclusions: ClassVar[set[str]] = FHIRFlatBase.flat_exclusions + ( + flat_exclusions: ClassVar[set[str]] = FHIRFlatBase.flat_exclusions | { "id", "identifier", - ) + } # required attributes that are not present in the FHIRflat representation flat_defaults: ClassVar[list[str]] = FHIRFlatBase.flat_defaults + ["status"] diff --git a/fhirflat/resources/specimen.py b/fhirflat/resources/specimen.py index 5661274..e9512ac 100644 --- a/fhirflat/resources/specimen.py +++ b/fhirflat/resources/specimen.py @@ -12,13 +12,13 @@ class Specimen(_Specimen, FHIRFlatBase): # attributes to exclude from the flat representation - flat_exclusions: ClassVar[set[str]] = FHIRFlatBase.flat_exclusions + ( + flat_exclusions: ClassVar[set[str]] = FHIRFlatBase.flat_exclusions | { "id", "identifier", "accessionIdentifier", "status", "note", - ) + } @classmethod def cleanup(cls, data: JsonString | dict, json_data=True) -> Specimen: diff 
--git a/fhirflat/util.py b/fhirflat/util.py index 71cdbe8..8e52a4b 100644 --- a/fhirflat/util.py +++ b/fhirflat/util.py @@ -9,7 +9,7 @@ import fhirflat -def group_keys(data_keys: list[str]) -> list[dict[str, list[str]]]: +def group_keys(data_keys: list[str]) -> dict[str, list[str]]: """ Finds columns with a '.' in the name denoting data that has been flattened and groups them together. diff --git a/tests/test_ingest.py b/tests/test_ingest.py index f304bf6..d6e5feb 100644 --- a/tests/test_ingest.py +++ b/tests/test_ingest.py @@ -51,6 +51,7 @@ def test_create_dict_one_to_one_single_row(): one_to_one=True, ) + assert df is not None dict_out = df["flat_dict"][0] assert dict_out == ENCOUNTER_DICT_OUT @@ -133,6 +134,7 @@ def test_load_data_one_to_one_single_row(): one_to_one=True, ) + assert df is not None Encounter.ingest_to_flat(df, "encounter_ingestion_single") assert_frame_equal( @@ -336,6 +338,7 @@ def test_load_data_one_to_one_multi_row(): one_to_one=True, ) + assert df is not None Encounter.ingest_to_flat(df, "encounter_ingestion_multi") assert_frame_equal( @@ -427,6 +430,7 @@ def test_load_data_one_to_many_multi_row(): one_to_one=False, ) + assert df is not None Observation.ingest_to_flat(df.dropna(), "observation_ingestion") full_df = pd.read_parquet("observation_ingestion.parquet") From dd4afe96529c6e862f86dcb313d80b1c11425fd4 Mon Sep 17 00:00:00 2001 From: Pip Liggins Date: Wed, 22 May 2024 16:42:01 +0100 Subject: [PATCH 19/21] Update init file --- fhirflat/__init__.py | 26 ++++++++++++++------------ fhirflat/resources/__init__.py | 12 ++++++++++++ fhirflat/resources/base.py | 4 ++-- 3 files changed, 28 insertions(+), 14 deletions(-) create mode 100644 fhirflat/resources/__init__.py diff --git a/fhirflat/__init__.py b/fhirflat/__init__.py index 8e0e52b..701e314 100644 --- a/fhirflat/__init__.py +++ b/fhirflat/__init__.py @@ -1,14 +1,16 @@ -from .resources.condition import Condition -from .resources.encounter import Encounter -from .resources.immunization 
import Immunization -from .resources.location import Location -from .resources.medicationadministration import MedicationAdministration -from .resources.medicationstatement import MedicationStatement -from .resources.observation import Observation -from .resources.organization import Organization -from .resources.patient import Patient -from .resources.procedure import Procedure -from .resources.researchsubject import ResearchSubject -from .resources.specimen import Specimen +from .resources import ( + Condition, + Encounter, + Immunization, + Location, + MedicationAdministration, + MedicationStatement, + Observation, + Organization, + Patient, + Procedure, + ResearchSubject, + Specimen, +) from .ingest import convert_data_to_flat diff --git a/fhirflat/resources/__init__.py b/fhirflat/resources/__init__.py new file mode 100644 index 0000000..de3ba2b --- /dev/null +++ b/fhirflat/resources/__init__.py @@ -0,0 +1,12 @@ +from .condition import Condition +from .encounter import Encounter +from .immunization import Immunization +from .location import Location +from .medicationadministration import MedicationAdministration +from .medicationstatement import MedicationStatement +from .observation import Observation +from .organization import Organization +from .patient import Patient +from .procedure import Procedure +from .researchsubject import ResearchSubject +from .specimen import Specimen diff --git a/fhirflat/resources/base.py b/fhirflat/resources/base.py index 97483e7..4f4a521 100644 --- a/fhirflat/resources/base.py +++ b/fhirflat/resources/base.py @@ -1,6 +1,6 @@ # from pydantic import BaseModel from __future__ import annotations -from fhir.resources.domainresource import DomainResource +from fhir.resources.domainresource import DomainResource as _DomainResource import pandas as pd import orjson @@ -13,7 +13,7 @@ JsonString: TypeAlias = str -class FHIRFlatBase(DomainResource): +class FHIRFlatBase(_DomainResource): """ Base class for FHIR resources to add FHIRflat 
functionality. """ From bef7c354929a787f9cf69a5ee77c2079fabb0f66 Mon Sep 17 00:00:00 2001 From: Pip Liggins Date: Wed, 22 May 2024 17:35:30 +0100 Subject: [PATCH 20/21] Update some relative imports and fix different types test warning --- fhirflat/resources/base.py | 6 +++--- fhirflat/resources/condition.py | 2 +- fhirflat/resources/encounter.py | 2 +- fhirflat/resources/immunization.py | 2 +- fhirflat/resources/location.py | 2 +- fhirflat/resources/medicationadministration.py | 2 +- fhirflat/resources/medicationstatement.py | 2 +- fhirflat/resources/observation.py | 2 +- fhirflat/resources/organization.py | 2 +- fhirflat/resources/patient.py | 2 +- fhirflat/resources/procedure.py | 2 +- fhirflat/resources/researchsubject.py | 2 +- fhirflat/resources/specimen.py | 2 +- tests/test_ingest.py | 9 +++++---- 14 files changed, 20 insertions(+), 19 deletions(-) diff --git a/fhirflat/resources/base.py b/fhirflat/resources/base.py index 4f4a521..7719212 100644 --- a/fhirflat/resources/base.py +++ b/fhirflat/resources/base.py @@ -5,8 +5,8 @@ import pandas as pd import orjson -from ..fhir2flat import fhir2flat -from ..flat2fhir import expand_concepts +from fhirflat.fhir2flat import fhir2flat +from fhirflat.flat2fhir import expand_concepts from typing import TypeAlias, ClassVar @@ -136,7 +136,7 @@ def ingest_to_flat(cls, data: pd.DataFrame, filename: str): Pandas dataframe containing the data """ - data["flat_dict"] = cls.ingest_backbone_elements(data["flat_dict"]) + data.loc[:, "flat_dict"] = cls.ingest_backbone_elements(data["flat_dict"]) # Creates a columns of FHIR resource instances data["fhir"] = data["flat_dict"].apply( diff --git a/fhirflat/resources/condition.py b/fhirflat/resources/condition.py index a02ba52..0a6b00f 100644 --- a/fhirflat/resources/condition.py +++ b/fhirflat/resources/condition.py @@ -5,7 +5,7 @@ from .extensions import presenceAbsence, prespecifiedQuery, timingPhase import orjson -from ..flat2fhir import expand_concepts +from fhirflat.flat2fhir 
import expand_concepts from typing import TypeAlias, ClassVar, Union from fhir.resources import fhirtypes from pydantic.v1 import Field, validator diff --git a/fhirflat/resources/encounter.py b/fhirflat/resources/encounter.py index bd3c9cd..917dc68 100644 --- a/fhirflat/resources/encounter.py +++ b/fhirflat/resources/encounter.py @@ -10,7 +10,7 @@ from .base import FHIRFlatBase import orjson -from ..flat2fhir import expand_concepts +from fhirflat.flat2fhir import expand_concepts from .extensions import relativePeriod, timingPhase from .extension_types import relativePeriodType, timingPhaseType diff --git a/fhirflat/resources/immunization.py b/fhirflat/resources/immunization.py index 686d91e..4e08019 100644 --- a/fhirflat/resources/immunization.py +++ b/fhirflat/resources/immunization.py @@ -6,7 +6,7 @@ from pydantic.v1 import Field, validator import orjson -from ..flat2fhir import expand_concepts +from fhirflat.flat2fhir import expand_concepts from typing import TypeAlias, ClassVar, Union from fhir.resources import fhirtypes diff --git a/fhirflat/resources/location.py b/fhirflat/resources/location.py index b1711a1..9c0a754 100644 --- a/fhirflat/resources/location.py +++ b/fhirflat/resources/location.py @@ -3,7 +3,7 @@ from .base import FHIRFlatBase import orjson -from ..flat2fhir import expand_concepts +from fhirflat.flat2fhir import expand_concepts from typing import TypeAlias, ClassVar JsonString: TypeAlias = str diff --git a/fhirflat/resources/medicationadministration.py b/fhirflat/resources/medicationadministration.py index 2f57337..acade00 100644 --- a/fhirflat/resources/medicationadministration.py +++ b/fhirflat/resources/medicationadministration.py @@ -5,7 +5,7 @@ from .base import FHIRFlatBase import orjson -from ..flat2fhir import expand_concepts +from fhirflat.flat2fhir import expand_concepts from typing import TypeAlias, ClassVar JsonString: TypeAlias = str diff --git a/fhirflat/resources/medicationstatement.py b/fhirflat/resources/medicationstatement.py 
index 458f498..3078bc4 100644 --- a/fhirflat/resources/medicationstatement.py +++ b/fhirflat/resources/medicationstatement.py @@ -5,7 +5,7 @@ from .base import FHIRFlatBase import orjson -from ..flat2fhir import expand_concepts +from fhirflat.flat2fhir import expand_concepts from typing import TypeAlias, ClassVar JsonString: TypeAlias = str diff --git a/fhirflat/resources/observation.py b/fhirflat/resources/observation.py index e937481..a189b38 100644 --- a/fhirflat/resources/observation.py +++ b/fhirflat/resources/observation.py @@ -9,7 +9,7 @@ import orjson from fhir.resources import fhirtypes -from ..flat2fhir import expand_concepts +from fhirflat.flat2fhir import expand_concepts from typing import TypeAlias, ClassVar, Union JsonString: TypeAlias = str diff --git a/fhirflat/resources/organization.py b/fhirflat/resources/organization.py index 8239079..545c5de 100644 --- a/fhirflat/resources/organization.py +++ b/fhirflat/resources/organization.py @@ -3,7 +3,7 @@ from .base import FHIRFlatBase import orjson -from ..flat2fhir import expand_concepts +from fhirflat.flat2fhir import expand_concepts from typing import TypeAlias, ClassVar JsonString: TypeAlias = str diff --git a/fhirflat/resources/patient.py b/fhirflat/resources/patient.py index dc44784..757c4d0 100644 --- a/fhirflat/resources/patient.py +++ b/fhirflat/resources/patient.py @@ -4,7 +4,7 @@ from .extensions import Age, birthSex, Race import orjson -from ..flat2fhir import expand_concepts +from fhirflat.flat2fhir import expand_concepts from typing import TypeAlias, ClassVar, Union from fhir.resources import fhirtypes from pydantic.v1 import Field, validator diff --git a/fhirflat/resources/procedure.py b/fhirflat/resources/procedure.py index 7c36c75..6d1dcec 100644 --- a/fhirflat/resources/procedure.py +++ b/fhirflat/resources/procedure.py @@ -14,7 +14,7 @@ from pydantic.v1 import Field, validator import orjson -from ..flat2fhir import expand_concepts +from fhirflat.flat2fhir import expand_concepts from 
typing import TypeAlias, ClassVar, Union from fhir.resources import fhirtypes diff --git a/fhirflat/resources/researchsubject.py b/fhirflat/resources/researchsubject.py index 17bef29..92663ec 100644 --- a/fhirflat/resources/researchsubject.py +++ b/fhirflat/resources/researchsubject.py @@ -3,7 +3,7 @@ from .base import FHIRFlatBase import orjson -from ..flat2fhir import expand_concepts +from fhirflat.flat2fhir import expand_concepts from typing import TypeAlias, ClassVar JsonString: TypeAlias = str diff --git a/fhirflat/resources/specimen.py b/fhirflat/resources/specimen.py index e9512ac..c4b767d 100644 --- a/fhirflat/resources/specimen.py +++ b/fhirflat/resources/specimen.py @@ -3,7 +3,7 @@ from .base import FHIRFlatBase import orjson -from ..flat2fhir import expand_concepts +from fhirflat.flat2fhir import expand_concepts from typing import TypeAlias, ClassVar JsonString: TypeAlias = str diff --git a/tests/test_ingest.py b/tests/test_ingest.py index d6e5feb..5d6716a 100644 --- a/tests/test_ingest.py +++ b/tests/test_ingest.py @@ -9,6 +9,7 @@ import os import shutil from decimal import Decimal +import numpy as np ENCOUNTER_DICT_OUT = { @@ -275,19 +276,19 @@ def test_load_data_one_to_one_single_row(): None, None, None, - "Malaria", + ["Malaria"], ], "diagnosis.use.code": [ None, None, None, - "https://snomed.info/sct|89100005", + ["https://snomed.info/sct|89100005"], ], "diagnosis.use.text": [ None, None, None, - "Final diagnosis (discharge) (contextual qualifier) (qualifier value)", + ["Final diagnosis (discharge) (contextual qualifier) (qualifier value)"], ], "subject": ["Patient/1", "Patient/2", "Patient/3", "Patient/4"], "id": ["10", "11", "12", "13"], @@ -418,7 +419,7 @@ def test_load_data_one_to_one_multi_row(): ], "valueCodeableConcept.code": [None, None, None, None, None], "valueCodeableConcept.text": [None, None, None, None, None], - "valueInteger": [None, None, None, None, None], + "valueInteger": [np.nan, np.nan, np.nan, np.nan, np.nan], } From 
08eda1583702c8e5e5e25b4dee99a89dbe7ffe2a Mon Sep 17 00:00:00 2001 From: Pip Liggins Date: Thu, 23 May 2024 10:37:30 +0100 Subject: [PATCH 21/21] Fix more types --- fhirflat/flat2fhir.py | 10 +++++----- fhirflat/ingest.py | 6 +++--- fhirflat/resources/condition.py | 8 +++++--- fhirflat/resources/encounter.py | 8 +++++--- fhirflat/resources/immunization.py | 8 +++++--- fhirflat/resources/location.py | 8 +++++--- .../resources/medicationadministration.py | 8 +++++--- fhirflat/resources/medicationstatement.py | 10 +++++++--- fhirflat/resources/observation.py | 8 +++++--- fhirflat/resources/organization.py | 8 +++++--- fhirflat/resources/patient.py | 19 +++++++++++++------ fhirflat/resources/procedure.py | 8 +++++--- fhirflat/resources/researchsubject.py | 8 +++++--- fhirflat/resources/specimen.py | 8 +++++--- fhirflat/util.py | 3 ++- 15 files changed, 80 insertions(+), 48 deletions(-) diff --git a/fhirflat/flat2fhir.py b/fhirflat/flat2fhir.py index a9be8c0..2d87448 100644 --- a/fhirflat/flat2fhir.py +++ b/fhirflat/flat2fhir.py @@ -17,25 +17,25 @@ def create_codeable_concept( - old_dict: dict[str, list[str] | str], name: str + old_dict: dict[str, list[str] | str | float], name: str ) -> dict[str, list[str]]: """Re-creates a codeableConcept structure from the FHIRflat representation.""" # for reading in from ingestion pipeline if name + ".code" in old_dict and name + ".system" in old_dict: - raw_codes = old_dict.get(name + ".code") + raw_codes: str | float | list[str] = old_dict.get(name + ".code") if not isinstance(raw_codes, list): formatted_code = ( raw_codes if isinstance(raw_codes, str) else str(int(raw_codes)) ) codes = [old_dict[name + ".system"] + "|" + formatted_code] else: - formatted_code = [ + formatted_codes = [ c if isinstance(c, str) else str(int(c)) for c in raw_codes ] codes = [ [s + "|" + c] - for s, c in zip(old_dict[name + ".system"], formatted_code) + for s, c in zip(old_dict[name + ".system"], formatted_codes) ] else: # From FHIRflat file @@ -206,7 
+206,7 @@ def find_data_class(data_class: list[BaseModel] | BaseModel, k: str) -> BaseMode return get_fhirtype(base_class) -def expand_concepts(data: dict, data_class: type[_DomainResource]) -> dict: +def expand_concepts(data: dict[str, str], data_class: type[_DomainResource]) -> dict: """ Combines columns containing flattened FHIR concepts back into JSON-like structures. diff --git a/fhirflat/ingest.py b/fhirflat/ingest.py index 253a356..4df3da4 100644 --- a/fhirflat/ingest.py +++ b/fhirflat/ingest.py @@ -68,7 +68,7 @@ def create_dict_wide(row: pd.Series, map_df: pd.DataFrame) -> dict: initialize the resource object for each row. """ - result = {} + result: dict = {} for column in row.index: if column in map_df.index.get_level_values(0): response = row[column] @@ -155,7 +155,7 @@ def create_dict_long( def create_dictionary( - data: str, + data_file: str, map_file: str, resource: str, one_to_one=False, @@ -181,7 +181,7 @@ def create_dictionary( The name of the column containing the subject ID in the data file. """ - data: pd.DataFrame = pd.read_csv(data, header=0) + data: pd.DataFrame = pd.read_csv(data_file, header=0) map_df: pd.DataFrame = pd.read_csv(map_file, header=0) # setup the data ----------------------------------------------------------- diff --git a/fhirflat/resources/condition.py b/fhirflat/resources/condition.py index 0a6b00f..ae1b25b 100644 --- a/fhirflat/resources/condition.py +++ b/fhirflat/resources/condition.py @@ -81,14 +81,16 @@ def flat_descriptions(cls) -> dict[str, str]: return descrip @classmethod - def cleanup(cls, data: JsonString | dict, json_data=True) -> Condition: + def cleanup(cls, data_dict: JsonString | dict, json_data=True) -> Condition: """ Load data into a dictionary-like structure, then apply resource-specific changes and unpack flattened data like codeableConcepts back into structured data. 
""" - if json_data: - data: dict = orjson.loads(data) + if json_data and isinstance(data_dict, str): + data: dict = orjson.loads(data_dict) + elif isinstance(data_dict, dict): + data: dict = data_dict data["encounter"] = {"reference": data["encounter"]} data["subject"] = {"reference": data["subject"]} diff --git a/fhirflat/resources/encounter.py b/fhirflat/resources/encounter.py index 917dc68..78fe9f3 100644 --- a/fhirflat/resources/encounter.py +++ b/fhirflat/resources/encounter.py @@ -74,14 +74,16 @@ def validate_extension_contents(cls, extensions): return extensions @classmethod - def cleanup(cls, data: JsonString | dict, json_data=True) -> Encounter: + def cleanup(cls, data_dict: JsonString | dict, json_data=True) -> Encounter: """ Load data into a dictionary-like structure, then apply resource-specific changes and unpack flattened data like codeableConcepts back into structured data. """ - if json_data: - data = orjson.loads(data) + if json_data and isinstance(data_dict, str): + data: dict = orjson.loads(data_dict) + elif isinstance(data_dict, dict): + data: dict = data_dict for field in { "subject", diff --git a/fhirflat/resources/immunization.py b/fhirflat/resources/immunization.py index 4e08019..115439e 100644 --- a/fhirflat/resources/immunization.py +++ b/fhirflat/resources/immunization.py @@ -65,14 +65,16 @@ def validate_extension_contents(cls, extensions): return extensions @classmethod - def cleanup(cls, data: JsonString | dict, json_data=True) -> Immunization: + def cleanup(cls, data_dict: JsonString | dict, json_data=True) -> Immunization: """ Load data into a dictionary-like structure, then apply resource-specific changes and unpack flattened data like codeableConcepts back into structured data. 
""" - if json_data: - data = orjson.loads(data) + if json_data and isinstance(data_dict, str): + data: dict = orjson.loads(data_dict) + elif isinstance(data_dict, dict): + data: dict = data_dict for field in ( {"patient", "encounter", "location"} diff --git a/fhirflat/resources/location.py b/fhirflat/resources/location.py index 9c0a754..ba1b96c 100644 --- a/fhirflat/resources/location.py +++ b/fhirflat/resources/location.py @@ -21,14 +21,16 @@ class Location(_Location, FHIRFlatBase): } @classmethod - def cleanup(cls, data: JsonString | dict, json_data=True) -> Location: + def cleanup(cls, data_dict: JsonString | dict, json_data=True) -> Location: """ Load data into a dictionary-like structure, then apply resource-specific changes and unpack flattened data like codeableConcepts back into structured data. """ - if json_data: - data = orjson.loads(data) + if json_data and isinstance(data_dict, str): + data: dict = orjson.loads(data_dict) + elif isinstance(data_dict, dict): + data: dict = data_dict for field in { "managingOrganization", diff --git a/fhirflat/resources/medicationadministration.py b/fhirflat/resources/medicationadministration.py index acade00..c46aa15 100644 --- a/fhirflat/resources/medicationadministration.py +++ b/fhirflat/resources/medicationadministration.py @@ -27,15 +27,17 @@ class MedicationAdministration(_MedicationAdministration, FHIRFlatBase): @classmethod def cleanup( - cls, data: JsonString | dict, json_data=True + cls, data_dict: JsonString | dict, json_data=True ) -> MedicationAdministration: """ Load data into a dictionary-like structure, then apply resource-specific changes and unpack flattened data like codeableConcepts back into structured data. 
""" - if json_data: - data = orjson.loads(data) + if json_data and isinstance(data_dict, str): + data: dict = orjson.loads(data_dict) + elif isinstance(data_dict, dict): + data: dict = data_dict for field in ( { diff --git a/fhirflat/resources/medicationstatement.py b/fhirflat/resources/medicationstatement.py index 3078bc4..4136745 100644 --- a/fhirflat/resources/medicationstatement.py +++ b/fhirflat/resources/medicationstatement.py @@ -25,14 +25,18 @@ class MedicationStatement(_MedicationStatement, FHIRFlatBase): flat_defaults: ClassVar[list[str]] = FHIRFlatBase.flat_defaults + ["status"] @classmethod - def cleanup(cls, data: JsonString | dict, json_data=True) -> MedicationStatement: + def cleanup( + cls, data_dict: JsonString | dict, json_data=True + ) -> MedicationStatement: """ Load data into a dictionary-like structure, then apply resource-specific changes and unpack flattened data like codeableConcepts back into structured data. """ - if json_data: - data = orjson.loads(data) + if json_data and isinstance(data_dict, str): + data: dict = orjson.loads(data_dict) + elif isinstance(data_dict, dict): + data: dict = data_dict for field in ( { diff --git a/fhirflat/resources/observation.py b/fhirflat/resources/observation.py index a189b38..582faf7 100644 --- a/fhirflat/resources/observation.py +++ b/fhirflat/resources/observation.py @@ -92,14 +92,16 @@ def validate_extension_contents(cls, extensions): return extensions @classmethod - def cleanup(cls, data: JsonString | dict, json_data=True) -> Observation: + def cleanup(cls, data_dict: JsonString | dict, json_data=True) -> Observation: """ Load data into a dictionary-like structure, then apply resource-specific changes and unpack flattened data like codeableConcepts back into structured data. 
""" - if json_data: - data = orjson.loads(data) + if json_data and isinstance(data_dict, str): + data: dict = orjson.loads(data_dict) + elif isinstance(data_dict, dict): + data: dict = data_dict for field in { "encounter", diff --git a/fhirflat/resources/organization.py b/fhirflat/resources/organization.py index 545c5de..66ab4a8 100644 --- a/fhirflat/resources/organization.py +++ b/fhirflat/resources/organization.py @@ -20,14 +20,16 @@ class Organization(_Organization, FHIRFlatBase): } @classmethod - def cleanup(cls, data: JsonString | dict, json_data=True) -> Organization: + def cleanup(cls, data_dict: JsonString | dict, json_data=True) -> Organization: """ Load data into a dictionary-like structure, then apply resource-specific changes and unpack flattened data like codeableConcepts back into structured data. """ - if json_data: - data = orjson.loads(data) + if json_data and isinstance(data_dict, str): + data: dict = orjson.loads(data_dict) + elif isinstance(data_dict, dict): + data: dict = data_dict for field in { "partOf", diff --git a/fhirflat/resources/patient.py b/fhirflat/resources/patient.py index 757c4d0..e94d705 100644 --- a/fhirflat/resources/patient.py +++ b/fhirflat/resources/patient.py @@ -1,4 +1,5 @@ -from fhir.resources.patient import Patient +from __future__ import annotations +from fhir.resources.patient import Patient as _Patient from .base import FHIRFlatBase from .extension_types import ageType, birthSexType, raceType from .extensions import Age, birthSex, Race @@ -12,7 +13,7 @@ JsonString: TypeAlias = str -class Patient(Patient, FHIRFlatBase): +class Patient(_Patient, FHIRFlatBase): extension: list[Union[ageType, birthSexType, raceType, fhirtypes.ExtensionType]] = ( Field( None, @@ -69,10 +70,16 @@ def flat_descriptions(cls) -> dict[str, str]: return descrip @classmethod - def cleanup(cls, data: JsonString | dict, json_data=True) -> Patient: - # Load the data and apply resource-specific changes - if json_data: - data = orjson.loads(data) + 
def cleanup(cls, data_dict: JsonString | dict, json_data=True) -> Patient: + """ + Load data into a dictionary-like structure, then + apply resource-specific changes and unpack flattened data + like codeableConcepts back into structured data. + """ + if json_data and isinstance(data_dict, str): + data: dict = orjson.loads(data_dict) + elif isinstance(data_dict, dict): + data: dict = data_dict data["id"] = str(data["id"]) diff --git a/fhirflat/resources/procedure.py b/fhirflat/resources/procedure.py index 6d1dcec..9848228 100644 --- a/fhirflat/resources/procedure.py +++ b/fhirflat/resources/procedure.py @@ -81,14 +81,16 @@ def validate_extension_contents(cls, extensions): return extensions @classmethod - def cleanup(cls, data: JsonString | dict, json_data=True) -> Procedure: + def cleanup(cls, data_dict: JsonString | dict, json_data=True) -> Procedure: """ Load data into a dictionary-like structure, then apply resource-specific changes and unpack flattened data like codeableConcepts back into structured data. """ - if json_data: - data = orjson.loads(data) + if json_data and isinstance(data_dict, str): + data: dict = orjson.loads(data_dict) + elif isinstance(data_dict, dict): + data: dict = data_dict for field in { "partOf", diff --git a/fhirflat/resources/researchsubject.py b/fhirflat/resources/researchsubject.py index 92663ec..2bf930e 100644 --- a/fhirflat/resources/researchsubject.py +++ b/fhirflat/resources/researchsubject.py @@ -21,14 +21,16 @@ class ResearchSubject(_ResearchSubject, FHIRFlatBase): flat_defaults: ClassVar[list[str]] = FHIRFlatBase.flat_defaults + ["status"] @classmethod - def cleanup(cls, data: JsonString | dict, json_data=True) -> ResearchSubject: + def cleanup(cls, data_dict: JsonString | dict, json_data=True) -> ResearchSubject: """ Load data into a dictionary-like structure, then apply resource-specific changes and unpack flattened data like codeableConcepts back into structured data. 
""" - if json_data: - data = orjson.loads(data) + if json_data and isinstance(data_dict, str): + data: dict = orjson.loads(data_dict) + elif isinstance(data_dict, dict): + data: dict = data_dict for field in ( {"study", "subject", "consent"} diff --git a/fhirflat/resources/specimen.py b/fhirflat/resources/specimen.py index c4b767d..732871d 100644 --- a/fhirflat/resources/specimen.py +++ b/fhirflat/resources/specimen.py @@ -21,14 +21,16 @@ class Specimen(_Specimen, FHIRFlatBase): } @classmethod - def cleanup(cls, data: JsonString | dict, json_data=True) -> Specimen: + def cleanup(cls, data_dict: JsonString | dict, json_data=True) -> Specimen: """ Load data into a dictionary-like structure, then apply resource-specific changes and unpack flattened data like codeableConcepts back into structured data. """ - if json_data: - data = orjson.loads(data) + if json_data and isinstance(data_dict, str): + data: dict = orjson.loads(data_dict) + elif isinstance(data_dict, dict): + data: dict = data_dict for field in ( { diff --git a/fhirflat/util.py b/fhirflat/util.py index 8e52a4b..760cdfd 100644 --- a/fhirflat/util.py +++ b/fhirflat/util.py @@ -3,13 +3,14 @@ import fhir.resources import re import importlib +from collections.abc import KeysView from .resources import extensions import fhirflat -def group_keys(data_keys: list[str]) -> dict[str, list[str]]: +def group_keys(data_keys: list[str] | KeysView) -> dict[str, list[str]]: """ Finds columns with a '.' in the name denoting data that has been flattened and groups them together.