Stop unwanted overwriting (#91)

* Draft: stop unwanted overwriting
* Add first test
* Increase coverage

Fixes: #90
globaldothealth · Nov 23, 2023 · 25014df · 25014df
1 parent 30ec076
commit 25014df
Show file tree

Hide file tree

Showing 4 changed files with 123 additions and 1 deletion.
diff --git a/adtl/__init__.py b/adtl/__init__.py
@@ -719,7 +719,41 @@ def update_table(self, table: str, row: StrDict):
                 value = get_value(row, self.spec[table][attr], self.ctx(attr))
                 # Check against all null elements, for combinedType=set/list, null is []
                 if value is not None and value != []:
-                    self.data[table][group_key][attr] = value
+                    if attr not in self.data[table][group_key].keys():
+                        # if data for this field hasn't already been captured
+                        self.data[table][group_key][attr] = value
+
+                    else:
+                        if "combinedType" in self.spec[table][attr]:
+                            combined_type = self.spec[table][attr]["combinedType"]
+                            existing_value = self.data[table][group_key][attr]
+
+                            if combined_type in ["all", "any", "min", "max"]:
+                                values = [existing_value, value]
+                                # normally calling eval() is a bad idea, but here values are restricted, so okay
+                                self.data[table][group_key][attr] = eval(combined_type)(
+                                    values
+                                )
+                            elif combined_type in ["list", "set"]:
+                                if combined_type == "set":
+                                    self.data[table][group_key][attr] = list(
+                                        set(existing_value + value)
+                                    )
+                                else:
+                                    self.data[table][group_key][attr] = (
+                                        existing_value + value
+                                    )
+                            elif combined_type == "firstNonNull":
+                                # only use the first value found
+                                pass
+                        else:
+                            # otherwise overwrite?
+                            logging.debug(
+                                f"Multiple rows of data found for {attr} without a"
+                                " combinedType listed. Data being overwritten."
+                            )
+                            self.data[table][group_key][attr] = value
+
         elif kind == "oneToMany":
             for match in self.spec[table]:
                 if "if" not in match:

diff --git a/tests/parsers/stop-overwriting.toml b/tests/parsers/stop-overwriting.toml
@@ -0,0 +1,47 @@
+[adtl]
+  name = "overwrite"
+  description = "Example using groupBy on data with multiple rows per subject"
+
+[adtl.tables.visit]
+  kind = "groupBy"
+  groupBy = "subject_id"
+  aggregation = "lastNotNull"
+
+[visit]
+
+  [visit.subject_id]
+    field = "subjid"
+    description = "Subject ID"
+
+  [visit.earliest_admission]
+    combinedType = "min"
+    fields = [
+      { field = "first_admit" },
+    ]
+
+  [visit.start_date]
+    combinedType = "firstNonNull"
+    fields = [
+      { field = "first_admit" },
+      { field = "enrolment" },
+    ]
+
+  [visit.icu_admission_date]
+    combinedType = "list"
+    excludeWhen = "none"
+    fields = [
+      {field = "icu_admission_date"}
+    ]
+
+  [visit.treatment_antiviral_type]
+    combinedType = "set"
+    excludeWhen = "none"
+    fields = [
+      { field = "daily_antiviral_type___1", values = { 1 = "Ribavirin" } },
+      { field = "daily_antiviral_type___2", values = { 1 = "Lopinavir" } },
+      { field = "daily_antiviral_type___3", values = { 1 = "Interferon" } },
+      { field = "overall_antiviral_dc___1", values = { 1 = "Ribavirin" } },
+      { field = "overall_antiviral_dc___2", values = { 1 = "Lopinavir" } },
+      { field = "overall_antiviral_dc___3", values = { 1 = "Interferon" } },
+      ]
+
diff --git a/tests/sources/stop-overwriting.csv b/tests/sources/stop-overwriting.csv
@@ -0,0 +1,11 @@
+subjid,redcap,first_admit,enrolment,icu_admission_date,daily_antiviral_type___1,daily_antiviral_type___2,daily_antiviral_type___3,overall_antiviral_dc___1,overall_antiviral_dc___2,overall_antiviral_dc___3
+1,admit,2023-11-20,2023-11-23,,0,0,0,0,0,0
+1,discharge,,,,0,0,0,1,0,1
+1,day1,2023-11-19,,,1,0,0,0,0,0
+1,day2,,,,1,0,0,0,0,0
+2,admit,,2022-11-23,,0,0,0,0,0,0
+2,discharge,,2020-11-23,2020-11-25,0,0,0,0,0,0
+2,day1,,,2020-11-30,0,1,0,0,0,0
+3,admit,,2020-02-20,,0,0,0,0,0,0
+3,discharge,,,,0,0,0,0,1,1
+3,day1,,,,1,0,0,0,0,0
diff --git a/tests/test_parser.py b/tests/test_parser.py
@@ -1316,3 +1316,33 @@ def test_combinedtype_wordsubstituteset(test_row, test_combination, expected):
     }
 
     assert parser.get_combined_type(test_row, test_rule) == unordered(expected)
+
+
+OVERWRITE_OUTPUT = [  # expected "visit" rows for stop-overwriting.csv, one per subjid
+    {
+        "subject_id": 1,
+        "earliest_admission": "2023-11-19",  # combinedType "min" of first_admit (2023-11-20, 2023-11-19)
+        "start_date": "2023-11-20",  # firstNonNull: the admit row's value is kept, the later 2023-11-19 is not used
+        "treatment_antiviral_type": unordered(["Ribavirin", "Interferon"]),  # set merged across the discharge/day1/day2 rows
+    },
+    {
+        "subject_id": 2,
+        "start_date": "2022-11-23",  # enrolment from the admit row; the discharge row's 2020-11-23 does not replace it
+        "icu_admission_date": unordered(["2020-11-25", "2020-11-30"]),  # list built from the discharge and day1 rows
+        "treatment_antiviral_type": ["Lopinavir"],  # only daily_antiviral_type___2 is 1 (day1 row)
+    },
+    {
+        "subject_id": 3,
+        "start_date": "2020-02-20",  # enrolment from the admit row (first_admit is empty for this subject)
+        "treatment_antiviral_type": unordered(["Ribavirin", "Lopinavir", "Interferon"]),  # discharge + day1 rows combined
+    },
+]
+
+
+def test_no_overwriting():  # groupBy table: values from several rows per subject are combined, not overwritten
+    overwriting_output = list(  # collect every row of the parsed "visit" table for comparison
+        parser.Parser(TEST_PARSERS_PATH / "stop-overwriting.toml")  # spec using min/firstNonNull/list/set combinedTypes
+        .parse(TEST_SOURCES_PATH / "stop-overwriting.csv")  # source with multiple rows per subjid
+        .read_table("visit")
+    )
+    assert overwriting_output == OVERWRITE_OUTPUT  # list equality, so row order matters; unordered() relaxes order only inside list fields