Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Allow external definitions, fixes #10 #66

Merged
merged 1 commit into from
May 19, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 33 additions & 2 deletions adtl/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -385,14 +385,31 @@ def make_fields_optional(
return _schema


def relative_path(source_file, target_file):
    """Return *target_file* placed in the directory that contains *source_file*.

    Only the parent directory of *source_file* is used; the file name itself
    is discarded before *target_file* is joined onto it.
    """
    base_dir = Path(source_file).parent
    return base_dir.joinpath(target_file)


def read_definition(file: Union[str, Path]) -> Dict[str, Any]:
    """Read a definition file into a dictionary.

    Args:
        file: Location of a JSON (``.json``) or TOML (``.toml``) file. Both
            ``Path`` objects and plain strings are accepted, so file names
            taken directly from the command line work. The suffix is matched
            case-insensitively.

    Returns:
        The parsed contents of the file (expected to be a mapping of
        definition names to definitions).

    Raises:
        ValueError: If the suffix is neither ``.json`` nor ``.toml``.
    """
    # Callers may hand over str paths (e.g. argparse values); normalise once
    # so that .suffix and .open are always available.
    path = Path(file)
    suffix = path.suffix.lower()
    if suffix == ".json":
        # JSON is UTF-8 by specification; do not depend on the locale default.
        with path.open(encoding="utf-8") as fp:
            return json.load(fp)
    if suffix == ".toml":
        # tomli only accepts a binary file handle.
        with path.open("rb") as fp:
            return tomli.load(fp)
    raise ValueError(f"Unsupported file format: {file}")


class Parser:
def __init__(self, spec: Union[str, Path, StrDict]):
def __init__(self, spec: Union[str, Path, StrDict], include_defs: List[str] = []):
"Loads specification from spec in format (default json)"

self.data: StrDict = {}
self.defs: StrDict = {}
self.fieldnames: Dict[str, List[str]] = {}
self.specfile = None
self.include_defs = include_defs
self.validators: StrDict = {}
self.schemas: StrDict = {}
self.date_fields = []
Expand All @@ -414,7 +431,15 @@ def __init__(self, spec: Union[str, Path, StrDict]):
else:
self.spec = spec
self.header = self.spec.get("adtl", {})
if self.specfile:
self.include_defs = [
relative_path(self.specfile, definition_file)
for definition_file in self.header.get("include-def", [])
] + self.include_defs
self.defs = self.header.get("defs", {})
if self.include_defs:
for definition_file in self.include_defs:
self.defs.update(read_definition(definition_file))
self.spec = expand_refs(self.spec, self.defs)

self.validate_spec()
Expand Down Expand Up @@ -714,8 +739,14 @@ def main():
cmd.add_argument(
"--encoding", help="Encoding input file is in", default="utf-8-sig"
)
cmd.add_argument(
"--include-def",
action="append",
help="Include external definition (TOML or JSON)",
)
args = cmd.parse_args()
spec = Parser(args.spec)
include_defs = args.include_def or []
spec = Parser(args.spec, include_defs=include_defs)
if output := spec.parse(args.file, encoding=args.encoding).save(
args.output or spec.name
):
Expand Down
41 changes: 39 additions & 2 deletions docs/specification.md
Original file line number Diff line number Diff line change
Expand Up @@ -42,9 +42,9 @@ These metadata fields are defined under a header key `adtl`.

### Optional fields

### Optional fields

* **defs**: Definitions that can be referred to elsewhere in the schema
* **include-def** (list): List of additional TOML or JSON files to import as
definitions; paths are resolved relative to the specification file
* **defaultDateFormat**: Default source date format, applied to all fields
with either "date_" / "_date" in the field name or that have format date
set in the JSON schema
Expand Down Expand Up @@ -81,6 +81,43 @@ someTable = { groupBy = "subjid", aggregation = "lastNotNull" }
someReference = { values = { 1 = true, 2 = false } }
```

Often some definitions are repeated across files. adtl supports including
definitions from external files using the *include-def* keyword under the
`[adtl]` section. As an example, a mapping of country codes to country names
could be stored in `countries.toml`:

```toml
[countryMap.values]
1 = "ALB"
2 = "ZZZ"
# and so on
```

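This file can then be included in an adtl specification, and its definitions
referenced just as if they had been defined in the specification file directly.
Paths listed in `include-def` are resolved relative to the specification file: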

```toml
[adtl]
include-def = ["countries.toml"]

# ...

[cases.country_iso3]
field = "country"
ref = "countryMap"
```

Definition files can also be included from the command line by passing the
`--include-def` flag to adtl. This is useful when the included file can change
from one run to another, or in cases where the definitions/mappings are located
externally. The following would produce an equivalent result to the
`include-def` assignment in the above example, assuming `data.csv` is the source
data file:

```shell
adtl parser.toml data.csv --include-def countries.toml
```

## Table mappings

Each table has its associated field mappings under a key of the same
Expand Down
30 changes: 30 additions & 0 deletions tests/parsers/groupBy-defs-include.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
[adtl]
name = "groupBy"
description = "Example using groupBy"
include-def = ["include-def.toml"]

[subject]
dataset_id = "dataset-2020-03-23"
country_iso3 = "GBR"

[subject.sex_at_birth]
ref = "sexMapping"
field = "sex"
description = "Sex at Birth"

[subject.subject_id]
field = "subjid"
description = "Subject ID"

[subject.enrolment_date]
field = "dsstdat"
description = "Date of Enrolment"

[subject.admission_date]
field = "hostdat"
description = "Admission date at this facility"

[adtl.tables.subject]
kind = "groupBy"
groupBy = "subject_id"
aggregation = "lastNotNull"
29 changes: 29 additions & 0 deletions tests/parsers/groupBy-external-defs.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
[adtl]
name = "groupBy"
description = "Example using groupBy"

[subject]
dataset_id = "dataset-2020-03-23"
country_iso3 = "GBR"

[subject.sex_at_birth]
ref = "sexMapping"
field = "sex"
description = "Sex at Birth"

[subject.subject_id]
field = "subjid"
description = "Subject ID"

[subject.enrolment_date]
field = "dsstdat"
description = "Date of Enrolment"

[subject.admission_date]
field = "hostdat"
description = "Admission date at this facility"

[adtl.tables.subject]
kind = "groupBy"
groupBy = "subject_id"
aggregation = "lastNotNull"
4 changes: 4 additions & 0 deletions tests/parsers/include-def.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
[sexMapping.values]
1 = "male"
2 = "female"
3 = "non_binary"
24 changes: 20 additions & 4 deletions tests/test_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -626,10 +626,26 @@ def test_reference_expansion():
assert ps_ref.spec == ps_noref.spec


def test_format_equivalence():
    """The JSON and TOML versions of the groupBy-defs parser give equal specs."""
    spec_from_json = parser.Parser(TEST_PARSERS_PATH / "groupBy-defs.json").spec
    spec_from_toml = parser.Parser(TEST_PARSERS_PATH / "groupBy-defs.toml").spec
    assert spec_from_json == spec_from_toml
def test_reference_expansion_with_include():
    """Inline ``defs`` and ``include-def`` files must expand to the same spec."""
    inline_spec = parser.Parser(TEST_PARSERS_PATH / "groupBy-defs.toml").spec
    included_spec = parser.Parser(
        TEST_PARSERS_PATH / "groupBy-defs-include.toml"
    ).spec
    # The two headers differ only in how the definitions are supplied, so
    # drop those keys before comparing the expanded specifications.
    del inline_spec["adtl"]["defs"]
    del included_spec["adtl"]["include-def"]
    assert inline_spec == included_spec


def test_external_definitions():
    """Definitions passed via ``include_defs`` are loaded into ``Parser.defs``."""
    spec_path = TEST_PARSERS_PATH / "groupBy-external-defs.toml"
    definitions_path = TEST_PARSERS_PATH / "include-def.toml"

    # The spec refers to "sexMapping" but defines nothing itself, so loading
    # it without the external definitions is expected to fail.
    with pytest.raises(KeyError):
        parser.Parser(spec_path)

    expected_values = {"1": "male", "2": "female", "3": "non_binary"}
    ps = parser.Parser(spec_path, include_defs=[definitions_path])
    assert ps.defs["sexMapping"]["values"] == expected_values


FOR_PATTERN = [
Expand Down
Loading