-
Notifications
You must be signed in to change notification settings - Fork 33
ENH+BF: Format species names consistently in species_map, make matching more stringent for common names, improved testing #1866
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: enh-macaca-radiata
Are you sure you want to change the base?
Changes from all commits
1a2d7a9
5a9d473
e98375c
1177640
2eadb00
fbe2f6c
db81fa7
a8834c2
e85ef07
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -332,7 +332,7 @@ def extract_cellLine(metadata: dict) -> str | None: | |
|
|
||
| NCBITAXON_URI_TEMPLATE = "http://purl.obolibrary.org/obo/NCBITaxon_{}" | ||
|
|
||
| # common_names, prefix, uri, name | ||
| # common_names, prefix, uri, name ({current name} - {GenBank common name}) | ||
| species_map = [ | ||
| ( | ||
| ["mouse"], | ||
|
|
@@ -347,7 +347,7 @@ def extract_cellLine(metadata: dict) -> str | None: | |
| "Homo sapiens - Human", | ||
| ), | ||
| ( | ||
| ["rat", "norvegicus"], | ||
| ["brown rat", "rat", "norvegicus"], | ||
| None, | ||
| NCBITAXON_URI_TEMPLATE.format("10116"), | ||
| "Rattus norvegicus - Norway rat", | ||
|
|
@@ -386,13 +386,19 @@ def extract_cellLine(metadata: dict) -> str | None: | |
| ["c. elegans", "caenorhabditis elegans"], | ||
| "caenorhabditis", | ||
| NCBITAXON_URI_TEMPLATE.format("6239"), | ||
| "Caenorhabditis elegans", | ||
| "Caenorhabditis elegans - Roundworm", | ||
| ), | ||
| ( | ||
| ["pig-tailed macaque", "pigtail monkey", "pigtail macaque"], | ||
| None, | ||
| NCBITAXON_URI_TEMPLATE.format("9545"), | ||
| "Macaca nemestrina", | ||
| "Macaca nemestrina - Pig-tailed macaque", | ||
| ), | ||
| ( | ||
| ["bonnet macaque", "bonnet monkey", "radiata"], | ||
| None, | ||
| NCBITAXON_URI_TEMPLATE.format("9548"), | ||
| "Macaca radiata - Bonnet macaque", | ||
| ), | ||
| ( | ||
| ["bonnet macaque", "bonnet monkey", "radiata"], | ||
|
|
@@ -404,13 +410,13 @@ def extract_cellLine(metadata: dict) -> str | None: | |
| ["mongolian gerbil", "mongolian jird"], | ||
| None, | ||
| NCBITAXON_URI_TEMPLATE.format("10047"), | ||
| "Meriones unguiculatus", | ||
| "Meriones unguiculatus - Mongolian gerbil", | ||
| ), | ||
| ( | ||
| ["common paper wasp"], | ||
| None, | ||
| NCBITAXON_URI_TEMPLATE.format("30207"), | ||
| "Polistes fuscatus", | ||
| "Polistes fuscatus - Common paper wasp", | ||
| ), | ||
| ] | ||
|
|
||
|
|
@@ -459,7 +465,7 @@ def parse_purlobourl( | |
|
|
||
| def extract_species(metadata: dict) -> models.SpeciesType | None: | ||
| value_orig = metadata.get("species", None) | ||
| value_id = None | ||
| value_matches: list[tuple[str, str | None]] = [] # of (value_id, value) | ||
| if value_orig is not None and value_orig != "": | ||
| if m := re.fullmatch( | ||
| r"https?://purl\.obolibrary\.org/obo/NCBITaxon_([0-9]+)/?", | ||
|
|
@@ -469,8 +475,7 @@ def extract_species(metadata: dict) -> models.SpeciesType | None: | |
| normed_value = NCBITAXON_URI_TEMPLATE.format(m[1]) | ||
| for _common_names, _prefix, uri, name in species_map: | ||
| if uri == normed_value: | ||
| value_id = uri | ||
| value = name | ||
| value_matches.append((uri, name)) | ||
| break | ||
| else: | ||
| value_id = value_orig | ||
|
|
@@ -487,18 +492,24 @@ def extract_species(metadata: dict) -> models.SpeciesType | None: | |
| value = " - ".join( | ||
| [result[key] for key in lookup if key in result] | ||
| ) | ||
| value_matches.append((value_id, value)) | ||
| else: | ||
| lower_value = value_orig.lower() | ||
| lower_value = value_orig.lower().strip() | ||
| for common_names, prefix, uri, name in species_map: | ||
| if ( | ||
| lower_value == name.lower() | ||
| or any(key in lower_value for key in common_names) | ||
| or (prefix is not None and lower_value.startswith(prefix)) | ||
| or any(lower_value == v.lower() for v in name.partition(" - ")) | ||
| ): | ||
| value_id = uri | ||
| value = name | ||
| break | ||
| if value_id is None: | ||
| value_matches.append((uri, name)) | ||
| # only if no matches -- try to match by common names within common names | ||
| if not value_matches: | ||
| for common_names, prefix, uri, name in species_map: | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Towards consistency (+/- future class-based representation) the line above marked such fields as private in the scope, (not must sense to me) so might want to adjust that to match; https://github.com/dandi/dandi-cli/pull/1866/changes#diff-f1fffc52a57ec0f6b12d707f48d19c51f4dd81a4606d3535935d38e10e710bbdL470 Interesting though, https://github.com/dandi/dandi-cli/pull/1866/changes#diff-f1fffc52a57ec0f6b12d707f48d19c51f4dd81a4606d3535935d38e10e710bbdR498 didn't |
||
| if any(key == lower_value for key in common_names): | ||
| value_matches.append((uri, name)) | ||
|
Comment on lines
+508
to
+509
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Plus if this were a class unto itself you could define such equivalency logic as a method within it to make a little more sense from the outside looking in, such as |
||
|
|
||
| value_matches = list(set(value_matches)) # unique values | ||
| if not value_matches: | ||
| raise ValueError( | ||
| f"Cannot interpret species field: {value_orig}. Please " | ||
| "contact help@dandiarchive.org to add your species. " | ||
|
|
@@ -509,6 +520,14 @@ def extract_species(metadata: dict) -> models.SpeciesType | None: | |
| "url for the species Homo sapiens. Please note that " | ||
| "this url is case sensitive." | ||
| ) | ||
| elif len(value_matches) > 1: | ||
| raise ValueError( | ||
| f"Got multiple ({len(value_matches)}) species matched for " | ||
| f"{value_orig}: {value_matches}. Should not happen. " | ||
| "Either you need to be more specific or our code needs " | ||
| "fixing - contact help@dandiarchive.org." | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I do like including such info in error since it gives the user something Actionable~(A) at least |
||
| ) | ||
| value_id, value = value_matches[0] | ||
| return models.SpeciesType(identifier=value_id, name=value) | ||
| else: | ||
| return None | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Do you think, as this list grows and becomes more complicated, it might be better as a formal model, or a data class, or at the very least a
namedtuple? Instead of just loosely structured a list of tuplesThere was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
good idea, filed an issue , feel welcome to put your claude to it after we are done with this PR
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Sure, sure; out of credits for this month so maybe when back from break