Skip to content

Commit

Permalink
towards introducing authorship paths related to globalbioticinteracti…
Browse files Browse the repository at this point in the history
…ons/name-alignment-template#18 as requested by @eucharitidae
  • Loading branch information
Jorrit Poelen committed Apr 19, 2024
1 parent b8488d3 commit ded8a94
Show file tree
Hide file tree
Showing 18 changed files with 191 additions and 56 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -423,17 +423,19 @@ private void resolveHierarchyIfNeeded(
Taxon resolvedTaxon
) {
if (shouldResolveHierarchy(childParent, resolvedTaxon)) {
List<String> pathNames = new ArrayList<>();
List<String> pathIds = new ArrayList<>();
List<String> path = new ArrayList<>();

List<String> pathIds = new ArrayList<>();
List<String> pathNames = new ArrayList<>();
List<String> pathAuthorships = new ArrayList<>();

path.add(StringUtils.defaultIfBlank(resolvedTaxon.getName(), ""));

pathIds.add(resolvedTaxon.getExternalId());

pathNames.add(StringUtils.defaultIfBlank(resolvedTaxon.getRank(), ""));

pathAuthorships.add(StringUtils.defaultIfBlank(resolvedTaxon.getAuthorship(), ""));

T parent = childParent.get(focalTaxonKey);
List<T> visitedParents = new ArrayList<T>();
visitedParents.add(focalTaxonKey);
Expand All @@ -444,20 +446,23 @@ private void resolveHierarchyIfNeeded(
if (parentTaxonProperties != null) {
Taxon parentTaxon = TaxonUtil.mapToTaxon(parentTaxonProperties);
path.add(StringUtils.defaultIfBlank(parentTaxon.getName(), ""));
pathNames.add(StringUtils.defaultIfBlank(parentTaxon.getRank(), ""));
pathIds.add(parentTaxon.getExternalId());
pathNames.add(StringUtils.defaultIfBlank(parentTaxon.getRank(), ""));
pathAuthorships.add(StringUtils.defaultIfBlank(parentTaxon.getAuthorship(), ""));
}
visitedParents.add(parent);
parent = childParent.get(parent);
}

Collections.reverse(pathNames);
Collections.reverse(pathIds);
Collections.reverse(path);
Collections.reverse(pathIds);
Collections.reverse(pathNames);
Collections.reverse(pathAuthorships);

resolvedTaxon.setPath(StringUtils.join(path, CharsetConstant.SEPARATOR));
resolvedTaxon.setPathIds(StringUtils.join(pathIds, CharsetConstant.SEPARATOR));
resolvedTaxon.setPathNames(StringUtils.join(pathNames, CharsetConstant.SEPARATOR));
resolvedTaxon.setPathAuthorships(StringUtils.join(pathAuthorships, CharsetConstant.SEPARATOR));
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,12 @@ private void parseRecord(
return value;
});

Stream<String> pathAuthorshipStream = RANKS
.stream()
.map(rank -> "");

taxon.setPath(pathStream.collect(Collectors.joining(CharsetConstant.SEPARATOR)));
taxon.setPathAuthorships(pathAuthorshipStream.collect(Collectors.joining(CharsetConstant.SEPARATOR)));
String rankNames = RANKS
.stream()
.map(rank -> StringUtils.remove(rank, '_'))
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@
import java.io.InputStreamReader;
import java.net.URI;
import java.util.Map;
import java.util.stream.Collectors;
import java.util.stream.Stream;

public class IndexFungorumTaxonService extends CommonLongTaxonService {
private static final Logger LOG = LoggerFactory.getLogger(IndexFungorumTaxonService.class);
Expand Down Expand Up @@ -74,7 +76,9 @@ private void parseNodes(Map<Long, Map<String, String>> nodes,
taxon.setAuthorship(authorship);
}
taxon.setPath(StringUtils.join(new String[]{kingdomName, phylumName, subphylumName, className, subclassName, orderName, familyName, completeName}, CharsetConstant.SEPARATOR));
taxon.setPathNames(StringUtils.join(new String[]{"kingdom", "phylum", "subphylum", "class", "subclass", "order", "family", ""}, CharsetConstant.SEPARATOR));
String[] ranks = {"kingdom", "phylum", "subphylum", "class", "subclass", "order", "family", ""};
taxon.setPathNames(StringUtils.join(ranks, CharsetConstant.SEPARATOR));
taxon.setPathAuthorships(Stream.of(ranks).map(r -> "").collect(Collectors.joining(CharsetConstant.SEPARATOR)));
if (NumberUtils.isCreatable(taxId)) {
Long taxonKey = Long.parseLong(taxId);
registerIdForName(taxonKey, taxon, name2nodeIds);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -95,8 +95,15 @@ void parseNodes(Map<String, Map<String, String>> taxonMap,
: StringUtils.defaultIfBlank(value, "");
});

Stream<String> pathAuthorshipStream = RANKS
.stream()
.map(rank -> {
return "";
});

taxon.setPath(pathStream.collect(Collectors.joining(CharsetConstant.SEPARATOR)));
taxon.setPathNames(String.join(CharsetConstant.SEPARATOR, RANKS));
taxon.setPathAuthorships(pathAuthorshipStream.collect(Collectors.joining(CharsetConstant.SEPARATOR)));


String id = "https://www.mammaldiversity.org/explore.html#genus=" + genus + "&species=" + specificEpithet + "&id=" + taxonId;
Expand Down Expand Up @@ -124,19 +131,19 @@ void parseNodes(Map<String, Map<String, String>> taxonMap,
Stream<Taxon> subspeciesTaxa = subspecies
.stream()
.map(subspecificEpithetAndAuthor -> {
String[] s = subspecificEpithetAndAuthor.split(" ");
String subspecificEpithet = s[0];
String subspecificAuthorship = StringUtils.trim(RegExUtils.replaceFirst(subspecificEpithetAndAuthor, subspecificEpithet, ""));
Taxon subspecificTaxon = TaxonUtil.copy(taxon);
subspecificTaxon.setName(taxon.getName() + " " + subspecificEpithet);
subspecificTaxon.setPath(taxon.getPath() + CharsetConstant.SEPARATOR + subspecificEpithet);
subspecificTaxon.setPathNames(taxon.getPathNames() + CharsetConstant.SEPARATOR + "subspecificEpithet");
subspecificTaxon.setAuthorship(subspecificAuthorship);
String suspecificId = taxon.getExternalId() + "&subspecies=" + subspecificEpithet;
subspecificTaxon.setExternalId(suspecificId);
subspecificTaxon.setExternalUrl(suspecificId);
return subspecificTaxon;
});
String[] s = subspecificEpithetAndAuthor.split(" ");
String subspecificEpithet = s[0];
String subspecificAuthorship = StringUtils.trim(RegExUtils.replaceFirst(subspecificEpithetAndAuthor, subspecificEpithet, ""));
Taxon subspecificTaxon = TaxonUtil.copy(taxon);
subspecificTaxon.setName(taxon.getName() + " " + subspecificEpithet);
subspecificTaxon.setPath(taxon.getPath() + CharsetConstant.SEPARATOR + subspecificEpithet);
subspecificTaxon.setPathNames(taxon.getPathNames() + CharsetConstant.SEPARATOR + "subspecificEpithet");
subspecificTaxon.setAuthorship(subspecificAuthorship);
String suspecificId = taxon.getExternalId() + "&subspecies=" + subspecificEpithet;
subspecificTaxon.setExternalId(suspecificId);
subspecificTaxon.setExternalUrl(suspecificId);
return subspecificTaxon;
});

subspeciesTaxa.forEach(t -> {
registerTaxon(taxonMap, name2nodeIds, t.getExternalId(), t);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;
import java.util.stream.Stream;

public class NCBITaxonService extends PropertyEnricherSimple implements TermMatcher {
Expand All @@ -47,13 +48,15 @@ public class NCBITaxonService extends PropertyEnricherSimple implements TermMatc
private static final String NAME_IDS = "nameIds";
private static final String SYNONYM_IDS = "synonymIds";
private static final String COMMON_NAME_IDS = "commonNamesIds";
private static final String AUTHORITY_IDS = "authorityIds";


private final TermMatcherContext ctx;

private BTreeMap<String, List<String>> nameIds = null;
private BTreeMap<String, List<String>> synonymIds = null;
private BTreeMap<String, List<String>> commonNameIds = null;
private BTreeMap<String, List<String>> authoritiesIds = null;
private BTreeMap<String, String> mergedNodes = null;
private BTreeMap<String, Map<String, String>> ncbiDenormalizedNodes = null;

Expand Down Expand Up @@ -224,6 +227,7 @@ private void lazyInit() throws PropertyEnricherException {
nameIds = db.getTreeMap(NAME_IDS);
synonymIds = db.getTreeMap(SYNONYM_IDS);
commonNameIds = db.getTreeMap(COMMON_NAME_IDS);
authoritiesIds = db.getTreeMap(AUTHORITY_IDS);
} else {
LOG.info("NCBI taxonomy importing...");
StopWatch watch = new StopWatch();
Expand Down Expand Up @@ -271,6 +275,12 @@ private void lazyInit() throws PropertyEnricherException {
.valueSerializer(Serializer.JAVA)
.make();

authoritiesIds = db
.createTreeMap(AUTHORITY_IDS)
.keySerializer(BTreeKeySerializer.STRING)
.valueSerializer(Serializer.JAVA)
.make();


try {
parseMerged(mergedNodes, ctx.retrieve(getMergedNodesUrl()));
Expand All @@ -285,7 +295,14 @@ private void lazyInit() throws PropertyEnricherException {
.make();

try {
parseNames(ctx.retrieve(getNamesUrl()), ncbiNames, nameIds, commonNameIds, synonymIds);
parseNames(
ctx.retrieve(getNamesUrl()),
ncbiNames,
nameIds,
commonNameIds,
synonymIds,
authoritiesIds
);
} catch (IOException e) {
throw new PropertyEnricherException("failed to parse NCBI nodes", e);
}
Expand All @@ -296,7 +313,7 @@ private void lazyInit() throws PropertyEnricherException {
.keySerializer(BTreeKeySerializer.STRING)
.valueSerializer(Serializer.JAVA)
.make();
denormalizeTaxa(ncbiNodes, ncbiDenormalizedNodes, childParent, ncbiNames);
denormalizeTaxa(ncbiNodes, ncbiDenormalizedNodes, childParent, ncbiNames, authoritiesIds);

watch.stop();
TaxonCacheService.logCacheLoadStats(watch.getTime(), ncbiNodes.size(), LOG);
Expand Down Expand Up @@ -353,58 +370,94 @@ static void parseNodes(Map<String, Map<String, String>> taxonMap, Map<String, St
}
}

/**
 * Denormalizes every taxon in {@code taxonMap} by delegating each map entry to the
 * per-entry {@code denormalizeTaxa} overload, which stores its result in
 * {@code taxonMapDenormalized}.
 *
 * NOTE(review): this listing comes from a diff view, so the previous four-argument
 * signature and call appear next to the new ones that additionally pass
 * {@code authoritiesIds}.
 *
 * @param taxonMap             taxon properties keyed by taxon id; every entry is processed
 * @param taxonMapDenormalized receives the denormalized properties, keyed like {@code taxonMap}
 * @param childParent          maps a taxon id to the id of its parent
 * @param taxonNames           maps a taxon id to its name
 * @param authoritiesIds       maps a taxon id to its authority strings
 */
static void denormalizeTaxa(Map<String, Map<String, String>> taxonMap, Map<String, Map<String, String>> taxonMapDenormalized, Map<String, String> childParent, Map<String, String> taxonNames) {
static void denormalizeTaxa(
Map<String, Map<String, String>> taxonMap,
Map<String, Map<String, String>> taxonMapDenormalized,
Map<String, String> childParent,
Map<String, String> taxonNames,
Map<String, List<String>> authoritiesIds) {
Set<Map.Entry<String, Map<String, String>>> taxa = taxonMap.entrySet();
for (Map.Entry<String, Map<String, String>> taxon : taxa) {
denormalizeTaxa(taxonMap, taxonMapDenormalized, childParent, taxonNames, taxon);
denormalizeTaxa(taxonMap, taxonMapDenormalized, childParent, taxonNames, taxon, authoritiesIds);
}
}

/**
 * Denormalizes a single taxon: collects the taxon and its ancestors into four parallel
 * lists (names, ids, ranks, authorships), joins each list with
 * {@code CharsetConstant.SEPARATOR}, stores the joined strings on the taxon and puts the
 * resulting property map into {@code taxonEnrichMap} under the entry's key.
 *
 * NOTE(review): this listing comes from a diff view, so previous and new versions of
 * several statements (signature, list declarations, name lookup, reversals) appear
 * side by side.
 *
 * @param taxonMap       taxon properties keyed by taxon id; used to look up ancestors
 * @param taxonEnrichMap receives the denormalized properties of {@code taxon}
 * @param childParent    maps a taxon id to the id of its parent
 * @param names          maps a taxon id to its name
 * @param taxon          the entry (id and properties) to denormalize
 * @param authorityIds   maps a taxon id to its authority strings
 */
private static void denormalizeTaxa(Map<String, Map<String, String>> taxonMap, Map<String, Map<String, String>> taxonEnrichMap, Map<String, String> childParent, Map<String, String> names, Map.Entry<String, Map<String, String>> taxon) {
private static void denormalizeTaxa(
Map<String, Map<String, String>> taxonMap,
Map<String, Map<String, String>> taxonEnrichMap,
Map<String, String> childParent,
Map<String, String> names,
Map.Entry<String, Map<String, String>> taxon,
Map<String, List<String>> authorityIds) {
Map<String, String> childTaxon = taxon.getValue();
// Parallel lists, one element per taxon on the path: path holds names,
// pathIds holds external ids, pathNames holds ranks, pathAuthorships holds authorships.
List<String> pathNames = new ArrayList<>();
List<String> pathIds = new ArrayList<>();
List<String> path = new ArrayList<>();
List<String> pathIds = new ArrayList<>();
List<String> pathNames = new ArrayList<>();
List<String> pathAuthorships = new ArrayList<>();

// Seed every list with the focal taxon's own entry; blank values become "" so
// the lists keep the same length.
Taxon origTaxon = TaxonUtil.mapToTaxon(childTaxon);

String str = names.get(origTaxon.getExternalId());
origTaxon.setName(str);
path.add(StringUtils.defaultIfBlank(str, ""));
String name = names.get(origTaxon.getExternalId());
origTaxon.setName(name);
path.add(StringUtils.defaultIfBlank(name, ""));

String externalId = origTaxon.getExternalId();
// NOTE(review): writes back the id that was just read — looks redundant; confirm the setter has no side effects.
origTaxon.setExternalId(externalId);
pathIds.add(StringUtils.defaultIfBlank(externalId, ""));

// NOTE(review): writes back the rank that was just read — looks redundant; confirm the setter has no side effects.
origTaxon.setRank(origTaxon.getRank());
pathNames.add(StringUtils.defaultIfBlank(origTaxon.getRank(), ""));

String authorship = getAuthorshipById(authorityIds, name, origTaxon.getExternalId());
origTaxon.setAuthorship(authorship);
pathAuthorships.add(StringUtils.defaultIfBlank(origTaxon.getAuthorship(), ""));

// Walk up the parent chain. The walk stops at a blank parent id or at an id that is
// already on the path, which prevents looping forever on a cyclic parent relation.
String parent = childParent.get(taxon.getKey());
while (StringUtils.isNotBlank(parent) && !pathIds.contains(parent)) {
Map<String, String> stringStringMap = taxonMap.get(parent);
// A parent id without properties in taxonMap contributes nothing, but the walk continues past it.
if (stringStringMap != null) {
Taxon parentTaxon = TaxonUtil.mapToTaxon(stringStringMap);
pathNames.add(StringUtils.defaultIfBlank(parentTaxon.getRank(), ""));
String parentName = names.get(parentTaxon.getExternalId());
path.add(StringUtils.defaultIfBlank(parentName, ""));

String parentAuthorship = getAuthorshipById(authorityIds, parentName, parentTaxon.getExternalId());

pathIds.add(StringUtils.defaultIfBlank(parentTaxon.getExternalId(), ""));
path.add(StringUtils.defaultIfBlank(names.get(parentTaxon.getExternalId()), ""));
pathNames.add(StringUtils.defaultIfBlank(parentTaxon.getRank(), ""));
pathAuthorships.add(StringUtils.defaultIfBlank(parentAuthorship, ""));
}
parent = childParent.get(parent);
}

// Entries were collected child-first; reverse so the most distant ancestor comes
// first and the focal taxon comes last.
Collections.reverse(pathNames);
Collections.reverse(pathIds);
Collections.reverse(path);
Collections.reverse(pathIds);
Collections.reverse(pathNames);
Collections.reverse(pathAuthorships);

origTaxon.setPath(StringUtils.join(path, CharsetConstant.SEPARATOR));
origTaxon.setPathIds(StringUtils.join(pathIds, CharsetConstant.SEPARATOR));
origTaxon.setPathNames(StringUtils.join(pathNames, CharsetConstant.SEPARATOR));
origTaxon.setPathAuthorships(StringUtils.join(pathAuthorships, CharsetConstant.SEPARATOR));

taxonEnrichMap.put(taxon.getKey(), TaxonUtil.taxonToMap(origTaxon));
}

/**
 * Extracts the authorship of a taxon from the authority strings registered for its id.
 *
 * The first authority string that starts with {@code name} is selected, the leading
 * name is stripped and the trimmed remainder is returned.
 *
 * @param authorityIds maps a taxon id to its authority strings
 * @param name         taxon name expected at the start of the authority string
 * @param externalId   id of the taxon to look up
 * @return the authorship, or an empty string when the id has no authority strings,
 *         none of them starts with {@code name}, or nothing but whitespace remains
 */
private static String getAuthorshipById(Map<String, List<String>> authorityIds, String name, String externalId) {
    List<String> authorities = authorityIds.get(externalId);
    if (authorities == null) {
        return "";
    }
    for (String authority : authorities) {
        if (StringUtils.startsWith(authority, name)) {
            // Strip only the leading name: replacing every occurrence would also cut
            // the name out of the authorship text when it happens to contain it.
            String authorship = StringUtils.trim(StringUtils.removeStart(authority, name));
            return StringUtils.defaultIfBlank(authorship, "");
        }
    }
    return "";
}

static void parseNames(InputStream resourceAsStream, Map<String, String> nameMap,
Map<String, List<String>> nameIds,
Map<String, List<String>> commonNameIds,
Map<String, List<String>> synonymIds) throws PropertyEnricherException {
Map<String, List<String>> synonymIds,
Map<String, List<String>> authorityIds) throws PropertyEnricherException {

BufferedReader reader = new BufferedReader(new InputStreamReader(resourceAsStream));
String line;
Expand Down Expand Up @@ -434,15 +487,15 @@ static void parseNames(InputStream resourceAsStream, Map<String, String> nameMap
"teleomorph",
"type material");

String ncbiTaxonId = TaxonomyProvider.ID_PREFIX_NCBI + taxId;
if (StringUtils.equals("scientific name", taxonNameClass)) {
String ncbiTaxonId = TaxonomyProvider.ID_PREFIX_NCBI + taxId;
nameMap.put(ncbiTaxonId, taxonName);
addIdMapEntry(nameIds, taxonName, ncbiTaxonId);
} else if (StringUtils.equals("authority", taxonNameClass)) {
addIdMapEntry(authorityIds, ncbiTaxonId, taxonName);
} else if (StringUtils.equals("synonym", taxonNameClass)) {
String ncbiTaxonId = TaxonomyProvider.ID_PREFIX_NCBI + taxId;
addIdMapEntry(synonymIds, taxonName, ncbiTaxonId);
} else if (Arrays.asList("genbank common name", "common name").contains(taxonNameClass)) {
String ncbiTaxonId = TaxonomyProvider.ID_PREFIX_NCBI + taxId;
addIdMapEntry(commonNameIds, taxonName, ncbiTaxonId);
}

Expand All @@ -453,17 +506,17 @@ static void parseNames(InputStream resourceAsStream, Map<String, String> nameMap
}
}

/**
 * Appends an entry to the list stored under a key of a multi-valued lookup table.
 * A new list is created when the key has none yet, an entry that is already in the
 * list is not added again, and the list is put back into the table in every case.
 *
 * NOTE(review): this listing comes from a diff view, so the previous parameter names
 * (nameIds, taxonName, key) appear next to the new ones (lookupTable, key, value).
 * In the new naming the second parameter is the map key and the third the list entry.
 */
private static void addIdMapEntry(Map<String, List<String>> nameIds,
String taxonName,
String key) {
List<String> ids = nameIds.get(taxonName);
if (ids == null) {
ids = new ArrayList<>();
private static void addIdMapEntry(Map<String, List<String>> lookupTable,
String key,
String value) {
List<String> values = lookupTable.get(key);
if (values == null) {
values = new ArrayList<>();
}
if (!ids.contains(key)) {
ids.add(key);
if (!values.contains(value)) {
values.add(value);
}
// The list is written back even when it was already in the table.
// NOTE(review): the tables created in lazyInit use a serializing value store, so an
// in-place change to a retrieved list is presumably not persisted without this put — confirm.
nameIds.put(taxonName, ids);
lookupTable.put(key, values);
}

static void parseMerged(Map<String, String> mergedMap, InputStream resourceAsStream) throws PropertyEnricherException {
Expand Down

0 comments on commit ded8a94

Please sign in to comment.