Skip to content

Commit

Permalink
first pass towards supporting offline matching against plazi treatmen…
Browse files Browse the repository at this point in the history
…ts; related to #23
  • Loading branch information
jhpoelen committed Sep 30, 2020
1 parent 948a962 commit 7f85ef0
Show file tree
Hide file tree
Showing 10 changed files with 409 additions and 222 deletions.
4 changes: 2 additions & 2 deletions nomer-parent/pom.xml
Expand Up @@ -14,8 +14,8 @@
<project.build.resourceEncoding>UTF-8</project.build.resourceEncoding>
<project.build.testResourceEncoding>UTF-8</project.build.testResourceEncoding>
<project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
<!--<globi.version>0.19.1</globi.version>-->
<globi.version>0.6-SNAPSHOT</globi.version>
<globi.version>0.19.1</globi.version>
<!--<globi.version>0.6-SNAPSHOT</globi.version>-->
<maven-assembly-plugin.version>3.1.0</maven-assembly-plugin.version>
</properties>

Expand Down
5 changes: 5 additions & 0 deletions nomer-taxon-resolver/pom.xml
Expand Up @@ -20,6 +20,11 @@
<artifactId>eol-globi-taxon-resolver</artifactId>
<version>${globi.version}</version>
</dependency>
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-compress</artifactId>
<version>1.20</version>
</dependency>
<dependency>
<groupId>org.apache.jena</groupId>
<artifactId>apache-jena-libs</artifactId>
Expand Down
@@ -0,0 +1,201 @@
package org.globalbioticinteractions.nomer.match;

import org.apache.commons.compress.archivers.ArchiveEntry;
import org.apache.commons.compress.archivers.ArchiveException;
import org.apache.commons.compress.archivers.ArchiveInputStream;
import org.apache.commons.compress.archivers.ArchiveStreamFactory;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.time.StopWatch;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.eol.globi.data.CharsetConstant;
import org.eol.globi.domain.PropertyAndValueDictionary;
import org.eol.globi.domain.Taxon;
import org.eol.globi.domain.TaxonImpl;
import org.eol.globi.domain.TaxonomyProvider;
import org.eol.globi.service.PropertyEnricherException;
import org.eol.globi.service.TaxonUtil;
import org.eol.globi.taxon.PropertyEnricherSimple;
import org.eol.globi.taxon.TaxonCacheListener;
import org.eol.globi.taxon.TaxonCacheService;
import org.globalbioticinteractions.nomer.util.PropertyEnricherInfo;
import org.globalbioticinteractions.nomer.util.TermMatcherContext;
import org.mapdb.BTreeKeySerializer;
import org.mapdb.BTreeMap;
import org.mapdb.DB;
import org.mapdb.DBMaker;

import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;

/**
 * Matches taxon names against Plazi treatments using a local, file-backed index.
 *
 * <p>The index is built lazily on the first lookup: the archive configured via the
 * {@code nomer.plazi.treatments.archive} property is scanned for {@code .ttl} entries,
 * each entry is handed to {@link PlaziTreatmentsLoader}, and the resulting taxa are stored
 * by name in a MapDB tree map inside the cache directory. Later runs reuse the stored index.
 */
@PropertyEnricherInfo(name = "plazi", description = "Lookup Plazi taxon treatment by name or id using offline-enabled database dump")
public class PlaziService extends PropertyEnricherSimple {

    private static final Log LOG = LogFactory.getLog(PlaziService.class);
    private static final String TREATMENTS = "treatments";


    private final TermMatcherContext ctx;

    // Remains null until an index has been opened or imported completely.
    private BTreeMap<String, Map<String, String>> treatments = null;

    public PlaziService(TermMatcherContext ctx) {
        this.ctx = ctx;
    }

    /**
     * Looks up the provided name in the treatment index.
     *
     * @param properties taxon properties; only {@link PropertyAndValueDictionary#NAME} is used for the lookup
     * @return a copy of the indexed properties when the name is found, otherwise a copy of {@code properties}
     * @throws PropertyEnricherException if the index cannot be initialized
     */
    @Override
    public Map<String, String> enrich(Map<String, String> properties) throws PropertyEnricherException {
        Map<String, String> enriched = new TreeMap<>(properties);
        String name = properties.get(PropertyAndValueDictionary.NAME);
        if (StringUtils.isNotBlank(name)) {
            if (needsInit()) {
                if (ctx == null) {
                    throw new PropertyEnricherException("context needed to initialize");
                }
                lazyInit();
            }
            Map<String, String> enrichedProperties = treatments.get(name);
            enriched = enrichedProperties == null ? enriched : new TreeMap<>(enrichedProperties);
        }
        return enriched;
    }


    /**
     * Parses pipe-delimited taxon rows into a taxon map and a child-to-parent id map.
     *
     * <p>Rows with fewer than 26 columns are skipped. The stream is read to its end but not closed;
     * closing it is left to the caller.
     *
     * @param taxonMap         receives taxon properties keyed by prefixed taxon id
     * @param childParent      receives prefixed taxon id to prefixed parent taxon id
     * @param rankIdNameMap    maps {@code <kingdomId>-<rankId>} to a rank name
     * @param resourceAsStream pipe-delimited rows, one taxon per line
     * @throws PropertyEnricherException if reading from the stream fails
     */
    static void parseNodes(Map<String, Map<String, String>> taxonMap,
                           Map<String, String> childParent,
                           Map<String, String> rankIdNameMap,
                           InputStream resourceAsStream) throws PropertyEnricherException {
        BufferedReader reader = new BufferedReader(new InputStreamReader(resourceAsStream));

        String line;
        try {
            while ((line = reader.readLine()) != null) {
                String[] rowValues = StringUtils.splitByWholeSeparatorPreserveAllTokens(line, "|");
                // index 25 is read below, so at least 26 columns are required
                if (rowValues.length > 25) {
                    String taxId = rowValues[0];
                    String parentTaxId = rowValues[17];
                    String rankKingdomId = rowValues[20];
                    String rankId = rowValues[21];
                    String rankKey = rankKingdomId + "-" + rankId;
                    String rank = rankIdNameMap.getOrDefault(rankKey, rankKey);

                    String completeName = rowValues[25];

                    String externalId = TaxonomyProvider.ID_PREFIX_ITIS + taxId;
                    TaxonImpl taxon = new TaxonImpl(completeName, externalId);
                    taxon.setRank(StringUtils.equals(StringUtils.trim(rank), "no rank") ? "" : rank);
                    taxonMap.put(externalId, TaxonUtil.taxonToMap(taxon));
                    childParent.put(
                            TaxonomyProvider.ID_PREFIX_ITIS + taxId,
                            TaxonomyProvider.ID_PREFIX_ITIS + parentTaxId
                    );
                }
            }
        } catch (IOException e) {
            throw new PropertyEnricherException("failed to parse ITIS taxon dump", e);
        }
    }


    /**
     * Opens the on-disk treatment index, importing it from the configured archive when absent.
     *
     * <p>The {@code treatments} field is only assigned once an index is fully available, so a
     * failed import is never mistaken for a usable (but incomplete) index.
     */
    private void lazyInit() throws PropertyEnricherException {
        File cacheDir = getCacheDir(this.ctx);
        if (!cacheDir.exists()) {
            if (!cacheDir.mkdirs()) {
                throw new PropertyEnricherException("failed to create cache dir at [" + cacheDir.getAbsolutePath() + "]");
            }
        }

        File taxonomyDir = new File(cacheDir, "plazi");
        DB db = DBMaker
                .newFileDB(taxonomyDir)
                .mmapFileEnableIfSupported()
                .compressionEnable()
                .closeOnJvmShutdown()
                .transactionDisable()
                .make();

        if (db.exists(TREATMENTS)) {
            LOG.info("Plazi taxonomy already indexed at [" + taxonomyDir.getAbsolutePath() + "], no need to import.");
            treatments = db.getTreeMap(TREATMENTS);
        } else {
            LOG.info("Plazi treatments importing...");
            StopWatch watch = new StopWatch();
            watch.start();

            final BTreeMap<String, Map<String, String>> index = db
                    .createTreeMap(TREATMENTS)
                    .keySerializer(BTreeKeySerializer.STRING)
                    .make();

            boolean imported = false;
            try {
                importTreatments(index);
                imported = true;
            } finally {
                if (!imported) {
                    // transactions are disabled, so a half-filled collection would otherwise
                    // persist and be picked up as "already indexed" on the next start
                    discardPartialIndex(db);
                }
            }
            treatments = index;

            watch.stop();
            TaxonCacheService.logCacheLoadStats(watch.getTime(), treatments.size(), LOG);
            LOG.info("Plazi treatments imported.");
        }
    }

    /**
     * Streams the configured archive and adds the taxa of every {@code .ttl} entry to the index.
     */
    private void importTreatments(final Map<String, Map<String, String>> index) throws PropertyEnricherException {
        String archiveUrl = getArchiveUrl();
        if (StringUtils.isBlank(archiveUrl)) {
            throw new PropertyEnricherException("no archive configured: please set [nomer.plazi.treatments.archive]");
        }

        TaxonCacheListener listener = new TaxonCacheListener() {

            @Override
            public void start() {

            }

            @Override
            public void addTaxon(Taxon taxon) {
                // blank names are never looked up by enrich(), and a null key cannot be stored
                if (StringUtils.isNotBlank(taxon.getName())) {
                    index.put(taxon.getName(), TaxonUtil.taxonToMap(taxon));
                }
            }

            @Override
            public void finish() {

            }
        };

        try (InputStream resource = this.ctx.getResource(archiveUrl)) {
            if (resource == null) {
                throw new PropertyEnricherException("failed to access archive at [" + archiveUrl + "]");
            }
            // archive type detection requires mark/reset, which the provided stream may not support
            try (ArchiveInputStream archiveInputStream = new ArchiveStreamFactory()
                    .createArchiveInputStream(new BufferedInputStream(resource))) {
                ArchiveEntry nextEntry;
                while ((nextEntry = archiveInputStream.getNextEntry()) != null) {
                    if (!nextEntry.isDirectory() && StringUtils.endsWith(nextEntry.getName(), ".ttl")) {
                        PlaziTreatmentsLoader.importTreatment(archiveInputStream, listener);
                    }
                }
            }
        } catch (IOException | ArchiveException e) {
            throw new PropertyEnricherException("failed to load archive", e);
        }
    }

    /**
     * Removes the incomplete treatments collection and releases the database after a failed import.
     * Problems during cleanup are logged so that the original import failure is the one reported.
     */
    private static void discardPartialIndex(DB db) {
        try {
            db.delete(TREATMENTS);
            db.close();
        } catch (RuntimeException e) {
            LOG.warn("failed to discard partially imported Plazi treatments", e);
        }
    }

    private boolean needsInit() {
        return treatments == null;
    }

    @Override
    public void shutdown() {
        // nothing to do here: the database is configured with closeOnJvmShutdown()
    }

    private File getCacheDir(TermMatcherContext ctx) {
        File cacheDir = new File(ctx.getCacheDir(), "plazi");
        cacheDir.mkdirs();
        return cacheDir;
    }

    private String getArchiveUrl() throws PropertyEnricherException {
        return ctx.getProperty("nomer.plazi.treatments.archive");
    }

}
@@ -0,0 +1,95 @@
package org.globalbioticinteractions.nomer.match;

import com.hp.hpl.jena.ontology.OntModel;
import com.hp.hpl.jena.query.Query;
import com.hp.hpl.jena.query.QueryExecution;
import com.hp.hpl.jena.query.QueryExecutionFactory;
import com.hp.hpl.jena.query.QueryFactory;
import com.hp.hpl.jena.query.QuerySolution;
import com.hp.hpl.jena.query.ResultSet;
import com.hp.hpl.jena.rdf.model.ModelFactory;
import com.hp.hpl.jena.rdf.model.RDFNode;
import org.eol.globi.data.CharsetConstant;
import org.eol.globi.domain.PropertyAndValueDictionary;
import org.eol.globi.domain.Taxon;
import org.eol.globi.service.TaxonUtil;
import org.eol.globi.taxon.TaxonCacheListener;

import java.io.InputStream;
import java.util.AbstractMap;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;

/**
 * Extracts taxon concepts from a single Plazi treatment serialized as RDF/Turtle.
 */
public class PlaziTreatmentsLoader {

    // SPARQL variables holding the rank values that are copied into the taxon properties
    private static final List<String> TAXON_RANK_VARIABLES =
            Arrays.asList("?species", "?genus", "?family", "?order", "?class", "?phylum", "?kingdom");

    private static final String TAXON_CONCEPT_QUERY =
            "PREFIX tr: <http://www.thomsonreuters.com/>\n" +
                    "PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>\n" +
                    "PREFIX trt: <http://plazi.org/vocab/treatment#>\n" +
                    "PREFIX fp: <http://filteredpush.org/ontologies/oa/dwcFP#>\n" +
                    "PREFIX dwc: <http://rs.tdwg.org/dwc/terms/>\n" +
                    "PREFIX dc: <http://purl.org/dc/elements/1.1/>\n" +
                    "SELECT * WHERE { \n" +
                    "  ?treatment (trt:augmentsTaxonConcept|trt:definesTaxonConcept|trt:deprecates) ?tc .\n" +
                    "  ?tc trt:hasTaxonName ?tn .\n" +
                    "  ?tc a fp:TaxonConcept .\n" +
                    "  OPTIONAL { ?tc dwc:species ?species . }\n" +
                    "  OPTIONAL { ?tc dwc:genus ?genus . }\n" +
                    "  OPTIONAL { ?tc dwc:family ?family . }\n" +
                    "  OPTIONAL { ?tc dwc:class ?class . }\n" +
                    "  OPTIONAL { ?tc dwc:order ?order . }\n" +
                    "  OPTIONAL { ?tc dwc:phylum ?phylum . }\n" +
                    "  OPTIONAL { ?tc dwc:kingdom ?kingdom . }\n" +
                    "  OPTIONAL { ?tc dwc:rank ?rank . }\n" +
                    "}";

    /**
     * Reads a treatment graph and reports every taxon concept it augments, defines or deprecates.
     *
     * <p>The stream is read but not closed by this method, so it may be an entry of a larger archive stream.
     *
     * @param treatmentGraph treatment graph in Turtle syntax
     * @param listener       receives one {@link Taxon} per query solution via {@code addTaxon}
     */
    public static void importTreatment(InputStream treatmentGraph, TaxonCacheListener listener) {
        OntModel m = ModelFactory.createOntologyModel();
        try {
            m.read(treatmentGraph, null, "TURTLE");

            Query query = QueryFactory.create(TAXON_CONCEPT_QUERY);
            QueryExecution qexec = QueryExecutionFactory.create(query, m);
            try {
                ResultSet rs = qexec.execSelect();
                while (rs.hasNext()) {
                    listener.addTaxon(toTaxon(rs.next()));
                }
            } finally {
                // release query resources even when a listener or the query itself fails
                qexec.close();
            }
        } finally {
            m.close();
        }
    }

    // Builds a taxon (ranks, external id, path, path names, name and rank) from one query solution.
    private static Taxon toTaxon(final QuerySolution solution) {
        Map<String, String> taxonMap =
                TAXON_RANK_VARIABLES
                        .stream()
                        .map(key -> {
                            RDFNode value = solution.get(key);
                            String valueString = value != null && value.isLiteral()
                                    ? value.asLiteral().getLexicalForm()
                                    : "";
                            // strip the leading '?' so that the variable name doubles as rank name
                            return new AbstractMap.SimpleEntry<>(key.substring(1), valueString);
                        })
                        .filter(x -> org.apache.commons.lang3.StringUtils.isNotBlank(x.getValue()))
                        .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue));

        RDFNode rdfNode = solution.get("?tc");
        if (rdfNode != null && rdfNode.isURIResource()) {
            taxonMap.put(PropertyAndValueDictionary.EXTERNAL_ID, rdfNode.asResource().getURI());
        }

        String path = TaxonUtil.generateTaxonPath(taxonMap);
        taxonMap.put(PropertyAndValueDictionary.PATH, path);
        taxonMap.put(PropertyAndValueDictionary.NAME, lastPathElement(path));

        String pathNames = TaxonUtil.generateTaxonPathNames(taxonMap);
        taxonMap.put(PropertyAndValueDictionary.PATH_NAMES, pathNames);
        taxonMap.put(PropertyAndValueDictionary.RANK, lastPathElement(pathNames));

        return TaxonUtil.mapToTaxon(taxonMap);
    }

    // Returns the last element of a separator-delimited path, or "" for an empty or missing path.
    private static String lastPathElement(String path) {
        // split on the whole separator: StringUtils.split would treat every separator character
        // (including the space) as a delimiter and break multi-word names apart
        String[] elements = org.apache.commons.lang3.StringUtils
                .splitByWholeSeparator(path, CharsetConstant.SEPARATOR);
        return elements == null || elements.length == 0
                ? ""
                : org.apache.commons.lang3.StringUtils.trim(elements[elements.length - 1]);
    }

}

0 comments on commit 7f85ef0

Please sign in to comment.