Skip to content

Commit

Permalink
add integration test for supporting Symbiota/NEON style associatedOcc…
Browse files Browse the repository at this point in the history
…urrences; related to #903
  • Loading branch information
Jorrit Poelen committed Jun 14, 2023
1 parent 222963a commit 4b17991
Show file tree
Hide file tree
Showing 18 changed files with 126 additions and 1 deletion.
Expand Up @@ -116,6 +116,9 @@ public class DatasetImporterForDwCA extends DatasetImporterWithListener {
Pattern.compile("^([(][a-zA-Z ]+[)])[ ](.*)(http[s]{0,1}://mczbase.mcz.harvard.edu/guid/)([a-zA-Z0-9:-]+)");
private static final Pattern ARCTOS_ASSOCIATED_OCCURRENCES_VERB_PATTERN = Pattern.compile("^[(][a-zA-Z ]+[)][ ]");

/**
 * Recognizes Symbiota/NEON style associatedOccurrences entries such as
 * {@code hasHost: https://biorepo.neonscience.org/portal/collections/individual/index.php?guid=NEON01ILC}.
 *
 * The pattern is anchored at the start of the input only. The named group {@code verb}
 * captures the letters and spaces that precede the colon; the named group
 * {@code occurrenceId} captures the run of letters, digits, ':' and '-' that
 * immediately follows {@code guid=}.
 *
 * NOTE(review): the '.' in "index.php" is not escaped, so it matches any single
 * character at that position - confirm whether a literal dot was intended.
 */
private static final Pattern NEON_ASSOCIATED_OCCURRENCES_PATTERN =
Pattern.compile("^(?<verb>[a-zA-Z ]+)[:](.*)(http[s]{0,1}://.*)(index.php[?]guid=)(?<occurrenceId>[a-zA-Z0-9:-]+)");

private static final Pattern MCZ_ASSOCIATED_OCCURRENCES_VERB_PATTERN =
Pattern.compile("^(.*)" +
"<a href=\"(.*)/SpecimenDetail.*collection_object_id=[0-9]+\">[ ]+" +
Expand Down Expand Up @@ -573,6 +576,7 @@ static List<Map<String, String>> parseAssociatedOccurrences(String s) {
String relationshipTrimmed = StringUtils.trim(relationship);
attemptToParseArctosAssocatedOccurrences(propertyList, relationshipTrimmed);
attemptToParseMCZAssocatedOccurrences(propertyList, relationshipTrimmed);
attemptToParseNEONAssocatedOccurrences(propertyList, relationshipTrimmed);
}
return propertyList;
}
Expand Down Expand Up @@ -646,6 +650,16 @@ private static void attemptToParseMCZAssocatedOccurrences(List<Map<String, Strin
}
}

/**
 * Tries to read a single, already trimmed associatedOccurrences entry in the
 * Symbiota/NEON notation ({@code verb: ...index.php?guid=id}).
 *
 * When the entry matches {@code NEON_ASSOCIATED_OCCURRENCES_PATTERN}, the captured
 * occurrence id and the trimmed verb are collected in a fresh map and handed to
 * {@code appendAssociatedOccurrencesProperties}; otherwise nothing happens.
 *
 * @param propertyList        list passed on to {@code appendAssociatedOccurrencesProperties}
 * @param relationshipTrimmed the trimmed relationship text to inspect
 */
private static void attemptToParseNEONAssocatedOccurrences(List<Map<String, String>> propertyList, String relationshipTrimmed) {
    final Matcher neonMatcher = NEON_ASSOCIATED_OCCURRENCES_PATTERN.matcher(relationshipTrimmed);
    if (!neonMatcher.find()) {
        // not a NEON style entry; leave the list untouched
        return;
    }

    final String occurrenceId = neonMatcher.group("occurrenceId");
    final String interactionVerb = StringUtils.trim(neonMatcher.group("verb"));

    TreeMap<String, String> parsed = new TreeMap<>();
    parsed.put(TARGET_OCCURRENCE_ID, occurrenceId);
    parsed.put(INTERACTION_TYPE_NAME, interactionVerb);
    appendAssociatedOccurrencesProperties(propertyList, parsed);
}

static Map<String, String> parseDynamicPropertiesForInteractionsOnly(String s) {
Map<String, String> properties = new HashMap<>();
String[] parts = StringUtils.splitByWholeSeparator(s, ";");
Expand Down
Expand Up @@ -474,6 +474,17 @@ public void associatedTaxaCaughtAfterVisiting() {
assertThat(properties.get(0).get(INTERACTION_TYPE_NAME), is("Caught after visiting"));
}

/**
 * Feeds a Symbiota/NEON style "verb: url" value through the associatedTaxa parser
 * and checks that exactly one entry comes back: the full URL is kept as the target
 * taxon name, "hasHost" becomes the interaction type name and no type id is set.
 */
@Test
public void associatedOcurrencesSymbiotaStyle() {
    List<Map<String, String>> parsed = parseAssociatedTaxa(
            "hasHost: https://biorepo.neonscience.org/portal/collections/individual/index.php?guid=NEON01ILC");

    assertThat(parsed.size(), is(1));

    Map<String, String> onlyEntry = parsed.get(0);
    assertThat(onlyEntry.get(TaxonUtil.TARGET_TAXON_NAME), is("https://biorepo.neonscience.org/portal/collections/individual/index.php?guid=NEON01ILC"));
    assertThat(onlyEntry.get(INTERACTION_TYPE_ID), is(nullValue()));
    assertThat(onlyEntry.get(INTERACTION_TYPE_NAME), is("hasHost"));
}

@Test
public void associatedTaxaVisiting() {
String associatedTaxa = "Visiting Pogonia ophioglossoides; no orchid pollen";
Expand Down
Expand Up @@ -9,6 +9,7 @@
import org.eol.globi.process.InteractionListener;
import org.eol.globi.service.TaxonUtil;
import org.eol.globi.tool.NullImportLogger;
import org.eol.globi.util.ResourceServiceLocal;
import org.eol.globi.util.ResourceServiceLocalAndRemote;
import org.gbif.dwc.Archive;
import org.gbif.dwc.record.Record;
Expand Down Expand Up @@ -110,6 +111,43 @@ public void importRecordsFromDir() throws StudyImporterException, URISyntaxExcep
, "http://rs.tdwg.org/dwc/terms/dynamicProperties | http://rs.tdwg.org/dwc/terms/Occurrence | http://rs.tdwg.org/dwc/terms/associatedTaxa");
}

/**
 * Locates the NEON MEEP test archive via its meta.xml on the classpath and passes
 * the archive directory, a zeroed counter and the expected record type string to
 * {@code assertImportsSomethingOfType}.
 */
@Test
public void discoverRecordTypesInMEEP() throws StudyImporterException, URISyntaxException {
    final URL metaXml = getClass().getResource("/org/globalbioticinteractions/dataset/neon/meep/meta.xml");
    final File archiveDir = new File(metaXml.toURI()).getParentFile();
    final String expectedTypes = "http://rs.tdwg.org/dwc/terms/habitat | http://rs.tdwg.org/dwc/terms/Occurrence | http://rs.tdwg.org/dwc/terms/associatedOccurrences";

    assertImportsSomethingOfType(archiveDir.toURI(), new AtomicInteger(0), expectedTypes);
}

/**
 * Imports the NEON MEEP test archive from its classpath directory, collecting every
 * interaction reported to the listener, then checks that four interactions were
 * received and that the fourth one carries the expected citation, source taxon,
 * occurrence ids, interaction type and resource types.
 */
@Test
public void importTaxonDescriptionsFromMEEPDir() throws StudyImporterException, URISyntaxException {
    final URL metaXml = getClass().getResource("/org/globalbioticinteractions/dataset/neon/meep/meta.xml");
    final URI archiveRoot = new File(metaXml.toURI()).getParentFile().toURI();

    // gather everything the importer emits so it can be inspected afterwards
    final List<Map<String, String>> collected = new ArrayList<>();
    final InteractionListener collector = new InteractionListener() {
        @Override
        public void on(Map<String, String> interaction) throws StudyImporterException {
            collected.add(interaction);
        }
    };

    final DatasetImporterForDwCA importer = new DatasetImporterForDwCA(null, null);
    importer.setDataset(new DatasetWithResourceMapping("some/namespace", archiveRoot, new ResourceServiceLocal(inStream -> inStream)));
    importer.setInteractionListener(collector);
    importer.importStudy();

    assertThat(collected.size(), is(4));

    final Map<String, String> fourth = collected.get(3);
    assertThat(fourth.get(DATASET_CITATION), containsString("org/globalbioticinteractions/dataset/neon/"));
    assertThat(fourth.get(REFERENCE_CITATION), is("https://biorepo.neonscience.org/portal/collections/individual/index.php?occid=850796"));
    assertThat(fourth.get(SOURCE_TAXON_NAME), is("Oestridae"));
    assertThat(fourth.get(SOURCE_OCCURRENCE_ID), is("7a5cbc85-a611-4fb1-a0c1-537dd8d215e1"));
    assertThat(fourth.get(INTERACTION_TYPE_NAME), is("hasHost"));
    assertThat(fourth.get(TARGET_TAXON_NAME), is(nullValue()));
    assertThat(fourth.get(TARGET_OCCURRENCE_ID), is("NEON01ILC"));
    assertThat(fourth.get(RESOURCE_TYPES), is("http://rs.tdwg.org/dwc/terms/associatedOccurrences | http://rs.tdwg.org/dwc/terms/Occurrence"));
}


@Test
public void importAssociatedTaxaFromDir() throws StudyImporterException, URISyntaxException {
URL resource = getClass().getResource("/org/globalbioticinteractions/dataset/associated-taxa-test/meta.xml");
Expand Down Expand Up @@ -995,7 +1033,7 @@ public void associatedOccurrencesMalformed2() {
}

@Test
public void associatedOccurrences2() {
public void associatedOccurrencesArctosEating() {
String associateOccurrences = "(ate) DZTM: Denver Zoology Tissue Mammal 2822; (ate) DZTM: Denver Zoology Tissue Mammal 2823";
List<Map<String, String>> propertyList = parseAssociatedOccurrences(associateOccurrences);

Expand All @@ -1015,6 +1053,36 @@ public void associatedOccurrences2() {
assertThat(properties.get(DatasetImporterForTSV.RESOURCE_TYPES), is("http://rs.tdwg.org/dwc/terms/associatedOccurrences"));
}

/**
 * Parses a NEON "hasHost: ...index.php?guid=NEON01ILC" associatedOccurrences value
 * and expects a single entry whose target occurrence id is the guid, whose
 * interaction type name is "hasHost", and which has neither a target taxon name
 * nor an interaction type id.
 */
@Test
public void associatedOccurrencesSymbiotaNEONMeep() {
    List<Map<String, String>> parsed = parseAssociatedOccurrences(
            "hasHost: https://biorepo.neonscience.org/portal/collections/individual/index.php?guid=NEON01ILC");

    assertThat(parsed.size(), is(1));

    Map<String, String> entry = parsed.get(0);
    assertThat(entry.get(TaxonUtil.TARGET_TAXON_NAME), is(nullValue()));
    assertThat(entry.get(DatasetImporterForTSV.TARGET_OCCURRENCE_ID), is("NEON01ILC"));
    assertThat(entry.get(INTERACTION_TYPE_NAME), is("hasHost"));
    assertThat(entry.get(INTERACTION_TYPE_ID), is(nullValue()));
    assertThat(entry.get(DatasetImporterForTSV.RESOURCE_TYPES), is("http://rs.tdwg.org/dwc/terms/associatedOccurrences"));
}

/**
 * Parses a NEON "hostOf: ...index.php?guid=uuid" associatedOccurrences value and
 * expects a single entry whose target occurrence id is the UUID from the URL,
 * whose interaction type name is "hostOf", and which has neither a target taxon
 * name nor an interaction type id.
 */
@Test
public void associatedOccurrencesSymbiotaNEONMamm() {
    List<Map<String, String>> parsed = parseAssociatedOccurrences(
            "hostOf: https://biorepo.neonscience.org/portal/collections/individual/index.php?guid=7a5cbc85-a611-4fb1-a0c1-537dd8d215e1");

    assertThat(parsed.size(), is(1));

    Map<String, String> entry = parsed.get(0);
    assertThat(entry.get(TaxonUtil.TARGET_TAXON_NAME), is(nullValue()));
    assertThat(entry.get(DatasetImporterForTSV.TARGET_OCCURRENCE_ID), is("7a5cbc85-a611-4fb1-a0c1-537dd8d215e1"));
    assertThat(entry.get(INTERACTION_TYPE_NAME), is("hostOf"));
    assertThat(entry.get(INTERACTION_TYPE_ID), is(nullValue()));
    assertThat(entry.get(DatasetImporterForTSV.RESOURCE_TYPES), is("http://rs.tdwg.org/dwc/terms/associatedOccurrences"));
}

@Test
public void hasAssociatedTaxaExtension() throws IOException, URISyntaxException {
URI sampleArchive = getClass().getResource("AEC-DBCNet_DwC-A20160308-sample.zip").toURI();
Expand Down
@@ -0,0 +1,3 @@
ALTERED FOR INTEGRATION TESTING PURPOSES

see https://github.com/globalbioticinteractions/globalbioticinteractions/issues/903
@@ -0,0 +1,2 @@
<?xml version="1.0" encoding="UTF-8"?>
<eml:eml xmlns:eml="eml://ecoinformatics.org/eml-2.1.1" xmlns:dc="http://purl.org/dc/terms/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="eml://ecoinformatics.org/eml-2.1.1 http://rs.gbif.org/schema/eml-gbif-profile/1.0.1/eml.xsd" packageId="a504e1d3-d28b-4fd7-963b-9a61de583e93" system="https://symbiota.org" scope="system" xml:lang="eng"><dataset><alternateIdentifier>https://biorepo.neonscience.org/portal/collections/misc/collprofiles.php?collid=28</alternateIdentifier><title xml:lang="eng">NEON Biorepository Mammal Collection (Vouchers [Standard Sampling])</title><creator id="5b77bf2d-3bb2-4cb3-9f9d-50c20357f4cb"><organizationName>NEON Biorepository Data Portal</organizationName><electronicMailAddress>biorepo@asu.edu</electronicMailAddress><onlineUrl>https://biorepo.neonscience.org/portal/index.php</onlineUrl></creator><metadataProvider><organizationName>NEON Biorepository Data Portal</organizationName><electronicMailAddress>biorepo@asu.edu</electronicMailAddress><onlineUrl>https://biorepo.neonscience.org/portal/index.php</onlineUrl></metadataProvider><pubDate>2023-05-22</pubDate><language>eng</language><abstract><para>This collection contains small mammal vouchers collected during small mammal sampling (NEON sample classes: mam_pertrapnight_in.voucherSampleID). Small mammal sampling is based on the lunar calendar, with timing of sampling constrained to occur within 10 days before or after the new moon. Typically, core sites are sampled 6 times per year, and relocatable sites 4 times per year. Small mammals are sampled using box traps (models LFA, XLK, H.B. Sherman Traps, Inc., Tallahassee, FL, USA). Box traps are arrayed in three to eight (depending on the size of the site) 10 x 10 grids with 10m spacing between traps at all sites. Small mammal trapping bouts are comprised of one or three nights of trapping, depending on whether a grid is designated for pathogen sample collection (3 nights) or not (1 night). 
Only mortalities and individuals that require euthanasia due to injuries are vouchered. The NEON Biorepository receives whole frozen specimens and prepares vouchers as either study skins with skulls (or full skeletons) or in 70-95% ethanol. Standard mammalian measurements are taken during specimen preparation (in mm; total length, tail length, hind foot length, ear length; and in g: mass) and are accessible in downloaded records (note: field measurements are listed in parentheses after preparation measurements, when available). Additional notes about parasites and reproductive condition are also accessible in downloaded records. See related links below for protocols and NEON related data products.</para></abstract><contact><organizationName>NEON Biorepository Mammal Collection (Vouchers [Standard Sampling])</organizationName><electronicMailAddress>biorepo@asu.edu</electronicMailAddress><onlineUrl>https://data.neonscience.org/data-products/DP1.10072.001</onlineUrl><addr><deliveryPoint>734 W. 
Alameda Drive Suite 158</deliveryPoint><city>Tempe</city><administrativeArea>AZ</administrativeArea><postalCode>85282</postalCode></addr></contact><associatedParty><individualName><surName>NEON Biorepository</surName><givenName></givenName></individualName><electronicMailAddress>biorepo@asu.edu</electronicMailAddress><role>contentProvider</role></associatedParty><associatedParty><individualName><surName>Steger</surName><givenName>Laura</givenName></individualName><positionName>Collection Manager</positionName><electronicMailAddress>lsteger@asu.edu</electronicMailAddress><userId>https://orcid.org/0000-0002-0878-4132</userId><role>contentProvider</role></associatedParty><associatedParty><individualName><surName>Rocha Prado</surName><givenName>Laura</givenName></individualName><positionName>Bioinformatician</positionName><electronicMailAddress>lauraprado@asu.edu</electronicMailAddress><userId>https://orcid.org/0000-0003-1237-2824</userId><role>contentProvider</role></associatedParty><associatedParty><individualName><surName>Liao</surName><givenName>Rosie</givenName></individualName><positionName>Senior Sample Preparator</positionName><electronicMailAddress>rliao7@asu.edu</electronicMailAddress><userId>https://orcid.org/0000-0001-7943-049X</userId><role>contentProvider</role></associatedParty><associatedParty><individualName><surName>Yochheim</surName><givenName>Jordyn</givenName></individualName><positionName>Osteology Prep and Dermestarium Manager</positionName><role>contentProvider</role></associatedParty><associatedParty><individualName><surName>De Leon</surName><givenName>Gilma J.</givenName></individualName><positionName>Sample Preparator</positionName><role>contentProvider</role></associatedParty><intellectualRights><para>To the extent possible under law, the publisher has waived all rights to these data and has dedicated them to the <ulink 
url="http://creativecommons.org/publicdomain/zero/1.0/"><citetitle></citetitle></ulink></para></intellectualRights></dataset><additionalMetadata><metadata><symbiota id="5b77bf2d-3bb2-4cb3-9f9d-50c20357f4cb"><dateStamp>2023-05-22T09:44:22-07:00</dateStamp><citation identifier="8cb4465c-7e6f-4988-bd50-a9c23fc196b9">NEON Biorepository Data Portal - 8cb4465c-7e6f-4988-bd50-a9c23fc196b9</citation><physical><characterEncoding>UTF-8</characterEncoding><dataFormat><externallyDefinedFormat><formatName>Darwin Core Archive</formatName></externallyDefinedFormat></dataFormat></physical><collection identifier="362132e9-de5e-4e2c-9b67-75d1d13e6c41" id="28"><alternateIdentifier>https://biorepo.neonscience.org/portal/collections/misc/collprofiles.php?collid=28</alternateIdentifier><parentCollectionIdentifier>NEON</parentCollectionIdentifier><collectionIdentifier>MAMC-VSS</collectionIdentifier><collectionName>NEON Biorepository Mammal Collection (Vouchers [Standard Sampling])</collectionName><resourceLogoUrl>https://biorepo.neonscience.org/portal/content/collicon/asu-neon-iv.png</resourceLogoUrl><onlineUrl>https://data.neonscience.org/data-products/DP1.10072.001</onlineUrl><intellectualRights>http://creativecommons.org/publicdomain/zero/1.0/</intellectualRights><associatedParty><individualName><surName>NEON Biorepository</surName><givenName></givenName></individualName><electronicMailAddress>biorepo@asu.edu</electronicMailAddress></associatedParty><associatedParty><individualName><surName>Steger</surName><givenName>Laura</givenName></individualName><positionName>Collection Manager</positionName><electronicMailAddress>lsteger@asu.edu</electronicMailAddress><userId>https://orcid.org/0000-0002-0878-4132</userId></associatedParty><associatedParty><individualName><surName>Rocha 
Prado</surName><givenName>Laura</givenName></individualName><positionName>Bioinformatician</positionName><electronicMailAddress>lauraprado@asu.edu</electronicMailAddress><userId>https://orcid.org/0000-0003-1237-2824</userId></associatedParty><associatedParty><individualName><surName>Liao</surName><givenName>Rosie</givenName></individualName><positionName>Senior Sample Preparator</positionName><electronicMailAddress>rliao7@asu.edu</electronicMailAddress><userId>https://orcid.org/0000-0001-7943-049X</userId></associatedParty><associatedParty><individualName><surName>Yochheim</surName><givenName>Jordyn</givenName></individualName><positionName>Osteology Prep and Dermestarium Manager</positionName></associatedParty><associatedParty><individualName><surName>De Leon</surName><givenName>Gilma J.</givenName></individualName><positionName>Sample Preparator</positionName></associatedParty><abstract><para>This collection contains small mammal vouchers collected during small mammal sampling (NEON sample classes: mam_pertrapnight_in.voucherSampleID). Small mammal sampling is based on the lunar calendar, with timing of sampling constrained to occur within 10 days before or after the new moon. Typically, core sites are sampled 6 times per year, and relocatable sites 4 times per year. Small mammals are sampled using box traps (models LFA, XLK, H.B. Sherman Traps, Inc., Tallahassee, FL, USA). Box traps are arrayed in three to eight (depending on the size of the site) 10 x 10 grids with 10m spacing between traps at all sites. Small mammal trapping bouts are comprised of one or three nights of trapping, depending on whether a grid is designated for pathogen sample collection (3 nights) or not (1 night). Only mortalities and individuals that require euthanasia due to injuries are vouchered. The NEON Biorepository receives whole frozen specimens and prepares vouchers as either study skins with skulls (or full skeletons) or in 70-95% ethanol. 
Standard mammalian measurements are taken during specimen preparation (in mm; total length, tail length, hind foot length, ear length; and in g: mass) and are accessible in downloaded records (note: field measurements are listed in parentheses after preparation measurements, when available). Additional notes about parasites and reproductive condition are also accessible in downloaded records. See related links below for protocols and NEON related data products.</para></abstract></collection></symbiota></metadata></additionalMetadata></eml:eml>
@@ -0,0 +1 @@
coreid,identifiedBy,dateIdentified,identificationQualifier,scientificName,tidInterpreted,scientificNameAuthorship,genus,specificEpithet,taxonRank,infraspecificEpithet,identificationReferences,identificationRemarks,recordID,modified
@@ -0,0 +1 @@
coreid,materialSampleID,sampleType,catalogNumber,sampleCondition,disposition,preservationType,preparationDetails,preparationDate,preparedBy,individualCount,sampleSize,storageLocation,remarks,recordID,concentration,concentrationMethod,dnaHybridization,dnaMeltingPoint,estimatedSize,poolDnaExtracts,purificationMethod,quality,qualityCheckDate,qualityRemarks,ratioOfAbsorbance260_230,ratioOfAbsorbance260_280,sampleDesignation,sieving,volume,weight,weightMethod

0 comments on commit 4b17991

Please sign in to comment.