Skip to content

Commit

Permalink
Harvester / Simple URL / Improvements
Browse files Browse the repository at this point in the history
* Add XML file support
* Add support for multiple URLs
* Clarify options order (follow up of
  #6221)
* Add all missing translations
* Use XSL conversion combo instead of free text
  • Loading branch information
fxprunayre committed Nov 21, 2022
1 parent 8190014 commit a32e350
Show file tree
Hide file tree
Showing 6 changed files with 235 additions and 137 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,10 @@ public class Aligner extends BaseAligner<SimpleUrlParams> {
private IMetadataIndexer metadataIndexer;

private HarvestResult result;

/**
 * Exposes the harvest result held by this aligner.
 * <p>
 * The {@code result} field itself is returned, not a copy, so a caller that
 * changes its counters changes the aligner's own result.
 *
 * @return the current value of the {@code result} field
 */
public HarvestResult getResult() {
    return this.result;
}
private Map<String, Object> processParams = new HashMap<String, Object>();
private Logger log;

Expand Down Expand Up @@ -146,7 +150,6 @@ private void insertOrUpdate(Map<String, Element> records, Collection<HarvestErro
addPrivileges(id, params.getPrivileges(), localGroups, context);
result.privilegesAppendedOnExistingRecord++;
}

}
result.totalMetadata++;
} catch (Throwable t) {
Expand Down Expand Up @@ -178,7 +181,7 @@ public HarvestResult cleanupRemovedRecords(Set<String> records) throws Exception
String id = localUuids.getID(uuid);
log.debug(" - Removing old metadata with local id:" + id);
metadataManager.deleteMetadata(context, id);
result.locallyRemoved++;
result.locallyRemoved ++;
}
}
dataMan.forceIndexChanges();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -42,8 +42,10 @@
import org.fao.geonet.utils.GeonetHttpRequestFactory;
import org.fao.geonet.utils.Log;
import org.fao.geonet.utils.Xml;
import org.jdom.Attribute;
import org.jdom.Element;
import org.jdom.JDOMException;
import org.jdom.Text;
import org.json.JSONException;
import org.json.JSONObject;
import org.json.XML;
Expand All @@ -64,6 +66,8 @@
import java.util.Map;
import java.util.concurrent.atomic.AtomicBoolean;

import static org.fao.geonet.utils.Xml.isXMLLike;

/**
* Harvest metadata from a JSON or XML source.
* <p>
Expand All @@ -73,7 +77,7 @@
* This harvester has been tested with CKAN search API.
*/
class Harvester implements IHarvester<HarvestResult> {
public static final String LOGGER_NAME = "geonetwork.harvester.json";
public static final String LOGGER_NAME = "geonetwork.harvester.simpleurl";

private final AtomicBoolean cancelMonitor;
private Logger log;
Expand All @@ -97,86 +101,144 @@ public Harvester(AtomicBoolean cancelMonitor, Logger log, ServiceContext context

public HarvestResult harvest(Logger log) throws Exception {
this.log = log;
log.debug("Retrieving simple URL: " + params.getName());
log.debug("Retrieving from harvester: " + params.getName());

requestFactory = context.getBean(GeonetHttpRequestFactory.class);

String jsonResponse = retrieveUrl(params.url, log);
if (cancelMonitor.get()) {
return new HarvestResult();
}
log.debug("Response is: " + jsonResponse);
String[] urlList = params.url.split("\n");
boolean error = false;
Aligner aligner = new Aligner(cancelMonitor, context, params, log);

// TODO: Add support for XML or JSON
int numberOfRecordsToHarvest = -1;
ObjectMapper objectMapper = new ObjectMapper();
JsonNode jsonObj = objectMapper.readTree(jsonResponse);
for (String url : urlList) {
log.debug("Loading URL: " + url);
String content = retrieveUrl(url, log);
if (cancelMonitor.get()) {
return new HarvestResult();
}
log.debug("Response is: " + content);

if (StringUtils.isNotEmpty(params.numberOfRecordPath)) {
try {
numberOfRecordsToHarvest = jsonObj.at(params.numberOfRecordPath).asInt();
log.debug("Number of records to harvest: " + numberOfRecordsToHarvest);
} catch (Exception e) {
int numberOfRecordsToHarvest = -1;

ObjectMapper objectMapper = new ObjectMapper();
JsonNode jsonObj = null;
Element xmlObj = null;
SimpleUrlResourceType type = content.startsWith("<?xml version") || isXMLLike(content)
? SimpleUrlResourceType.XML : SimpleUrlResourceType.JSON;
if (type == SimpleUrlResourceType.XML) {
xmlObj = Xml.loadString(content, false);
} else {
jsonObj = objectMapper.readTree(content);
}
}
boolean error = false;
HarvestResult result = null;
Map<String, Element> allUuids = new HashMap<String, Element>();
try {
Aligner aligner = new Aligner(cancelMonitor, context, params, log);
List<String> listOfUrlForPages = buildListOfUrl(params, numberOfRecordsToHarvest);
for (int i = 0; i < listOfUrlForPages.size(); i ++) {
if (i != 0) {
jsonResponse = retrieveUrl(listOfUrlForPages.get(i), log);
jsonObj = objectMapper.readTree(jsonResponse);

if (StringUtils.isNotEmpty(params.numberOfRecordPath)) {
try {
if (type == SimpleUrlResourceType.XML) {
Object element = Xml.selectSingle(xmlObj, params.numberOfRecordPath, xmlObj.getAdditionalNamespaces());
if (element != null) {
String s = getXmlElementTextValue(element);
numberOfRecordsToHarvest = Integer.parseInt(s);
}
} else {
numberOfRecordsToHarvest = jsonObj.at(params.numberOfRecordPath).asInt();
}
log.debug("Number of records to harvest: " + numberOfRecordsToHarvest);
} catch (Exception e) {
}
}
Map<String, Element> allUuids = new HashMap<String, Element>();
try {
Map<String, Element> uuids = new HashMap<String, Element>();
JsonNode nodes;
if (StringUtils.isNotEmpty(params.loopElement)) {
try {
nodes = jsonObj.at(params.loopElement);
log.debug("Number of records in response: " + nodes.size());

nodes.forEach(record -> {
String uuid = this.extractUuidFromIdentifier(record.get(params.recordIdPath).asText());
String apiUrl = params.url.split("\\?")[0];
URL url = null;
try {
url = new URL(apiUrl);
String nodeUrl = new StringBuilder(url.getProtocol()).append("://").append(url.getAuthority()).toString();
Element xml = convertRecordToXml(record, uuid, apiUrl, nodeUrl);
uuids.put(uuid, xml);
} catch (MalformedURLException e) {
log.warning("Failed to parse Node URL");
List<String> listOfUrlForPages = buildListOfUrl(params, numberOfRecordsToHarvest);
for (int i = 0; i < listOfUrlForPages.size(); i++) {
if (i != 0) {
content = retrieveUrl(listOfUrlForPages.get(i), log);
if (type == SimpleUrlResourceType.XML) {
xmlObj = Xml.loadString(content, false);
} else {
jsonObj = objectMapper.readTree(content);
}
}
JsonNode nodes = null;
List<Element> xmlNodes = null;
if (StringUtils.isNotEmpty(params.loopElement)) {
try {
if (type == SimpleUrlResourceType.XML) {
xmlNodes = Xml.selectNodes(xmlObj, params.loopElement, xmlObj.getAdditionalNamespaces());
log.debug(String.format("%d records found in XML response.", xmlNodes.size()));
} else {
nodes = jsonObj.at(params.loopElement);
log.debug(String.format("%d records found in JSON response.", nodes.size()));
}
});
aligner.align(uuids, errors);
allUuids.putAll(uuids);
} catch (Exception e) {
log.warning("Failed to collect record in response");


if (type == SimpleUrlResourceType.XML && xmlNodes != null) {
xmlNodes.forEach(record -> {
String uuid =
null;
try {
uuid = getXmlElementTextValue(Xml.selectSingle(record, params.recordIdPath, record.getAdditionalNamespaces()));
uuids.put(uuid, record);
} catch (JDOMException e) {
log.error(String.format("Failed to extract UUID for record. Error is %s.",
e.getMessage()));
aligner.getResult().badFormat ++;
aligner.getResult().totalMetadata ++;
}
});
} else if (nodes != null) {
nodes.forEach(record -> {
String uuid = this.extractUuidFromIdentifier(record.get(params.recordIdPath).asText());
String apiUrlPath = params.url.split("\\?")[0];
URL apiUrl = null;
try {
apiUrl = new URL(apiUrlPath);
String nodeUrl = new StringBuilder(apiUrl.getProtocol()).append("://").append(apiUrl.getAuthority()).toString();
Element xml = convertRecordToXml(record, uuid, apiUrlPath, nodeUrl);
uuids.put(uuid, xml);
} catch (MalformedURLException e) {
log.warning(String.format("Failed to parse JSON source URL. Error is: %s", e.getMessage()));
}
});
}
aligner.align(uuids, errors);
allUuids.putAll(uuids);
} catch (Exception e) {
log.warning("Failed to collect record in response");
}
}
}
aligner.cleanupRemovedRecords(allUuids.keySet());
} catch (Exception t) {
error = true;
log.error("Unknown error trying to harvest");
log.error(t.getMessage());
log.error(t);
errors.add(new HarvestError(context, t));
} catch (Throwable t) {
error = true;
log.fatal("Something unknown and terrible happened while harvesting");
log.fatal(t.getMessage());
errors.add(new HarvestError(context, t));
}
result = aligner.cleanupRemovedRecords(allUuids.keySet());
} catch (Exception t) {
error = true;
log.error("Unknown error trying to harvest");
log.error(t.getMessage());
log.error(t);
errors.add(new HarvestError(context, t));
} catch (Throwable t) {
error = true;
log.fatal("Something unknown and terrible happened while harvesting");
log.fatal(t.getMessage());
errors.add(new HarvestError(context, t));
}

log.info("Total records processed in all searches :" + allUuids.size());
if (error) {
log.warning("Due to previous errors the align process has not been called");
log.info("Total records processed in all searches :" + allUuids.size());
if (error) {
log.warning("Due to previous errors the align process has not been called");
}
}
return aligner.getResult();
}

return result;
/**
 * Converts the object returned by an XPath selection into its string value.
 * <ul>
 *   <li>{@link Text}: the normalized text ({@code getTextNormalize()});</li>
 *   <li>{@link Attribute}: the attribute value;</li>
 *   <li>{@link String}: the string itself.</li>
 * </ul>
 *
 * @param element the selected object, may be {@code null}
 * @return the string value, or {@code null} when {@code element} is
 *         {@code null} or of any other type (an XPath that selects an
 *         element node rather than its {@code text()} ends up here)
 */
private String getXmlElementTextValue(Object element) {
    if (element instanceof Text) {
        return ((Text) element).getTextNormalize();
    }
    if (element instanceof Attribute) {
        return ((Attribute) element).getValue();
    }
    // Anything that is not already a String has no supported conversion.
    return element instanceof String ? (String) element : null;
}

private String extractUuidFromIdentifier(final String identifier ) {
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
package org.fao.geonet.kernel.harvest.harvester.simpleUrl;

/**
 * Kind of document returned by a URL processed by the simple URL harvester.
 * <p>
 * The declaration order of the constants is kept as-is so that
 * {@code ordinal()} values do not change.
 */
public enum SimpleUrlResourceType {
    /** JSON document. */
    JSON,

    /** XML document. */
    XML,

    /**
     * RDF/XML document.
     * NOTE(review): presumably reserved for RDF/XML sources; confirm where
     * this constant is used before relying on it.
     */
    RDFXML;
}
Original file line number Diff line number Diff line change
Expand Up @@ -88,8 +88,7 @@
scope: {
harvester: "=gnHarvesterAccount"
},
templateUrl:
"../../catalog/components/admin/harvester/partials/" + "account.html",
templateUrl: "../../catalog/components/admin/harvester/partials/account.html",
link: function (scope, element, attrs) {}
};
}
Expand Down
12 changes: 12 additions & 0 deletions web-ui/src/main/resources/catalog/locales/en-admin.json
Original file line number Diff line number Diff line change
Expand Up @@ -259,6 +259,18 @@
"geonetwork-useChangeDateForUpdateHelp": "Use change date to detect changes on remote server. This will not update minor changes but improves speed on harvester.",
"geonetwork-xslfilter": "XSL filter name to apply",
"geonetwork-xslfilterHelp": "The XSL filter is applied to each metadata record",
"simpleurl-urlHelp": "URL pointing to JSON or XML documents. If harvesting more than one URL, add one line for each.",
"loopElement": "Element to loop on",
"simpleurl-loopElementHelp": "One metadata record is created for each matching element. For a JSON document, point to a property. For an XML document, use an XPath expression, e.g. '.' if the root element of the XML document is itself a metadata document such as 'mdb:MD_Metadata'.",
"simpleurl-pagination": "Pagination parameters (optional)",
"numberOfRecordPath": "Element for the number of records to collect",
"simpleurl-numberOfRecordPathHelp": "JSON property or XPath to the element containing the number of records to collect. This information is used to compute the number of pages in case pagination is needed to collect all records.",
"recordIdPath": "Element for the UUID of each record",
"simpleurl-recordIdPathHelp": "JSON property or XPath to the UUID of the record, e.g. 'mdb:metadataIdentifier/*/mcc:code/*/text()' for an XML document in ISO19115-3.",
"pageFromParam": "From URL parameter",
"simpleurl-pageFromParamHelp": "",
"pageSizeParam": "Size URL parameter",
"simpleurl-pageSizeParamHelp": "",
"groupDeleteError": "Error when deleting group",
"groupDeleteConfirm": "Do you really want to delete the group?",
"userDeleteError": "Error when deleting user",
Expand Down
Loading

0 comments on commit a32e350

Please sign in to comment.