Skip to content

Commit

Permalink
Indexing / More robust
Browse files Browse the repository at this point in the history
* Dublin Core / Multiple titles / Only the first one is indexed (multiple titles will be better handled by the DCAT2 plugin, which provides multilingual support).

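* Invalid role value such as `{{role}}` caused an attempt to create an invalid field name. Characters that are not letters, digits or `-` are now stripped from the role before building the field name.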
```
GNIDX-XSL||Invalid element name. Invalid QName {{{role}}OrgForResource} (42)
```

* Multiple phones - only the first one is indexed
```
Error on line 1169 of index.xsl:
  XTTE0790: A sequence of more than one item is not allowed as the first argument of
  gn-fn-index:json-escape() ("02 38 42 24 55", "06 31 14 98 21")
```

* Date parsing - more flexible, with support for basic ISO dates such as 20170322 and for fractional seconds with more than 3 digits such as 2015-10-18T19:59:30.2675269Z (up to nanosecond precision).
```
Error parsing ISO DateTimes '20170322'. Error is: Text '20170322' could not be parsed: Unable to obtain ZonedDateTime from TemporalAccessor: {},ISO resolved to 2017-03-22 of type java.time.format.Parsed
```

* Email / JSON special characters are now escaped. Only one email per contact is indexed.
```
Parsing invalid JSON node { "organisation":"Direction régionale de l'environnement, de l'aménagement et du logement Centre-Val de Loire", "role":"owner", "email":"###Paramètre "adresse email" dans la fonction "Services INSPIRE"", "website":"", "logo":"", "individual":"", "position":"", "phone":"", "address":"" } for property contactForResource. Error is: Unexpected character ('a' (code 97)): was expecting comma to separate Object entries
 at [Source: (String)"{
```
  • Loading branch information
fxprunayre committed Feb 5, 2021
1 parent 4076385 commit 3df6712
Show file tree
Hide file tree
Showing 5 changed files with 172 additions and 71 deletions.
62 changes: 50 additions & 12 deletions common/src/main/java/org/fao/geonet/utils/DateUtil.java
Original file line number Diff line number Diff line change
Expand Up @@ -54,23 +54,40 @@ public class DateUtil {
public static final DateTimeFormatter ISO_OFFSET_DATE_TIME_NANOSECONDS;
public static final DateTimeFormatter CATCH_ALL_DATE_TIME_FORMATTER;
private static final String DEFAULT_DATE_TIME = "3000-01-01T00:00:00.000Z"; // JUNK

// Pattern to check dates
private static final Pattern gsYear = Pattern.compile("([0-9]{4})(-([0-2][0-9]):([0-5][0-9])([A-Z]))?");
private static final Pattern gsYearMonth = Pattern.compile("([0-9]{4})-([0-1][0-9])(-([0-2][0-9]):([0-5][0-9])([A-Z]{0,1}))?");

// Some catalogs are using 2012-09-12Z
// The day group is [0-3][0-9] so that the 30th and 31st also match; the
// pattern only checks the shape of the string, not calendar validity.
private static final Pattern gsYearMonthDayZ = Pattern.compile("([0-9]{4})-([0-1][0-9])-([0-3][0-9])Z");
// Day/month/year separated by slashes, e.g. 12/09/2012 (group 1 = day, group 2 = month, group 3 = year)
private static final Pattern gsDayMonthYear = Pattern.compile("([0-3][0-9])/([0-1][0-9])/([0-9]{4})");

// Fri Jan 01 2010 00:00:00 GMT+0100 (CET)
private static final Pattern htmlFormat = Pattern
.compile("([a-zA-Z]{3}) ([a-zA-Z]{3}) ([0-9]{2}) ([0-9]{4}) ([0-2][0-9]):([0-5][0-9]):([0-5][0-9]) (.+)");

static {
ISO_OFFSET_DATE_TIME_NANOSECONDS = new DateTimeFormatterBuilder().parseCaseInsensitive().append(DateTimeFormatter.ISO_LOCAL_DATE)
.appendLiteral('T').appendValue(ChronoField.HOUR_OF_DAY, 2).appendLiteral(':').appendValue(MINUTE_OF_HOUR, 2)
.appendLiteral(':').appendValue(SECOND_OF_MINUTE, 2).appendFraction(NANO_OF_SECOND, 3, 9, true).appendOffsetId()
.toFormatter();
ISO_OFFSET_DATE_TIME_NANOSECONDS = new DateTimeFormatterBuilder().parseCaseInsensitive()
.append(DateTimeFormatter.ISO_LOCAL_DATE)
.appendLiteral('T')
.appendValue(ChronoField.HOUR_OF_DAY, 2)
.appendLiteral(':')
.appendValue(MINUTE_OF_HOUR, 2)
.appendLiteral(':')
.appendValue(SECOND_OF_MINUTE, 2)
.appendFraction(NANO_OF_SECOND, 3, 9, true)
.appendOffsetId()
.toFormatter();

CATCH_ALL_DATE_TIME_FORMATTER = new DateTimeFormatterBuilder().parseCaseInsensitive()
.appendPattern("yyyy[-M][-d['T'H[:m[:s[.SSS][.SS][.S]][XXX]]]]").parseDefaulting(ChronoField.MONTH_OF_YEAR, 1)
.parseDefaulting(ChronoField.DAY_OF_MONTH, 1).parseDefaulting(HOUR_OF_DAY, 0).parseDefaulting(MINUTE_OF_HOUR, 0)
.parseDefaulting(SECOND_OF_MINUTE, 0).parseDefaulting(NANO_OF_SECOND, 0).toFormatter();
.appendPattern("yyyy[[-]M][[-]d['T'H[:m[:s[.SSSSSSSSS][.SSSSSSSS][.SSSSSSS][.SSSSSS][.SSSSS][.SSSS][.SSS][.SS][.S]][XXX]]]]")
.parseDefaulting(ChronoField.MONTH_OF_YEAR, 1)
.parseDefaulting(ChronoField.DAY_OF_MONTH, 1)
.parseDefaulting(HOUR_OF_DAY, 0)
.parseDefaulting(MINUTE_OF_HOUR, 0)
.parseDefaulting(SECOND_OF_MINUTE, 0)
.parseDefaulting(NANO_OF_SECOND, 0).toFormatter();
}

/**
Expand Down Expand Up @@ -131,9 +148,10 @@ public static String parseISODateTimes(String dateTimeString, String durationOrD

odt1 = idt.withZoneSameInstant(ZoneOffset.UTC);
odt = idt.withZoneSameInstant(ZoneOffset.UTC).format(ISO_OFFSET_DATE_TIME_NANOSECONDS);

} catch (Exception e) {
Log.error("geonetwork.domain", "Error parsing ISO DateTimes, error: " + e.getMessage(), e);
Log.error("geonetwork.domain",
String.format("Error parsing ISO DateTimes '%s'. Error is: %s",
dateTimeString, e.getMessage()), e);
return DEFAULT_DATE_TIME;
}

Expand Down Expand Up @@ -190,11 +208,13 @@ public static String parseISODateTimes(String dateTimeString, String durationOrD
* Also accepts strings in non-ISO formats (e.g. {@code yyyy[-MM][-dd][-hh:mm][Z]})
* @return a {@link ZonedDateTime}.
*/
public static ZonedDateTime parseBasicOrFullDateTime(String stringToParse) {
public static ZonedDateTime parseBasicOrFullDateTime(String stringToParse) {
ZonedDateTime result;
Matcher matcher;
if (stringToParse.length() == 8 && !stringToParse.startsWith("T")) {
result = ZonedDateTime.parse(stringToParse, DateTimeFormatter.BASIC_ISO_DATE);
result = LocalDate
.parse(stringToParse, DateTimeFormatter.BASIC_ISO_DATE)
.atStartOfDay(ZoneId.systemDefault());
} else if (stringToParse.startsWith("T") && stringToParse.contains(":")) {
result = parseTime(stringToParse);
} else if (stringToParse.contains("T") && !stringToParse.contains(":") && !stringToParse.contains("-")) {
Expand Down Expand Up @@ -232,6 +252,24 @@ public static ZonedDateTime parseBasicOrFullDateTime(String stringToParse) {
}
}

result = generateDate(year, month, day, seconds, minute, hour, timezone);
} else if ((matcher = gsYearMonthDayZ.matcher(stringToParse)).matches()) {
int year = Integer.parseInt(matcher.group(1));
int month = Integer.parseInt(matcher.group(2));
int day = Integer.parseInt(matcher.group(3));
int minute = 0;
int hour = 0;
int seconds = 0;
String timezone = ZoneId.systemDefault().getId();
result = generateDate(year, month, day, seconds, minute, hour, timezone);
} else if ((matcher = gsDayMonthYear.matcher(stringToParse)).matches()) {
int year = Integer.parseInt(matcher.group(3));
int month = Integer.parseInt(matcher.group(2));
int day = Integer.parseInt(matcher.group(1));
int minute = 0;
int hour = 0;
int seconds = 0;
String timezone = ZoneId.systemDefault().getId();
result = generateDate(year, month, day, seconds, minute, hour, timezone);
} else if ((matcher = htmlFormat.matcher(stringToParse)).matches()) {
// Fri Jan 01 2010 00:00:00 GMT+0100 (CET)
Expand Down Expand Up @@ -341,4 +379,4 @@ public static ZonedDateTime generateDate(int year, int month, int day, int secon
return ZonedDateTime.of(year, month, day, hour, minute, second, 0, zone.toZoneId());

}
}
}
14 changes: 13 additions & 1 deletion common/src/test/java/org/fao/geonet/utils/DateUtilTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -53,8 +53,20 @@ public void convertToISOZuluDateTime() {
zdt = zdt.withZoneSameInstant(ZoneOffset.UTC);
assertEquals("Testing 2020-11", DateUtil.ISO_OFFSET_DATE_TIME_NANOSECONDS.format(zdt), datetimeInIsoFormat);

datetimeInIsoFormat = DateUtil.convertToISOZuluDateTime("2012-09-12Z");
ld = LocalDate.parse("2012-09-12");
zdt = ld.atStartOfDay(ZoneId.systemDefault());
zdt = zdt.withZoneSameInstant(ZoneOffset.UTC);
assertEquals(DateUtil.ISO_OFFSET_DATE_TIME_NANOSECONDS.format(zdt), datetimeInIsoFormat);

datetimeInIsoFormat = DateUtil.convertToISOZuluDateTime("2015-10-18T19:59:30.2675269Z");
assertEquals("2015-10-18T19:59:30.2675269Z", datetimeInIsoFormat);

datetimeInIsoFormat = DateUtil.convertToISOZuluDateTime("20170322");
ld = LocalDate.parse("2017-03-22");
zdt = ld.atStartOfDay(ZoneId.systemDefault());
zdt = zdt.withZoneSameInstant(ZoneOffset.UTC);
assertEquals(DateUtil.ISO_OFFSET_DATE_TIME_NANOSECONDS.format(zdt), datetimeInIsoFormat);
}

@Test
Expand Down Expand Up @@ -132,4 +144,4 @@ public void testGenerateDate() {
private String getRandom(int max, int min) {
return Integer.toString(min + (int) (Math.random() * ((max - min) + 1)));
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -93,8 +93,12 @@
</harvestedDate>


<!-- For multilingual docs it is good to have a title in the default locale. In this type of metadata we don't have one but in the general case we do so we need to add it to all -->
<resourceTitle><xsl:value-of select="string(dc:title)"/></resourceTitle>
<!-- For multilingual docs it is good to have a title in the default locale.
In this type of metadata we don't have one but in the general
case we do so we need to add it to all -->
<xsl:for-each select="dc:title[1]">
<resourceTitle><xsl:value-of select="string(.)"/></resourceTitle>
</xsl:for-each>

<xsl:for-each select="dc:language">
<mainLanguage><xsl:value-of select="string(.)"/></mainLanguage>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -286,15 +286,26 @@
as="xs:string?"/>
<xsl:variable name="date"
select="string(cit:date/gco:Date|cit:date/gco:DateTime)"/>
<xsl:element name="{$dateType}DateForResource">

<xsl:variable name="zuluDateTime" as="xs:string?">
<xsl:value-of select="date-util:convertToISOZuluDateTime(normalize-space($date))"/>
</xsl:element>
<xsl:element name="{$dateType}YearForResource">
<xsl:value-of select="substring($date, 0, 5)"/>
</xsl:element>
<xsl:element name="{$dateType}MonthForResource">
<xsl:value-of select="substring($date, 0, 8)"/>
</xsl:element>
</xsl:variable>
<xsl:choose>
<xsl:when test="$zuluDateTime != ''">
<xsl:element name="{$dateType}DateForResource">
<xsl:value-of select="$zuluDateTime"/>
</xsl:element>
<xsl:element name="{$dateType}YearForResource">
<xsl:value-of select="substring($zuluDateTime, 0, 5)"/>
</xsl:element>
<xsl:element name="{$dateType}MonthForResource">
<xsl:value-of select="substring($zuluDateTime, 0, 8)"/>
</xsl:element>
</xsl:when>
<xsl:otherwise>
<indexingErrorMsg>Warning / Date <xsl:value-of select="$dateType"/> with value '<xsl:value-of select="$date"/>' was not a valid date format.</indexingErrorMsg>
</xsl:otherwise>
</xsl:choose>
</xsl:for-each>

<xsl:for-each select="cit:date/cit:CI_Date[gn-fn-index:is-isoDate(cit:date/*/text())]">
Expand All @@ -303,20 +314,29 @@
as="xs:string?"/>
<xsl:variable name="date"
select="string(cit:date/gco:Date|cit:date/gco:DateTime)"/>
<resourceDate type="object">
{"type": "<xsl:value-of select="$dateType"/>", "date": "<xsl:value-of select="date-util:convertToISOZuluDateTime(normalize-space($date))"/>"}
</resourceDate>

<xsl:variable name="zuluDate"
select="date-util:convertToISOZuluDateTime($date)"/>
<xsl:if test="$zuluDate != ''">
<resourceDate type="object">
{"type": "<xsl:value-of select="$dateType"/>", "date": "<xsl:value-of select="$zuluDate"/>"}
</resourceDate>
</xsl:if>
</xsl:for-each>


<xsl:if test="$useDateAsTemporalExtent">
<xsl:for-each-group select="cit:date/cit:CI_Date[gn-fn-index:is-isoDate(cit:date/*/text())]/cit:date/*/text()"
group-by=".">

<resourceTemporalDateRange type="object">{
"gte": "<xsl:value-of select="date-util:convertToISOZuluDateTime(.)"/>",
"lte": "<xsl:value-of select="date-util:convertToISOZuluDateTime(.)"/>"
}</resourceTemporalDateRange>
<xsl:variable name="zuluDate"
select="date-util:convertToISOZuluDateTime(.)"/>
<xsl:if test="$zuluDate != ''">
<resourceTemporalDateRange type="object">{
"gte": "<xsl:value-of select="$zuluDate"/>",
"lte": "<xsl:value-of select="$zuluDate"/>"
}</resourceTemporalDateRange>
</xsl:if>
</xsl:for-each-group>
</xsl:if>

Expand Down Expand Up @@ -798,25 +818,33 @@
select="gml:beginPosition|gml:begin/gml:TimeInstant/gml:timePosition"/>
<xsl:variable name="end"
select="gml:endPosition|gml:end/gml:TimeInstant/gml:timePosition"/>
<xsl:if test="gn-fn-index:is-isoDate($start)">


<xsl:variable name="zuluStartDate"
select="date-util:convertToISOZuluDateTime($start)"/>
<xsl:variable name="zuluEndDate"
select="date-util:convertToISOZuluDateTime($end)"/>

<xsl:if test="$zuluStartDate != '' and $zuluEndDate != ''">
<resourceTemporalDateRange type="object">{
"gte": "<xsl:value-of select="date-util:convertToISOZuluDateTime(normalize-space($start))"/>"
"gte": "<xsl:value-of select="$zuluStartDate"/>"
<xsl:if test="$start &lt; $end and not($end/@indeterminatePosition = 'now')">
,"lte": "<xsl:value-of select="date-util:convertToISOZuluDateTime(normalize-space($end))"/>"
,"lte": "<xsl:value-of select="$zuluEndDate"/>"
</xsl:if>
}</resourceTemporalDateRange>
<resourceTemporalExtentDateRange type="object">{
"gte": "<xsl:value-of select="date-util:convertToISOZuluDateTime(normalize-space($start))"/>"
"gte": "<xsl:value-of select="$zuluStartDate"/>"
<xsl:if test="$start &lt; $end and not($end/@indeterminatePosition = 'now')">
,"lte": "<xsl:value-of select="date-util:convertToISOZuluDateTime(normalize-space($end))"/>"
,"lte": "<xsl:value-of select="$zuluEndDate"/>"
</xsl:if>
}</resourceTemporalExtentDateRange>
<xsl:if test="$start &gt; $end">
<indexingErrorMsg>Warning / Field resourceTemporalDateRange /
Lower range bound '<xsl:value-of select="."/>' can not be
greater than upper bound '<xsl:value-of select="$end"/>'.
Date range not indexed.</indexingErrorMsg>
</xsl:if>
</xsl:if>

<xsl:if test="$start &gt; $end">
<indexingErrorMsg>Warning / Field resourceTemporalDateRange /
Lower range bound '<xsl:value-of select="."/>' can not be
greater than upper bound '<xsl:value-of select="$end"/>'.
Date range not indexed.</indexingErrorMsg>
</xsl:if>
</xsl:for-each>

Expand Down Expand Up @@ -1189,7 +1217,7 @@
<xsl:if test="count(preceding-sibling::*[name() = $elementName
and .//cit:CI_Organisation/cit:name/gco:CharacterString = $organisationName
and .//cit:role/*/@codeListValue = $role]) = 0">
<xsl:element name="{$role}Org{$fieldSuffix}">
<xsl:element name="{replace($role, '[^a-zA-Z0-9-]', '')}Org{$fieldSuffix}">
<xsl:value-of select="$organisationName"/>
</xsl:element>
</xsl:if>
Expand All @@ -1200,7 +1228,7 @@
"organisation":"<xsl:value-of
select="gn-fn-index:json-escape($organisationName)"/>",
"role":"<xsl:value-of select="$role"/>",
"email":"<xsl:value-of select="$email"/>",
"email":"<xsl:value-of select="gn-fn-index:json-escape($email)"/>",
"website":"<xsl:value-of select="$website"/>",
"logo":"<xsl:value-of select="$logo"/>",
"individual":"<xsl:value-of select="gn-fn-index:json-escape($individualName)"/>",
Expand Down

0 comments on commit 3df6712

Please sign in to comment.