/
config-lucene.xml
373 lines (327 loc) · 17.2 KB
/
config-lucene.xml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
<?xml version="1.0"?>
<!--
~ Copyright (C) 2001-2016 Food and Agriculture Organization of the
~ United Nations (FAO-UN), United Nations World Food Programme (WFP)
~ and United Nations Environment Programme (UNEP)
~
~ This program is free software; you can redistribute it and/or modify
~ it under the terms of the GNU General Public License as published by
~ the Free Software Foundation; either version 2 of the License, or (at
~ your option) any later version.
~
~ This program is distributed in the hope that it will be useful, but
~ WITHOUT ANY WARRANTY; without even the implied warranty of
~ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
~ General Public License for more details.
~
~ You should have received a copy of the GNU General Public License
~ along with this program; if not, write to the Free Software
~ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA
~
~ Contact: Jeroen Ticheler - FAO - Viale delle Terme di Caracalla 2,
~ Rome - Italy. email: geonetwork@osgeo.org
-->
<config>
<index>
<!-- Index settings: document buffering, segment merging, commit checks and reader reopening. -->
<!--
The amount of memory (in MB) to be used for buffering documents in memory.
48MB seems to be plenty for running at least two long
indexing jobs (e.g. importing 20,000 records) while keeping disk
activity for Lucene index writing to a minimum.
NOTE(review): the trailing "d" is part of the configured value; presumably it is
parsed as a Java double literal. Confirm against the consumer before changing it.
-->
<RAMBufferSizeMB>48.0d</RAMBufferSizeMB>
<!-- Determines how often segment indices are merged by addDocument(). -->
<MergeFactor>10</MergeFactor>
<!-- Default Lucene version to use (mainly for Analyzer creation). -->
<luceneVersion>4_9</luceneVersion>
<!-- How often to check whether a commit is required.
NOTE(review): the unit of this interval is not stated in this file. Confirm against the consumer. -->
<commitInterval>1000</commitInterval>
<!-- If true, NRTManagerReopenThread controls how often the readers are reopened.
If false, a reader is opened every time one is obtained. -->
<useNRTManagerReopenThread>true</useNRTManagerReopenThread>
<!--
Only applies if useNRTManagerReopenThread is true.
Maximum time until a new reader must be opened; this sets the upper bound
on how slowly reopens may occur (the element name suggests a value in seconds).
See NRTManagerReopenThread. -->
<nrtManagerReopenThreadMaxStaleSec>5</nrtManagerReopenThreadMaxStaleSec>
<!--
Only applies if useNRTManagerReopenThread is true.
Minimum time until a new reader can be opened; this sets the lower bound
on how quickly reopens may occur when a caller
is waiting for a specific indexing change to become visible
(the element name suggests a value in seconds).
See NRTManagerReopenThread. -->
<nrtManagerReopenThreadMinStaleSec>0.1</nrtManagerReopenThreadMinStaleSec>
</index>
<!-- Search parameters are applied at search time and do not need
an index rebuild in order to be taken into account. -->
<search>
<!-- Score parameters. Setting these parameters to true affects performance. -->
<!-- Set trackDocScores to true if the score needs to be displayed in results using
the geonet:info/score element. -->
<trackDocScores>false</trackDocScores>
<trackMaxScore>false</trackMaxScore>
<!-- Not used because no Scorer is defined. -->
<docsScoredInOrder>false</docsScoredInOrder>
<!--
By default Lucene computes the score according to the search criteria,
the corresponding result set and the index content.
For a search with no criteria, Lucene returns the top docs
in index order (because none is more relevant than the others).
In order to change the score computation, a boost function can
be defined. The boosting query class needs to be available in the classpath.
* RecencyBoostingQuery will promote recently modified documents
-->
<!--
<boostQuery name="org.fao.geonet.kernel.search.function.RecencyBoostingQuery">
<Param name="multiplier" type="double" value="2.0"/>
<Param name="maxDaysAgo" type="int" value="365"/>
<Param name="dayField" type="java.lang.String" value="_changeDate"/>
</boostQuery>
-->
<!--
List of fields to dump when using the q service. Fields must be stored in the index.
Each entry maps an index field (name) to the element name used in the output (tagName).
The attribute 'multilingualSortField' flags fields that are used for sorting and may also
have translations.
This is the solution for http://trac.osgeo.org/geonetwork/ticket/1112
The issue: searching by one field and sorting by another can result
in an incorrect sort order, because the sorted field may have multiple translations.
If the document found is in one language (say French) and the UI language is English,
then the French translation would be used for sorting even though the metadata
is displayed to the user in English.
The fields flagged with multilingualSortField="true" are the sort fields that can have
translations and therefore need special care when indexing and sorting.
-->
<dumpFields>
<field name="_isTemplate" tagName="isTemplate"/>
<field name="_isHarvested" tagName="isHarvested"/>
<field name="_popularity" tagName="popularity"/>
<field name="_rating" tagName="rating"/>
<field name="feedbackCount" tagName="feedbackCount"/>
<field name="_displayOrder" tagName="displayOrder"/>
<field name="_view" tagName="view"/>
<field name="_notify" tagName="notify"/>
<field name="_download" tagName="download"/>
<field name="_dynamic" tagName="dynamic"/>
<field name="_featured" tagName="featured"/>
<field name="_owner" tagName="owner"/>
<field name="_userinfo" tagName="userinfo"/>
<field name="_ownername" tagName="ownername"/>
<field name="_cat" tagName="category"/>
<field name="topicCat" tagName="topicCat"/>
<field name="inspirethemewithac" tagName="inspirethemewithac"/>
<field name="_valid" tagName="valid"/>
<field name="_valid_schematron-rules-geonetwork" tagName="valid_schematron-rules-geonetwork"/>
<field name="_valid_schematron-rules-iso" tagName="valid_schematron-rules-iso"/>
<field name="_valid_schematron-rules-inspire" tagName="valid_schematron-rules-inspire"/>
<field name="_valid_xsd" tagName="valid_xsd"/>
<field name="_selected" tagName="selected"/>
<field name="_source" tagName="source"/>
<field name="_status" tagName="mdStatus"/>
<field name="_edit" tagName="edit"/>
<field name="_root" tagName="root"/>
<field name="_title" tagName="title" multilingualSortField="true"/>
<field name="_docLocale" tagName="docLocale"/>
<field name="_locale" tagName="_locale"/>
<field name="_defaultTitle" tagName="defaultTitle"/>
<field name="createDate" tagName="creationDate"/>
<field name="revisionDate" tagName="revisionDate"/>
<field name="publicationDate" tagName="publicationDate"/>
<field name="abstract" tagName="abstract" multilingualSortField="true"/>
<field name="_defaultAbstract" tagName="defaultAbstract"/>
<field name="keyword" tagName="keyword"/>
<field name="keywordGroup" tagName="keywordGroup"/>
<field name="parentUuid" tagName="parentId"/>
<field name="type" tagName="type"/>
<field name="identifier" tagName="identifier"/>
<field name="image" tagName="image"/>
<field name="link" tagName="link"/>
<field name="format" tagName="format"/>
<field name="denominator" tagName="denominator"/>
<field name="resolution" tagName="resolution"/>
<field name="crs" tagName="crs"/>
<field name="crsDetails" tagName="crsDetails"/>
<field name="credit" tagName="credit"/>
<field name="cl_status" tagName="status"/>
<field name="cl_status_text" tagName="status_text"/>
<field name="updateFrequency" tagName="updateFrequency"/>
<field name="lineage" tagName="lineage"/>
<field name="responsibleParty" tagName="responsibleParty"/>
<!-- NOTE(review): the next two index fields are both dumped as "legalConstraints".
Confirm that merging them under one tag name is intended. -->
<field name="MD_LegalConstraintsOtherConstraints" tagName="legalConstraints"/>
<field name="MD_LegalConstraintsUseLimitation" tagName="legalConstraints"/>
<field name="MD_SecurityConstraintsUseLimitation" tagName="securityConstraints"/>
<field name="MD_ConstraintsUseLimitation" tagName="resourceConstraints"/>
<field name="cl_classification_text" tagName="classification_text"/>
<field name="cl_maintenanceAndUpdateFrequency_text"
tagName="maintenanceAndUpdateFrequency_text"/>
<field name="cl_spatialRepresentationType_text" tagName="spatialRepresentationType_text"/>
<field name="datasetLang" tagName="datasetLang"/>
<field name="mdLanguage" tagName="mdLanguage"/>
<field name="serviceType" tagName="serviceType"/>
<field name="tempExtentBegin" tagName="tempExtentBegin"/>
<field name="tempExtentEnd" tagName="tempExtentEnd"/>
<field name="geoBox" tagName="geoBox"/>
<field name="extentDesc" tagName="geoDesc"/>
<field name="geoDescCode" tagName="geoDescCode"/>
<field name="_indexingError" tagName="idxError"/>
<field name="_indexingErrorMsg" tagName="idxMsg"/>
<field name="_op0" tagName="publishedForGroup"/>
<field name="_groupOwner" tagName="groupOwner"/>
<field name="_logo" tagName="logo"/>
<field name="standardName" tagName="standardName"/>
<field name="_groupWebsite" tagName="groupWebsite"/>
<field name="attributeTable" tagName="attributeTable"/>
</dumpFields>
</search>
<!-- Default analyzer to use for all fields not defined in the fieldSpecificAnalyzer section.
If not set, GeoNetwork uses a default per-field analyzer (i.e. fieldSpecificAnalyzer is not
taken into account). The default analyzer is defined in SearchManager.
Example:
org.apache.lucene.analysis.fr.FrenchAnalyzer
<defaultAnalyzer name="org.apache.lucene.analysis.standard.StandardAnalyzer">
<Param name="version" type="org.apache.lucene.util.Version"/>
</defaultAnalyzer>
-->
<defaultAnalyzer name="org.fao.geonet.kernel.search.GeoNetworkAnalyzer"/>
<!-- TODO: Add a language specific analyzer according to:
* metadata language
* metadata elements language (ie. PT_FreeText elements)
-->
<!-- Field analyzer
Define here a specific analyzer for each field stored in the index.
For example, using a different analyzer for "any" (i.e. full text search)
could be better than a standard analyzer, which has a particular way of
creating tokens.
For instance, the field value "mission AD-T" is tokenized to "mission" "ad" & "t"
by StandardAnalyzer, whereas a WhitespaceTokenizer produces "mission" "AD-T",
which could be better in some situations. However, the field value "mission AD-34T"
is tokenized to "mission" "ad-34t" by StandardAnalyzer because of the number.
doeleman: UUID must be case insensitive, as its parts are hexadecimal numbers which
are not case sensitive. StandardAnalyzer is recommended for UUIDs.
NOTE(review): _uuid below is configured with GeoNetworkAnalyzer, not StandardAnalyzer.
Confirm which one is intended.
A list of analyzers is available at http://lucene.apache.org/java/2_4_0/api/org/apache/lucene/analysis/Analyzer.html
Common analyzers:
* org.apache.lucene.analysis.standard.StandardAnalyzer
* org.apache.lucene.analysis.WhitespaceAnalyzer
* org.apache.lucene.analysis.SimpleAnalyzer
* org.fao.geonet.kernel.search.GeoNetworkAnalyzer (recommended for wildcard query support)
NOTE(review): the link and the Whitespace/Simple class names above date from Lucene 2.4.
This file sets luceneVersion to 4_9 and uses org.apache.lucene.analysis.core.KeywordAnalyzer
below, so those two classes are presumably under org.apache.lucene.analysis.core as well.
Confirm against the javadoc of the Lucene version in use.
The analyzer must be in the classpath.
-->
<fieldSpecificAnalyzer>
<Field name="_uuid" analyzer="org.fao.geonet.kernel.search.GeoNetworkAnalyzer"/>
<Field name="standardName" analyzer="org.apache.lucene.analysis.core.KeywordAnalyzer"/>
<Field name="parentUuid" analyzer="org.fao.geonet.kernel.search.GeoNetworkAnalyzer"/>
<Field name="operatesOn" analyzer="org.fao.geonet.kernel.search.GeoNetworkAnalyzer"/>
<Field name="operatesOnIdentifier" analyzer="org.fao.geonet.kernel.search.GeoNetworkAnalyzer"/>
<Field name="hassource" analyzer="org.fao.geonet.kernel.search.GeoNetworkAnalyzer"/>
<Field name="hasfeaturecat" analyzer="org.fao.geonet.kernel.search.GeoNetworkAnalyzer"/>
<!--
<Field name="any" analyzer="org.fao.geonet.kernel.search.GeoNetworkAnalyzer"/>
<Field name="any" analyzer="org.apache.lucene.analysis.standard.StandardAnalyzer">
<Param name="version" type="org.apache.lucene.util.Version"/>
<Param name="stopWords" type="java.io.File" value="/path/to/resources/stopwords/en.txt"/>
</Field>
-->
<Field name="subject" analyzer="org.apache.lucene.analysis.core.KeywordAnalyzer"/>
<Field name="_cat" analyzer="org.apache.lucene.analysis.core.KeywordAnalyzer"/>
<Field name="keyword" analyzer="org.apache.lucene.analysis.core.KeywordAnalyzer"/>
<Field name="inspirerelated" analyzer="org.apache.lucene.analysis.core.KeywordAnalyzer"/>
<Field name="_groupPublished" analyzer="org.apache.lucene.analysis.core.KeywordAnalyzer"/>
<Field name="_source" analyzer="org.apache.lucene.analysis.core.KeywordAnalyzer"/>
</fieldSpecificAnalyzer>
<fieldSpecificSearchAnalyzer>
<!-- Define here the analyzers that override those from fieldSpecificAnalyzer at search time.
By default, the per-field analyzers used for indexing and searching are the same.
In some cases, one analyzer could be applied at indexing time and another
at searching time, for example when using a SynonymAnalyzer. If
Londres, Londinium and Londona are added as synonyms for London at indexing time, there is no
need to add those synonyms again at search time: a user searching for Londres will get a match.
Synonym expansion is only needed during indexing or during searching, not both.
-->
</fieldSpecificSearchAnalyzer>
<!-- Document boosting configuration.
Document boosting allows defining a custom boost factor for a document
or for document fields at index time, which
will be used for score computation at search time.
Note: Do not overuse document boosting,
because it may promote some kinds of documents too much
(or too little) and make end users less confident in your search engine!
-->
<!-- Boosting factor for fields. This parameter affects queries using the OR operator.
Example:
<fieldBoosting>
<Field name="title" boost="1.5F"/>
</fieldBoosting>
It can also be relevant to remove some fields from scoring, as done below with
a boost of 0.0F. If this is not done, user privileges will be taken into account
when computing the score.
-->
<fieldBoosting>
<Field name="_op0" boost="0.0F"/>
<Field name="_op1" boost="0.0F"/>
<Field name="_op2" boost="0.0F"/>
<Field name="_dummy" boost="0.0F"/>
<Field name="_isTemplate" boost="0.0F"/>
<Field name="_owner" boost="0.0F"/>
</fieldBoosting>
<!-- Boosting factor for document based on document values.
Parameters:
* fields: A comma-separated list of fields.
* values: A comma-separated list of values, one for each field. Use NOTNULL to match any value of the field.
* boosts: A comma-separated list of boost factors, one for each field (Float).
"The boost factor values you should use depend on what you’re trying to achieve;
you’ll need to do some experimentation and
tuning to achieve the desired effect." source: Lucene In Action
<boostDocument name="org.fao.geonet.kernel.search.function.ImportantDocument">
<!- - Example to promote series and not promote records part of a series - ->
<Param name="fields" type="java.lang.String" value="type,parentUuid"/>
<Param name="values" type="java.lang.String" value="series,NOTNULL"/>
<Param name="boosts" type="java.lang.String" value=".1F,-.3F"/>
<!- - Example to promote series and service and records with keyword containing elevation
<Param name="fields" type="java.lang.String" value="type,type,keyword"/>
<Param name="values" type="java.lang.String" value="series,service,Elevation"/>
<Param name="boosts" type="java.lang.String" value=".1F,-.3F,.4F"/>- ->
</boostDocument>
-->
<!-- All Lucene fields that are tokenized must be listed here.
Unfortunately, the Lucene API cannot tell which fields are tokenized and
which are not unless documents are read, and there may be no index
to read them from. Since most fields are not tokenized, the list of
tokenized fields is kept here.
-->
<tokenized>
<Field name="any"/>
<Field name="anylight"/>
<Field name="abstract"/>
<Field name="title"/>
<Field name="altTitle"/>
<Field name="keywordType"/>
<!--<Field name="orgName"/>-->
<Field name="specificationTitle"/>
<Field name="levelName"/>
<Field name="subject"/>
</tokenized>
<!-- All Lucene numeric fields.
Using numeric fields increases the index size,
but can give better search results for numeric values.
Type attribute: int (default), double, long or float.
Precision attribute: see NumericUtils.PRECISION_STEP_DEFAULT.
-->
<numeric>
<Field name="denominator"/>
<Field name="feedbackCount"/>
<Field name="_rating"/>
<Field name="northBL" type="double"/>
<Field name="eastBL" type="double"/>
<Field name="southBL" type="double"/>
<Field name="westBL" type="double"/>
</numeric>
<!-- Lists all of the Lucene field names which will have similarity applied to them.
To disable, set enabled to false.
If enabled is false or not present, similarity is applied to all fields (except some special-case instances).
NOTE(review): the two statements about enabled="false" read ambiguously (disabling the
feature versus disabling only this field restriction). Confirm against the consumer.
-->
<fuzzyMatching enabled="true">
<Field name="any"/>
<Field name="anylight"/>
<Field name="title"/>
<Field name="altTitle"/>
<Field name="abstract"/>
<Field name="specificationTitle"/>
</fuzzyMatching>
</config>