Permalink
Browse files

Improve index configuration: whitespace; proper handling of nested xml...
  • Loading branch information...
1 parent 6c04b99 commit 2a97fa78f3010a99bd0bae96aab00c8852319dda @wolfgangmm wolfgangmm committed Aug 19, 2013
View
41 extensions/indexes/range/src/org/exist/indexing/range/ComplexRangeIndexConfigElement.java
@@ -2,18 +2,14 @@
import org.apache.log4j.Logger;
import org.apache.lucene.analysis.Analyzer;
-import org.exist.dom.QName;
import org.exist.storage.NodePath;
import org.exist.util.DatabaseConfigurationException;
import org.exist.xquery.value.Type;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
-import java.util.ArrayList;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
+import java.util.*;
public class ComplexRangeIndexConfigElement extends RangeIndexConfigElement {
@@ -25,17 +21,7 @@
public ComplexRangeIndexConfigElement(Element node, NodeList children, Map<String, String> namespaces)
throws DatabaseConfigurationException {
- super();
- String match = node.getAttribute("match");
- if (match != null) {
- try {
- path = new NodePath(namespaces, match, false);
- if (path.length() == 0)
- throw new DatabaseConfigurationException("Range index module: Invalid match path in collection config: " + match);
- } catch (IllegalArgumentException e) {
- throw new DatabaseConfigurationException("Range index module: invalid qname in configuration: " + e.getMessage());
- }
- }
+ super(node, namespaces);
for (int i = 0; i < children.getLength(); i++) {
Node child = children.item(i);
@@ -57,9 +43,9 @@ public boolean isComplex() {
@Override
public boolean match(NodePath other) {
- if (path.match(other))
- return true;
- return false;
+ if (isQNameIndex)
+ return other.getLastComponent().equalsSimple(path.getLastComponent());
+ return path.match(other);
}
@Override
@@ -68,15 +54,14 @@ public boolean find(NodePath other) {
}
@Override
- public TextCollector getCollector() {
- return new ComplexTextCollector(this);
+ public TextCollector getCollector(NodePath path) {
+ return new ComplexTextCollector(this, path);
}
@Override
public Analyzer getAnalyzer(String fieldName) {
- RangeIndexConfigField field = fields.get(fieldName);
- if (field != null) {
- return field.getAnalyzer();
+ if (fields.containsKey(fieldName)) {
+ return analyzer;
}
return null;
}
@@ -89,6 +74,14 @@ public RangeIndexConfigField getField(NodePath path) {
return null;
}
+ public RangeIndexConfigField getField(NodePath parentPath, NodePath path) {
+ for (RangeIndexConfigField field: fields.values()) {
+ if (field.match(parentPath, path))
+ return field;
+ }
+ return null;
+ }
+
@Override
public int getType(String fieldName) {
RangeIndexConfigField field = fields.get(fieldName);
View
20 extensions/indexes/range/src/org/exist/indexing/range/ComplexTextCollector.java
@@ -10,21 +10,23 @@
public class ComplexTextCollector implements TextCollector {
+ private NodePath parentPath;
private ComplexRangeIndexConfigElement config;
private List<Field> fields = new LinkedList<Field>();
private RangeIndexConfigField currentField = null;
private int length = 0;
- public ComplexTextCollector(ComplexRangeIndexConfigElement configuration) {
- config = configuration;
+ public ComplexTextCollector(ComplexRangeIndexConfigElement configuration, NodePath parentPath) {
+ this.config = configuration;
+ this.parentPath = new NodePath(parentPath);
}
@Override
public void startElement(QName qname, NodePath path) {
- RangeIndexConfigField fieldConf = config.getField(path);
+ RangeIndexConfigField fieldConf = config.getField(parentPath, path);
if (fieldConf != null) {
currentField = fieldConf;
- Field field = new Field(currentField.getName(), false);
+ Field field = new Field(currentField.getName(), false, fieldConf.whitespaceTreatment());
fields.add(field);
}
@@ -39,9 +41,9 @@ public void endElement(QName qname, NodePath path) {
@Override
public void attribute(AttrImpl attribute, NodePath path) {
- RangeIndexConfigField fieldConf = config.getField(path);
+ RangeIndexConfigField fieldConf = config.getField(parentPath, path);
if (fieldConf != null) {
- Field field = new Field(fieldConf.getName(), true);
+ Field field = new Field(fieldConf.getName(), true, fieldConf.whitespaceTreatment());
field.content.append(attribute.getValue());
fields.add(0, field);
}
@@ -51,7 +53,7 @@ public void attribute(AttrImpl attribute, NodePath path) {
public void characters(CharacterDataImpl text, NodePath path) {
if (currentField != null) {
Field field = fields.get(fields.size() - 1);
- if (!field.isAttribute()) {
+ if (!field.isAttribute() && (currentField.includeNested() || currentField.match(path))) {
field.content.append(text.getXMLString());
length += text.getXMLString().length();
}
@@ -66,4 +68,8 @@ public int length() {
public List<Field> getFields() {
return fields;
}
+
+ public ComplexRangeIndexConfigElement getConfig() {
+ return config;
+ }
}
View
23 extensions/indexes/range/src/org/exist/indexing/range/RangeIndexConfig.java
@@ -88,10 +88,31 @@ private void parseChildren(NodeList configNodes, Map<String, String> namespaces)
}
}
}
+ // default analyzer
analyzer = new KeywordAnalyzer();
}
- public Analyzer getAnalyzer() {
+ public Analyzer getDefaultAnalyzer() {
+ return analyzer;
+ }
+
+ public Analyzer getAnalyzer(QName qname, String fieldName) {
+ Analyzer analyzer = null;
+ if (qname != null) {
+ RangeIndexConfigElement idxConf = paths.get(qname);
+ if (idxConf != null) {
+ analyzer = idxConf.getAnalyzer(null);
+ }
+ } else {
+ for (RangeIndexConfigElement idxConf: paths.values()) {
+ if (idxConf.isComplex()) {
+ analyzer = idxConf.getAnalyzer(fieldName);
+ if (analyzer != null) {
+ break;
+ }
+ }
+ }
+ }
return analyzer;
}
View
148 extensions/indexes/range/src/org/exist/indexing/range/RangeIndexConfigElement.java
@@ -1,30 +1,21 @@
package org.exist.indexing.range;
import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.core.KeywordAnalyzer;
-import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
import org.apache.lucene.collation.CollationKeyAnalyzer;
import org.apache.lucene.document.*;
-import org.apache.lucene.index.Term;
-import org.apache.lucene.search.*;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.NumericUtils;
-import org.apache.lucene.util.Version;
-import org.exist.dom.DocumentSet;
import org.exist.dom.QName;
import org.exist.indexing.lucene.LuceneIndexConfig;
import org.exist.storage.NodePath;
import org.exist.util.Collations;
import org.exist.util.DatabaseConfigurationException;
-import org.exist.xquery.Constants;
+import org.exist.util.XMLString;
import org.exist.xquery.XPathException;
import org.exist.xquery.value.*;
import org.w3c.dom.Element;
import java.io.IOException;
-import java.io.StringReader;
import java.util.Map;
public class RangeIndexConfigElement {
@@ -41,11 +32,10 @@
protected NodePath path = null;
private int type = Type.STRING;
private RangeIndexConfigElement nextConfig = null;
- private boolean isQNameIndex = false;
+ protected boolean isQNameIndex = false;
protected Analyzer analyzer = null;
-
- public RangeIndexConfigElement() {
- }
+ protected boolean includeNested = false;
+ protected int wsTreatment = XMLString.SUPPRESS_NONE;
public RangeIndexConfigElement(Element node, Map<String, String> namespaces) throws DatabaseConfigurationException {
String match = node.getAttribute("match");
@@ -59,7 +49,8 @@ public RangeIndexConfigElement(Element node, Map<String, String> namespaces) thr
}
} else if (node.hasAttribute("qname")) {
QName qname = LuceneIndexConfig.parseQName(node, namespaces);
- path = new NodePath(qname);
+ path = new NodePath(NodePath.SKIP);
+ path.addComponent(qname);
isQNameIndex = true;
}
String typeStr = node.getAttribute("type");
@@ -73,16 +64,27 @@ public RangeIndexConfigElement(Element node, Map<String, String> namespaces) thr
String collation = node.getAttribute("collation");
if (collation != null && collation.length() > 0) {
try {
- analyzer = new CollationKeyAnalyzer(Version.LUCENE_43, Collations.getCollationFromURI(null, collation));
+ analyzer = new CollationKeyAnalyzer(RangeIndex.LUCENE_VERSION_IN_USE, Collations.getCollationFromURI(null, collation));
} catch (XPathException e) {
throw new DatabaseConfigurationException(e.getMessage(), e);
}
}
+ String nested = node.getAttribute("nested");
+ includeNested = (nested == null || nested.equalsIgnoreCase("yes"));
+
+ // select whitespace handling: "trim" -> SUPPRESS_BOTH (leading/trailing), "normalize" -> NORMALIZE (SUPPRESS_BOTH plus COLLAPSE_WS)
+ String whitespace = node.getAttribute("whitespace");
+ if (whitespace != null) {
+ if ("trim".equalsIgnoreCase(whitespace)) {
+ wsTreatment = XMLString.SUPPRESS_BOTH;
+ } else if ("normalize".equalsIgnoreCase(whitespace)) {
+ wsTreatment = XMLString.NORMALIZE;
+ }
+ }
}
public Field convertToField(String fieldName, String content) throws IOException {
int fieldType = getType(fieldName);
- Analyzer analyzer = getAnalyzer(fieldName);
try {
switch (fieldType) {
case Type.INTEGER:
@@ -104,11 +106,6 @@ public Field convertToField(String fieldName, String content) throws IOException
float fvalue = Float.parseFloat(content);
return new FloatField(fieldName, fvalue, FloatField.TYPE_NOT_STORED);
default:
- // default: treat as text string
- if (analyzer != null) {
- TokenStream stream = analyzer.tokenStream(fieldName, new StringReader(content));
- return new TextField(fieldName, stream);
- }
return new TextField(fieldName, content, Field.Store.NO);
}
} catch (NumberFormatException e) {
@@ -117,73 +114,6 @@ public Field convertToField(String fieldName, String content) throws IOException
return null;
}
- public static Query toQuery(String field, AtomicValue content, RangeIndex.Operator operator,
- DocumentSet docs, RangeIndexWorker worker) throws XPathException {
- final int type = content.getType();
- BytesRef bytes;
- if (Type.subTypeOf(type, Type.STRING)) {
- BytesRef key = worker.analyzeContent(field, content, docs);
- switch (operator) {
- case EQ:
- return new TermQuery(new Term(field, key));
- case STARTS_WITH:
- return new PrefixQuery(new Term(field, key));
- case ENDS_WITH:
- bytes = new BytesRef("*");
- bytes.append(key);
- return new WildcardQuery(new Term(field, bytes));
- case CONTAINS:
- bytes = new BytesRef("*");
- bytes.append(key);
- bytes.append(new BytesRef("*"));
- return new WildcardQuery(new Term(field, bytes));
- }
- }
- if (operator == RangeIndex.Operator.EQ) {
- return new TermQuery(new Term(field, convertToBytes(content)));
- }
- final boolean includeUpper = operator == RangeIndex.Operator.LE;
- final boolean includeLower = operator == RangeIndex.Operator.GE;
- switch (type) {
- case Type.INTEGER:
- case Type.LONG:
- case Type.UNSIGNED_LONG:
- if (operator == RangeIndex.Operator.LT || operator == RangeIndex.Operator.LE) {
- return NumericRangeQuery.newLongRange(field, null, ((NumericValue)content).getLong(), includeLower, includeUpper);
- } else {
- return NumericRangeQuery.newLongRange(field, ((NumericValue)content).getLong(), null, includeLower, includeUpper);
- }
- case Type.INT:
- case Type.UNSIGNED_INT:
- case Type.SHORT:
- case Type.UNSIGNED_SHORT:
- if (operator == RangeIndex.Operator.LT || operator == RangeIndex.Operator.LE) {
- return NumericRangeQuery.newIntRange(field, null, ((NumericValue) content).getInt(), includeLower, includeUpper);
- } else {
- return NumericRangeQuery.newIntRange(field, ((NumericValue) content).getInt(), null, includeLower, includeUpper);
- }
- case Type.DECIMAL:
- case Type.DOUBLE:
- if (operator == RangeIndex.Operator.LT || operator == RangeIndex.Operator.LE) {
- return NumericRangeQuery.newDoubleRange(field, null, ((NumericValue) content).getDouble(), includeLower, includeUpper);
- } else {
- return NumericRangeQuery.newDoubleRange(field, ((NumericValue) content).getDouble(), null, includeLower, includeUpper);
- }
- case Type.FLOAT:
- if (operator == RangeIndex.Operator.LT || operator == RangeIndex.Operator.LE) {
- return NumericRangeQuery.newFloatRange(field, null, (float) ((NumericValue) content).getDouble(), includeLower, includeUpper);
- } else {
- return NumericRangeQuery.newFloatRange(field, (float) ((NumericValue) content).getDouble(), null, includeLower, includeUpper);
- }
- default:
- if (operator == RangeIndex.Operator.LT || operator == RangeIndex.Operator.LE) {
- return new TermRangeQuery(field, null, convertToBytes(content), includeLower, includeUpper);
- } else {
- return new TermRangeQuery(field, convertToBytes(content), null, includeLower, includeUpper);
- }
- }
- }
-
public static BytesRef convertToBytes(AtomicValue content) throws XPathException {
BytesRef bytes;
switch(content.getType()) {
@@ -220,44 +150,12 @@ public static BytesRef convertToBytes(AtomicValue content) throws XPathException
}
}
- public static Term convertToTerm(String fieldName, AtomicValue content) throws XPathException {
- BytesRef bytes;
- switch(content.getType()) {
- case Type.INTEGER:
- case Type.LONG:
- case Type.UNSIGNED_LONG:
- bytes = new BytesRef(NumericUtils.BUF_SIZE_LONG);
- NumericUtils.longToPrefixCoded(((IntegerValue)content).getLong(), 0, bytes);
- return new Term(fieldName, bytes);
- case Type.SHORT:
- case Type.UNSIGNED_SHORT:
- case Type.INT:
- case Type.UNSIGNED_INT:
- bytes = new BytesRef(NumericUtils.BUF_SIZE_INT);
- NumericUtils.intToPrefixCoded(((IntegerValue)content).getInt(), 0, bytes);
- return new Term(fieldName, bytes);
- case Type.DECIMAL:
- long dv = NumericUtils.doubleToSortableLong(((DecimalValue)content).getDouble());
- bytes = new BytesRef(NumericUtils.BUF_SIZE_LONG);
- NumericUtils.longToPrefixCoded(dv, 0, bytes);
- return new Term(fieldName, bytes);
- case Type.DOUBLE:
- long lv = NumericUtils.doubleToSortableLong(((DoubleValue)content).getDouble());
- bytes = new BytesRef(NumericUtils.BUF_SIZE_LONG);
- NumericUtils.longToPrefixCoded(lv, 0, bytes);
- return new Term(fieldName, bytes);
- case Type.FLOAT:
- int iv = NumericUtils.floatToSortableInt(((FloatValue)content).getValue());
- bytes = new BytesRef(NumericUtils.BUF_SIZE_INT);
- NumericUtils.longToPrefixCoded(iv, 0, bytes);
- return new Term(fieldName, bytes);
- default:
- return new Term(fieldName, content.getStringValue());
- }
+ public TextCollector getCollector(NodePath path) {
+ return new SimpleTextCollector(this, includeNested, wsTreatment);
}
- public TextCollector getCollector() {
- return new SimpleTextCollector();
+ public Analyzer getAnalyzer() {
+ return analyzer;
}
public Analyzer getAnalyzer(String field) {
View
56 extensions/indexes/range/src/org/exist/indexing/range/RangeIndexConfigField.java
@@ -3,9 +3,12 @@
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.collation.CollationKeyAnalyzer;
import org.apache.lucene.util.Version;
+import org.exist.dom.QName;
+import org.exist.indexing.lucene.LuceneIndexConfig;
import org.exist.storage.NodePath;
import org.exist.util.Collations;
import org.exist.util.DatabaseConfigurationException;
+import org.exist.util.XMLString;
import org.exist.xquery.XPathException;
import org.exist.xquery.value.Type;
import org.w3c.dom.Element;
@@ -16,25 +19,34 @@
private String name;
private NodePath path = null;
+ private NodePath relPath = null;
private int type = Type.STRING;
- private Analyzer analyzer = null;
+ protected boolean includeNested = false;
+ protected int wsTreatment = XMLString.SUPPRESS_NONE;
+ protected boolean isQNameIndex = false;
public RangeIndexConfigField(NodePath parentPath, Element elem, Map<String, String> namespaces) throws DatabaseConfigurationException {
name = elem.getAttribute("name");
+ path = parentPath;
if (name == null || name.length() == 0) {
throw new DatabaseConfigurationException("Range index module: field element requires a name attribute");
}
String match = elem.getAttribute("match");
- if (match != null) {
+ if (match != null && match.length() > 0) {
try {
- NodePath relPath = new NodePath(namespaces, match);
+ relPath = new NodePath(namespaces, match);
if (relPath.length() == 0)
throw new DatabaseConfigurationException("Range index module: Invalid match path in collection config: " + match);
path = new NodePath(parentPath);
path.append(relPath);
} catch (IllegalArgumentException e) {
throw new DatabaseConfigurationException("Range index module: invalid qname in configuration: " + e.getMessage());
}
+ } else if (elem.hasAttribute("qname")) {
+ QName qname = LuceneIndexConfig.parseQName(elem, namespaces);
+ path = new NodePath(qname);
+ relPath = path;
+ isQNameIndex = true;
}
String typeStr = elem.getAttribute("type");
if (typeStr != null && typeStr.length() > 0) {
@@ -44,12 +56,16 @@ public RangeIndexConfigField(NodePath parentPath, Element elem, Map<String, Stri
throw new DatabaseConfigurationException("Invalid type declared for range index on " + match + ": " + typeStr);
}
}
- String collation = elem.getAttribute("collation");
- if (collation != null && collation.length() > 0) {
- try {
- analyzer = new CollationKeyAnalyzer(Version.LUCENE_43, Collations.getCollationFromURI(null, collation));
- } catch (XPathException e) {
- throw new DatabaseConfigurationException(e.getMessage(), e);
+ String nested = elem.getAttribute("nested");
+ includeNested = (nested == null || nested.equalsIgnoreCase("yes"));
+
+ // select whitespace handling: "trim" -> SUPPRESS_BOTH (leading/trailing), "normalize" -> NORMALIZE (SUPPRESS_BOTH plus COLLAPSE_WS)
+ String whitespace = elem.getAttribute("whitespace");
+ if (whitespace != null) {
+ if ("trim".equalsIgnoreCase(whitespace)) {
+ wsTreatment = XMLString.SUPPRESS_BOTH;
+ } else if ("normalize".equalsIgnoreCase(whitespace)) {
+ wsTreatment = XMLString.NORMALIZE;
}
}
}
@@ -62,15 +78,29 @@ public NodePath getPath() {
return path;
}
- public int getType() {
- return type;
+ public NodePath getRelPath() {
+ return relPath;
}
- public Analyzer getAnalyzer() {
- return analyzer;
+ public int getType() {
+ return type;
}
public boolean match(NodePath other) {
return path.match(other);
}
+
+ public boolean match(NodePath parentPath, NodePath other) {
+ NodePath absPath = new NodePath(parentPath);
+ absPath.append(relPath);
+ return absPath.match(other);
+ }
+
+ public int whitespaceTreatment() {
+ return wsTreatment;
+ }
+
+ public boolean includeNested() {
+ return includeNested;
+ }
}
View
108 extensions/indexes/range/src/org/exist/indexing/range/RangeIndexWorker.java
@@ -3,12 +3,9 @@
import org.apache.log4j.Logger;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
-import org.apache.lucene.collation.tokenattributes.CollatedTermAttributeImpl;
import org.apache.lucene.document.*;
import org.apache.lucene.index.*;
-import org.apache.lucene.queries.TermsFilter;
import org.apache.lucene.search.*;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
@@ -74,6 +71,77 @@ public RangeIndexWorker(RangeIndex index, DBBroker broker) {
this.broker = broker;
}
+ public Query toQuery(String field, QName qname, AtomicValue content, RangeIndex.Operator operator, DocumentSet docs) throws XPathException {
+ final int type = content.getType();
+ BytesRef bytes;
+ if (Type.subTypeOf(type, Type.STRING)) {
+ BytesRef key = analyzeContent(field, qname, content, docs);
+ WildcardQuery query;
+ switch (operator) {
+ case EQ:
+ return new TermQuery(new Term(field, key));
+ case STARTS_WITH:
+ return new PrefixQuery(new Term(field, key));
+ case ENDS_WITH:
+ bytes = new BytesRef("*");
+ bytes.append(key);
+ query = new WildcardQuery(new Term(field, bytes));
+ query.setRewriteMethod(MultiTermQuery.CONSTANT_SCORE_FILTER_REWRITE);
+ return query;
+ case CONTAINS:
+ bytes = new BytesRef("*");
+ bytes.append(key);
+ bytes.append(new BytesRef("*"));
+ query = new WildcardQuery(new Term(field, bytes));
+ query.setRewriteMethod(MultiTermQuery.CONSTANT_SCORE_FILTER_REWRITE);
+ return query;
+ }
+ }
+ if (operator == RangeIndex.Operator.EQ) {
+ return new TermQuery(new Term(field, RangeIndexConfigElement.convertToBytes(content)));
+ }
+ final boolean includeUpper = operator == RangeIndex.Operator.LE;
+ final boolean includeLower = operator == RangeIndex.Operator.GE;
+ switch (type) {
+ case Type.INTEGER:
+ case Type.LONG:
+ case Type.UNSIGNED_LONG:
+ if (operator == RangeIndex.Operator.LT || operator == RangeIndex.Operator.LE) {
+ return NumericRangeQuery.newLongRange(field, null, ((NumericValue)content).getLong(), includeLower, includeUpper);
+ } else {
+ return NumericRangeQuery.newLongRange(field, ((NumericValue)content).getLong(), null, includeLower, includeUpper);
+ }
+ case Type.INT:
+ case Type.UNSIGNED_INT:
+ case Type.SHORT:
+ case Type.UNSIGNED_SHORT:
+ if (operator == RangeIndex.Operator.LT || operator == RangeIndex.Operator.LE) {
+ return NumericRangeQuery.newIntRange(field, null, ((NumericValue) content).getInt(), includeLower, includeUpper);
+ } else {
+ return NumericRangeQuery.newIntRange(field, ((NumericValue) content).getInt(), null, includeLower, includeUpper);
+ }
+ case Type.DECIMAL:
+ case Type.DOUBLE:
+ if (operator == RangeIndex.Operator.LT || operator == RangeIndex.Operator.LE) {
+ return NumericRangeQuery.newDoubleRange(field, null, ((NumericValue) content).getDouble(), includeLower, includeUpper);
+ } else {
+ return NumericRangeQuery.newDoubleRange(field, ((NumericValue) content).getDouble(), null, includeLower, includeUpper);
+ }
+ case Type.FLOAT:
+ if (operator == RangeIndex.Operator.LT || operator == RangeIndex.Operator.LE) {
+ return NumericRangeQuery.newFloatRange(field, null, (float) ((NumericValue) content).getDouble(), includeLower, includeUpper);
+ } else {
+ return NumericRangeQuery.newFloatRange(field, (float) ((NumericValue) content).getDouble(), null, includeLower, includeUpper);
+ }
+ default:
+ if (operator == RangeIndex.Operator.LT || operator == RangeIndex.Operator.LE) {
+ return new TermRangeQuery(field, null, RangeIndexConfigElement.convertToBytes(content), includeLower, includeUpper);
+ } else {
+ return new TermRangeQuery(field, RangeIndexConfigElement.convertToBytes(content), null, includeLower, includeUpper);
+ }
+ }
+ }
+
@Override
public String getIndexId() {
return index.getIndexId();
@@ -325,18 +393,22 @@ private void write() {
for (TextCollector.Field field : pending.getCollector().getFields()) {
String contentField;
if (field.isNamed())
- contentField = field.name;
+ contentField = field.getName();
else
contentField = LuceneUtil.encodeQName(pending.getQName(), index.getBrokerPool().getSymbols());
- Field fld = pending.getConfig().convertToField(contentField, field.content.toString());
+ Field fld = pending.getConfig().convertToField(contentField, field.getContent().toString());
if (fld != null) {
doc.add(fld);
}
}
fDocIdIdx.setIntValue(currentDoc.getDocId());
doc.add(fDocIdIdx);
- writer.addDocument(doc, config.getAnalyzer());
+ Analyzer analyzer = pending.getConfig().getAnalyzer();
+ if (analyzer == null) {
+ analyzer = config.getDefaultAnalyzer();
+ }
+ writer.addDocument(doc, analyzer);
}
} catch (IOException e) {
LOG.warn("An exception was caught while indexing document: " + e.getMessage(), e);
@@ -359,11 +431,11 @@ public NodeSet query(int contextId, DocumentSet docs, NodeSet contextSet, List<Q
if (keys.length > 1) {
BooleanQuery bool = new BooleanQuery();
for (AtomicValue key: keys) {
- bool.add(RangeIndexConfigElement.toQuery(field, key, operator, docs, this), BooleanClause.Occur.SHOULD);
+ bool.add(toQuery(field, qname, key, operator, docs), BooleanClause.Occur.SHOULD);
}
query = bool;
} else {
- query = RangeIndexConfigElement.toQuery(field, keys[0], operator, docs, this);
+ query = toQuery(field, qname, keys[0], operator, docs);
}
resultSet = doQuery(contextId, docs, contextSet, axis, searcher, qname, query, null);
@@ -395,12 +467,12 @@ public NodeSet queryField(int contextId, DocumentSet docs, NodeSet contextSet, S
bool.setMinimumNumberShouldMatch(1);
for (SequenceIterator ki = keys[j].iterate(); ki.hasNext(); ) {
Item key = ki.nextItem();
- Query q = RangeIndexConfigElement.toQuery(field, key.atomize(), operator, docs, this);
+ Query q = toQuery(field, null, key.atomize(), operator, docs);
bool.add(q, BooleanClause.Occur.SHOULD);
}
query.add(bool, BooleanClause.Occur.MUST);
} else {
- Query q = RangeIndexConfigElement.toQuery(field, keys[j].itemAt(0).atomize(), operator, docs, this);
+ Query q = toQuery(field, null, keys[j].itemAt(0).atomize(), operator, docs);
query.add(q, BooleanClause.Occur.MUST);
}
}
@@ -602,9 +674,13 @@ public Status needsField(FieldInfo fieldInfo) throws IOException {
return indexes;
}
- protected BytesRef analyzeContent(String field, AtomicValue content, DocumentSet docs) throws XPathException {
+ protected BytesRef analyzeContent(String field, QName qname, AtomicValue content, DocumentSet docs) throws XPathException {
+ Analyzer analyzer = getAnalyzer(qname, field, docs);
+ if (analyzer == null) {
+ return new BytesRef(content.getStringValue());
+ }
try {
- TokenStream stream = getAnalyzer(docs).tokenStream(field, new StringReader(content.getStringValue()));
+ TokenStream stream = analyzer.tokenStream(field, new StringReader(content.getStringValue()));
TermToBytesRefAttribute termAttr = stream.addAttribute(TermToBytesRefAttribute.class);
BytesRef token = null;
try {
@@ -627,20 +703,20 @@ protected BytesRef analyzeContent(String field, AtomicValue content, DocumentSet
* Return the analyzer to be used for the given field or qname. Either field
* or qname should be specified.
*/
- private Analyzer getAnalyzer(DocumentSet docs) {
+ private Analyzer getAnalyzer(QName qname, String fieldName, DocumentSet docs) {
for (Iterator<Collection> i = docs.getCollectionIterator(); i.hasNext(); ) {
Collection collection = i.next();
IndexSpec idxConf = collection.getIndexConfiguration(broker);
if (idxConf != null) {
RangeIndexConfig config = (RangeIndexConfig) idxConf.getCustomIndexSpec(RangeIndex.ID);
if (config != null) {
- Analyzer analyzer = config.getAnalyzer();
+ Analyzer analyzer = config.getAnalyzer(qname, fieldName);
if (analyzer != null)
return analyzer;
}
}
}
- return index.getDefaultAnalyzer();
+ return null;
}
private static boolean matchQName(QName qname, QName candidate) {
@@ -668,7 +744,7 @@ public void startElement(Txn transaction, ElementImpl element, NodePath path) {
while (configIter.hasNext()) {
RangeIndexConfigElement configuration = configIter.next();
if (configuration.match(path)) {
- contentStack.push(configuration.getCollector());
+ contentStack.push(configuration.getCollector(path));
}
}
}
View
14 extensions/indexes/range/src/org/exist/indexing/range/SimpleTextCollector.java
@@ -11,9 +11,15 @@
public class SimpleTextCollector implements TextCollector {
+ private boolean includeNested = true;
+ private RangeIndexConfigElement config = null;
private XMLString buf = new XMLString();
+ private int wsTreatment = XMLString.SUPPRESS_NONE;
- public SimpleTextCollector() {
+ public SimpleTextCollector(RangeIndexConfigElement config, boolean includeNested, int wsTreatment) {
+ this.config = config;
+ this.includeNested = includeNested;
+ this.wsTreatment = wsTreatment;
}
public SimpleTextCollector(String content) {
@@ -30,7 +36,9 @@ public void endElement(QName qname, NodePath path) {
@Override
public void characters(CharacterDataImpl text, NodePath path) {
- buf.append(text.getXMLString());
+ if (includeNested || config.match(path)) {
+ buf.append(text.getXMLString());
+ }
}
@Override
@@ -45,7 +53,7 @@ public int length() {
@Override
public List<Field> getFields() {
List<Field> fields = new ArrayList<Field>(1);
- fields.add(new Field(buf));
+ fields.add(new Field(buf, wsTreatment));
return fields;
}
}
View
24 extensions/indexes/range/src/org/exist/indexing/range/TextCollector.java
@@ -24,22 +24,36 @@
public List<Field> getFields();
public static class Field {
- final boolean attribute;
- final String name;
- final XMLString content;
+ protected final boolean attribute;
+ protected final String name;
+ protected final XMLString content;
+ protected final int wsTreatment;
- public Field(XMLString content) {
+ public Field(XMLString content, int wsTreatment) {
this.content = content;
this.attribute = false;
this.name = null;
+ this.wsTreatment = wsTreatment;
}
- public Field(String name, boolean isAttribute) {
+ public Field(String name, boolean isAttribute, int wsTreatment) {
this.name = name;
this.attribute = isAttribute;
+ this.wsTreatment = wsTreatment;
this.content = new XMLString();
}
+ public String getContent() {
+ if (wsTreatment != XMLString.SUPPRESS_NONE) {
+ return content.normalize(wsTreatment).toString();
+ }
+ return content.toString();
+ }
+
+ public String getName() {
+ return name;
+ }
+
public boolean isNamed() {
return name != null;
}
View
177 extensions/indexes/range/test/src/xquery/fields.xql
@@ -0,0 +1,177 @@
+xquery version "3.0";
+
+module namespace rt="http://exist-db.org/xquery/range/test/fields";
+
+import module namespace range="http://exist-db.org/xquery/range" at "java:org.exist.xquery.modules.range.RangeIndexModule";
+import module namespace test="http://exist-db.org/xquery/xqsuite" at "resource:org/exist/xquery/lib/xqsuite/xqsuite.xql";
+
+declare namespace tei="http://www.tei-c.org/ns/1.0";
+
+declare variable $rt:COLLECTION_CONFIG :=
+ <collection xmlns="http://exist-db.org/collection-config/1.0">
+ <index xmlns:xs="http://www.w3.org/2001/XMLSchema"
+ xmlns:tei="http://www.tei-c.org/ns/1.0">
+ <fulltext default="none" attributes="false"/>
+ <range>
+ <create match="//tei:div">
+ <field name="xml-id" match="@xml:id" type="xs:string"/>
+ <field name="line-id" match="tei:sp/tei:l/@xml:id" type="xs:string"/>
+ <field name="speaker" match="tei:sp/tei:speaker" type="xs:string"/>
+ <field name="stage" match="tei:stage" type="xs:string"/>
+ <field name="head" match="tei:head" type="xs:string"/>
+ </create>
+ </range>
+ </index>
+ </collection>;
+
+declare variable $rt:DATA :=
+ <TEI xmlns="http://www.tei-c.org/ns/1.0" xml:id="sha-mac">
+ <text>
+ <body>
+ <div xml:id="sha-mac1">
+ <head>Act 1</head>
+ <div xml:id="sha-mac101">
+ <head>Act 1, Scene 1</head>
+ <stage>A desert place. Thunder and lightning.</stage>
+ <stage>Enter three Witches.</stage>
+ <sp who="mac-first-witch.">
+ <speaker>First Witch</speaker>
+ <l xml:id="sha-mac101001" n="1">When shall we three meet again</l>
+ <l xml:id="sha-mac101002" n="2">In thunder, lightning, or in rain?</l>
+ </sp>
+ <sp who="mac-sec.-witch.">
+ <speaker>Second Witch</speaker>
+ <l xml:id="sha-mac101003" n="3">When the hurlyburly's done,</l>
+ <l xml:id="sha-mac101004" n="4">When the battle's lost and won.</l>
+ </sp>
+ <sp who="mac-third-witch.">
+ <speaker>Third Witch</speaker>
+ <l xml:id="sha-mac101005" n="5">That will be ere the set of sun.</l>
+ </sp>
+ </div>
+ </div>
+ </body>
+ </text>
+ </TEI>;
+
+declare variable $rt:COLLECTION_NAME := "fieldstest";
+declare variable $rt:COLLECTION := "/db/" || $rt:COLLECTION_NAME;
+
+declare
+ %test:setUp
+function rt:setup() {
+ xmldb:create-collection("/db/system/config/db", $rt:COLLECTION_NAME),
+ xmldb:store("/db/system/config/db/" || $rt:COLLECTION_NAME, "collection.xconf", $rt:COLLECTION_CONFIG),
+ xmldb:create-collection("/db", $rt:COLLECTION_NAME),
+ xmldb:store($rt:COLLECTION, "test.xml", $rt:DATA)
+};
+
+declare
+ %test:tearDown
+function rt:cleanup() {
+ xmldb:remove($rt:COLLECTION),
+ xmldb:remove("/db/system/config/db/" || $rt:COLLECTION_NAME)
+};
+
+declare
+ %test:args("sha-mac101")
+ %test:assertEquals(1)
+ %test:args("sha-mac101005")
+ %test:assertEquals(0)
+function rt:field-id($id as xs:string) {
+ count(collection($rt:COLLECTION)//range:field-eq("xml-id", $id))
+};
+
+declare
+ %test:args("sha-mac101")
+ %test:assertEquals(1)
+ %test:args("sha-mac101005")
+ %test:assertEquals(0)
+function rt:field-div-eq($id as xs:string) {
+ count(collection($rt:COLLECTION)//tei:div[@xml:id = $id])
+};
+
+declare
+ %test:stats
+ %test:args("sha-mac101")
+ %test:assertXPath("$result//stats:index[@type = 'new-range'][@optimization = 2]")
+function rt:field-div-eq-optimize($id as xs:string) {
+ count(collection($rt:COLLECTION)//tei:div[@xml:id = $id])
+};
+
+declare
+ %test:stats
+ %test:args("sha-mac101005")
+ %test:assertXPath("not($result//stats:index[@type = 'new-range'][@optimization = 2])")
+function rt:field-l-eq-optimize-no($id as xs:string) {
+ collection($rt:COLLECTION)//tei:sp/tei:l[@xml:id = $id]
+};
+
+declare
+ %test:stats
+ %test:args("sha-mac101005")
+ %test:assertXPath("$result//stats:index[@type = 'new-range'][@optimization = 2]")
+function rt:field-l-eq-optimize($id as xs:string) {
+ collection($rt:COLLECTION)//tei:div[tei:sp/tei:l/@xml:id = $id]
+};
+
+declare
+ %test:args("sha-mac101")
+ %test:assertEquals(0)
+ %test:args("sha-mac101005")
+ %test:assertEquals(1)
+function rt:field-l-eq($id as xs:string) {
+ count(collection($rt:COLLECTION)//tei:sp/tei:l[@xml:id = $id])
+};
+
+declare
+ %test:args("First Witch")
+ %test:assertEquals(1)
+function rt:field-speaker-eq($id as xs:string) {
+ count(collection($rt:COLLECTION)//tei:div[tei:sp/tei:speaker = $id])
+};
+
+declare
+ %test:stats
+ %test:args("First Witch")
+ %test:assertXPath("$result//stats:index[@type = 'new-range'][@optimization = 2]")
+function rt:field-speaker-eq-optimize($id as xs:string) {
+ count(collection($rt:COLLECTION)//tei:div[tei:sp/tei:speaker = $id])
+};
+
+declare
+ %test:args("Enter three Witches.")
+ %test:assertEquals(1)
+function rt:field-stage-eq($stage as xs:string) {
+ count(collection($rt:COLLECTION)//tei:div[tei:stage = $stage])
+};
+
+declare
+ %test:stats
+ %test:args("Enter three Witches.")
+ %test:assertXPath("$result//stats:index[@type = 'new-range'][@optimization = 2]")
+function rt:field-stage-eq-optimize($stage as xs:string) {
+ count(collection($rt:COLLECTION)//tei:div[tei:stage = $stage])
+};
+
+declare
+ %test:args("Act 1, Scene 1")
+ %test:assertEquals(1)
+function rt:field-head-eq($head as xs:string) {
+ count(collection($rt:COLLECTION)//tei:div[tei:head = $head])
+};
+
+declare
+ %test:args("Scene 1")
+ %test:assertEquals(1)
+function rt:field-head-ends-with($head as xs:string) {
+ count(collection($rt:COLLECTION)//tei:div[ends-with(tei:head, $head)])
+};
+
+declare
+ %test:stats
+ %test:args("Scene 1")
+ %test:assertXPath("$result//stats:index[@type = 'new-range'][@optimization = 2]")
+function rt:field-head-ends-with-optimize($head as xs:string) {
+ count(collection($rt:COLLECTION)//tei:div[ends-with(tei:head, $head)])
+};
View
11 extensions/indexes/range/test/src/xquery/range.xql
@@ -19,7 +19,7 @@ declare variable $rt:COLLECTION_CONFIG :=
<field name="address-code" match="city/@code" type="xs:integer"/>
</create>
<create match="/test/address/name"/>
- <create match="/test/address/city" type="xs:string"/>
+ <create match="/test/address/city" type="xs:string" collation="?lang=de-DE&amp;strength=primary"/>
<create match="/test/address/city/@code" type="xs:integer"/>
<create qname="@id" type="xs:string"/>
</range>
@@ -107,6 +107,15 @@ function rt:equality-qname-string-attribute($id as xs:string) {
//address[range:eq(@id, $id)]/name/text()
};
+(:~
+ : Returns the city text of every address whose city matches $name via range:eq().
+ : The index on /test/address/city is configured in $rt:COLLECTION_CONFIG with
+ : collation "?lang=de-DE&amp;strength=primary"; the annotations expect
+ : "russelsheim" to find "Rüsselsheim" and "almweide" to find "Almweide".
+ : NOTE(review): presumably primary strength makes the lookup ignore case and
+ : diacritics; confirm against the collation documentation.
+ :
+ : @param $name the city name to look up
+ : @return the text nodes of the matching city elements
+ :)
+declare
+ %test:args("russelsheim")
+ %test:assertEquals("Rüsselsheim")
+ %test:args("almweide")
+ %test:assertEquals("Almweide")
+function rt:equality-string-collation($name as xs:string) {
+ //address[range:eq(city, $name)]/city/text()
+};
+
declare
%test:args("Berta Muh")
%test:assertEquals("Almweide")
View
2 src/org/exist/storage/NodePath.java
@@ -158,7 +158,7 @@ public final boolean match(NodePath other, int j) {
skip = true;
}
if((components[i] == WILDCARD || other.components[j].compareTo(components[i]) == 0) &&
- (j + 1 == other.pos || other.components[j + 1].compareTo(components[i]) != 0)) {
+ (!skip || j + 1 == other.pos || other.components[j + 1].compareTo(components[i]) != 0)) {
++i;
skip = false;
} else if(skip) {
View
26 src/org/exist/util/XMLString.java
@@ -32,7 +32,9 @@
public final static int SUPPRESS_NONE = 0;
public final static int SUPPRESS_LEADING_WS = 0x01;
public final static int SUPPRESS_TRAILING_WS = 0x02;
+ public final static int COLLAPSE_WS = 0x04;
public final static int SUPPRESS_BOTH = SUPPRESS_LEADING_WS | SUPPRESS_TRAILING_WS;
+ public final static int NORMALIZE = SUPPRESS_LEADING_WS | SUPPRESS_TRAILING_WS | COLLAPSE_WS;
public final static int DEFAULT_CAPACITY = 16;
@@ -118,6 +120,30 @@ public final XMLString normalize(int mode) {
--length_;
}
}
+ if ((mode & COLLAPSE_WS) != 0) {
+ XMLString copy = new XMLString(length_);
+ boolean inWhitespace = true;
+ for (int i = start_; i < start_ + length_; i++) {
+ switch (value_[i]) {
+ case '\n':
+ case '\r':
+ case '\t':
+ case ' ':
+ if (inWhitespace) {
+ // remove the whitespace
+ } else {
+ copy.append(' ');
+ inWhitespace = true;
+ }
+ break;
+ default:
+ copy.append(value_[i]);
+ inWhitespace = false;
+ break;
+ }
+ }
+ return copy;
+ }
return this;
}
View
16 test/src/org/exist/storage/NodePathTest.java
@@ -39,6 +39,14 @@ public void basicPaths() {
assertTrue(path.match(new NodePath(null, "/a/b/c")));
assertTrue(path.match(new NodePath(null, "/a/b/c/d")));
assertTrue(path.match(new NodePath(null, "/a/b/c/d/e")));
+
+ path = new NodePath(null, "/a/a/b");
+ assertTrue(path.match(new NodePath(null, "/a/a/b")));
+ assertFalse(path.match(new NodePath(null, "/a/b/c")));
+
+ path = new NodePath(null, "/a/b/c/c");
+ assertTrue(path.match(new NodePath(null, "/a/b/c/c")));
+ assertFalse(path.match(new NodePath(null, "/a/b/c/d")));
}
@Test
@@ -50,6 +58,11 @@ public void testWildcards() {
assertFalse(path.match(new NodePath(null, "/a/b/c/d")));
assertTrue(path.match(new NodePath(null, "/a/c")));
+ path = new NodePath(null, "//c");
+ assertTrue(path.match(new NodePath(null, "/a/b/c")));
+ assertTrue(path.match(new NodePath(null, "/a/b/c/c/c")));
+ assertFalse(path.match(new NodePath(null, "/a/b/b")));
+
path = new NodePath(null, "/a/b/*", true);
assertTrue(path.match(new NodePath(null, "/a/b/c")));
assertTrue(path.match(new NodePath(null, "/a/b/c/d")));
@@ -62,5 +75,8 @@ public void testWildcards() {
path = new NodePath(null, "/a/b//*", true);
assertTrue(path.match(new NodePath(null, "/a/b/c")));
assertTrue(path.match(new NodePath(null, "/a/b/c/d")));
+
+ path = new NodePath(null, "//c/d");
+ assertTrue(path.match(new NodePath(null, "/a/b/c/c/d")));
}
}
View
10 test/src/org/exist/util/XMLStringTest.java
@@ -38,6 +38,16 @@ public void testNormalize() {
assertEquals(r, "Hello World");
}
+ /**
+ * Verifies that normalizing with {@link XMLString#NORMALIZE} strips leading and
+ * trailing whitespace and collapses an interior run of mixed whitespace
+ * characters into a single space.
+ */
+ public void testCollapse() {
+ XMLString s = new XMLString();
+ // The interior run (space, tab, CR, LF, space) is what exercises COLLAPSE_WS;
+ // with only a single space between the words the test would pass for SUPPRESS_BOTH too.
+ char[] ch = "\n Hello \t\r\n World\r\n".toCharArray();
+ s.append(ch, 0, ch.length);
+ s = s.normalize(XMLString.NORMALIZE);
+ // Expected value goes first so a failure message reports the values the right way round.
+ assertEquals("Hello World", s.toString());
+ }
+
public void testSubstring() {
XMLString s = new XMLString();
char ch[] = "\n Hello World\r\n".toCharArray();

0 comments on commit 2a97fa7

Please sign in to comment.