Permalink
Browse files

[performance] ngram index: the method for computing string offsets by Unicode code points turned out to be a major bottleneck.

Replacing the code to avoid String.offsetByCodePoints improves indexing performance by at least a factor of 2.
  • Loading branch information...
1 parent 4b889ea commit 340364cdb8b3d4e917e47dda29bf144087997c50 @wolfgangmm wolfgangmm committed Jul 21, 2014
@@ -578,18 +578,18 @@ public StoredNode getReindexRoot(StoredNode node, NodePath path, boolean insert,
}
private void indexText(NodeId nodeId, QName qname, String text) {
- String[] ngram = tokenize(text);
- int len = ngram.length;
- for (int i = 0; i < len; i++) {
- int offset = text.offsetByCodePoints(0, i);
- QNameTerm key = new QNameTerm(qname, ngram[i]);
+ final String[] ngram = tokenize(text);
+ final int len = text.length();
+ for (int i = 0, j = 0, cp; i < len; i += Character.charCount(cp), j++) {
+ cp = text.codePointAt(i);
+ final QNameTerm key = new QNameTerm(qname, ngram[j]);
OccurrenceList list = ngrams.get(key);
if (list == null) {
list = new OccurrenceList();
- list.add(nodeId, offset);
+ list.add(nodeId, i);
ngrams.put(key, list);
} else {
- list.add(nodeId, offset);
+ list.add(nodeId, i);
}
}
}
@@ -136,6 +136,7 @@ public NGramSearch(XQueryContext context, FunctionSignature signature) {
@Override
public void setArguments(List<Expression> arguments) throws XPathException {
+ steps.clear();
Expression path = arguments.get(0);
steps.add(path);

0 comments on commit 340364c

Please sign in to comment.