forked from elastic/elasticsearch
-
Notifications
You must be signed in to change notification settings - Fork 0
/
MlClassicTokenizer.java
120 lines (99 loc) · 4.38 KB
/
MlClassicTokenizer.java
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License;
* you may not use this file except in compliance with the Elastic License.
*/
package org.elasticsearch.xpack.ml.job.categorization;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import java.io.IOException;
/**
 * Java port of the classic ML categorization tokenizer, as implemented in the ML C++ code.
 * <p>
 * A token is a run of letters and digits which may also contain underscores, dots and dashes
 * after its first character.  Candidate tokens that consist only of hex digits, dots and dashes,
 * or that begin with a digit, are discarded.  Every discarded candidate is counted so that the
 * position increment of the next returned token (or the final position increment set by
 * {@link #end()}) accounts for it.
 * <p>
 * In common with the original ML C++ code, there are no configuration options.
 */
public class MlClassicTokenizer extends Tokenizer {

    /** Name under which this tokenizer is referred to. */
    public static final String NAME = "ml_classic";

    private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
    private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
    private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);

    /** Number of characters read from the input since the last {@link #reset()}. */
    private int nextOffset;

    /** Number of candidate tokens discarded since the last token was returned. */
    private int skippedPositions;

    MlClassicTokenizer() {
    }

    /**
     * Basically tokenise into [a-zA-Z0-9]+ strings, but also allowing underscores, dots and dashes in the middle.
     * Then discard tokens that are hex numbers or begin with a digit.
     *
     * @return {@code true} if a token was produced and the attributes describe it, {@code false} once the
     *         input is exhausted without a further valid token
     * @throws IOException if reading from the underlying input fails
     */
    @Override
    public final boolean incrementToken() throws IOException {
        clearAttributes();
        skippedPositions = 0;

        int start = -1;
        int length = 0;
        boolean haveNonHex = false;

        int curChar;
        while ((curChar = input.read()) >= 0) {
            ++nextOffset;
            if (Character.isLetterOrDigit(curChar) || (length > 0 && isInnerPunctuation(curChar))) {
                if (length == 0) {
                    // We're at the first character of a candidate token, so record the offset
                    start = nextOffset - 1;
                }
                termAtt.append((char) curChar);
                ++length;
                // We don't return tokens that are hex numbers, and it's most efficient to keep a running note of this.
                // Dots and dashes count as numeric; underscores do not.
                haveNonHex = haveNonHex || (Character.digit(curChar, 16) == -1 && curChar != '.' && curChar != '-');
            } else if (length > 0) {
                // If we get here, we've found a separator character having built up a candidate token
                if (isValidCandidate(haveNonHex)) {
                    break;
                }
                // The candidate token is not valid to return, i.e. it's hex or begins with a digit, so wipe it and
                // carry on searching.  The hex tracking flag must be wiped too, otherwise a rejected candidate such as
                // "1xyz" would make a following pure hex candidate such as "abc" look valid.
                ++skippedPositions;
                start = -1;
                length = 0;
                haveNonHex = false;
                termAtt.setEmpty();
            }
        }

        // We need to recheck whether we've got a valid token after the loop because
        // the loop can also be exited on reaching the end of the stream
        if (length == 0) {
            return false;
        }
        if (isValidCandidate(haveNonHex) == false) {
            ++skippedPositions;
            return false;
        }

        // Strip dots, dashes and underscores at the end of the token.  This cannot run off the front
        // of the token because a candidate always starts with a letter or digit.
        while (isInnerPunctuation(termAtt.charAt(length - 1))) {
            --length;
        }

        // Characters that may exist in the term attribute beyond its defined length are ignored
        termAtt.setLength(length);
        // Offsets are counted in the (possibly char-filtered) input, so map them back to the original text
        offsetAtt.setOffset(correctOffset(start), correctOffset(start + length));
        posIncrAtt.setPositionIncrement(skippedPositions + 1);
        return true;
    }

    /**
     * Records the final offset and folds any candidates discarded after the last returned token
     * into the final position increment.
     *
     * @throws IOException if skipping the remainder of the underlying input fails
     */
    @Override
    public final void end() throws IOException {
        super.end();
        // Set final offset, mapped back to the original text in case char filters changed the input
        int finalOffset = correctOffset(nextOffset + (int) input.skip(Integer.MAX_VALUE));
        offsetAtt.setOffset(finalOffset, finalOffset);
        // Adjust any skipped tokens
        posIncrAtt.setPositionIncrement(posIncrAtt.getPositionIncrement() + skippedPositions);
    }

    /**
     * Clears the per-stream counters so the tokenizer can be used on a new input.
     *
     * @throws IOException if the superclass reset fails
     */
    @Override
    public void reset() throws IOException {
        super.reset();
        nextOffset = 0;
        skippedPositions = 0;
    }

    /** Returns whether the character is one of the punctuation characters permitted after the start of a token. */
    private static boolean isInnerPunctuation(int c) {
        return c == '_' || c == '.' || c == '-';
    }

    /** Returns whether the candidate currently held in the term attribute is neither a hex number nor begins with a digit. */
    private boolean isValidCandidate(boolean haveNonHex) {
        return haveNonHex && Character.isDigit(termAtt.charAt(0)) == false;
    }
}