diff --git a/jre_emul/JreEmulation.xcodeproj/project.pbxproj b/jre_emul/JreEmulation.xcodeproj/project.pbxproj index 37170ead70..f4b6f16bed 100644 --- a/jre_emul/JreEmulation.xcodeproj/project.pbxproj +++ b/jre_emul/JreEmulation.xcodeproj/project.pbxproj @@ -802,6 +802,8 @@ FA1DD7BD215418F300D0B800 /* TestCertUtils.m in Sources */ = {isa = PBXBuildFile; fileRef = FA1DD7BC215418F200D0B800 /* TestCertUtils.m */; }; FA1DD7BF2154190800D0B800 /* mySSLSession.m in Sources */ = {isa = PBXBuildFile; fileRef = FA1DD7BE2154190700D0B800 /* mySSLSession.m */; }; FA1DD7C1215419D400D0B800 /* TestSSLSocketPair.m in Sources */ = {isa = PBXBuildFile; fileRef = FA1DD7C0215419D400D0B800 /* TestSSLSocketPair.m */; }; + FA1F0236229749AF00AE9C73 /* BreakIteratorTest.m in Sources */ = {isa = PBXBuildFile; fileRef = FA1F02352297499F00AE9C73 /* BreakIteratorTest.m */; }; + FA1F0238229749FC00AE9C73 /* BreakIteratorTest.m in Sources */ = {isa = PBXBuildFile; fileRef = FA1F0237229749FB00AE9C73 /* BreakIteratorTest.m */; }; FA2EECB521EFB55000B88A06 /* CompletableFutureTest.m in Sources */ = {isa = PBXBuildFile; fileRef = FA2EECB421EFB54F00B88A06 /* CompletableFutureTest.m */; }; FA3ACD812229A27B00ECCC8F /* IosHttpURLConnectionTest.m in Sources */ = {isa = PBXBuildFile; fileRef = FA3ACD802229A27A00ECCC8F /* IosHttpURLConnectionTest.m */; }; FA40676421C07E6100988DFD /* MethodTest.m in Sources */ = {isa = PBXBuildFile; fileRef = FA40675021C07E5E00988DFD /* MethodTest.m */; }; @@ -3974,6 +3976,12 @@ FA1DD7BC215418F200D0B800 /* TestCertUtils.m */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.objc; name = TestCertUtils.m; path = org/apache/harmony/security/tests/support/TestCertUtils.m; sourceTree = ""; }; FA1DD7BE2154190700D0B800 /* mySSLSession.m */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.objc; name = mySSLSession.m; path = org/apache/harmony/xnet/tests/support/mySSLSession.m; sourceTree = ""; }; FA1DD7C0215419D400D0B800 /* 
TestSSLSocketPair.m */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.objc; name = TestSSLSocketPair.m; path = libcore/javax/net/ssl/TestSSLSocketPair.m; sourceTree = ""; }; + FA1F0231229748EB00AE9C73 /* BreakIterator.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; name = BreakIterator.h; path = build_result/Classes/java/text/BreakIterator.h; sourceTree = ""; }; + FA1F0232229748EC00AE9C73 /* IcuIteratorWrapper.m */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.objc; name = IcuIteratorWrapper.m; path = build_result/Classes/java/text/IcuIteratorWrapper.m; sourceTree = ""; }; + FA1F0233229748EC00AE9C73 /* BreakIterator.m */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.objc; name = BreakIterator.m; path = build_result/Classes/java/text/BreakIterator.m; sourceTree = ""; }; + FA1F0234229748EC00AE9C73 /* IcuIteratorWrapper.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; name = IcuIteratorWrapper.h; path = build_result/Classes/java/text/IcuIteratorWrapper.h; sourceTree = ""; }; + FA1F02352297499F00AE9C73 /* BreakIteratorTest.m */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.objc; name = BreakIteratorTest.m; path = libcore/java/text/BreakIteratorTest.m; sourceTree = ""; }; + FA1F0237229749FB00AE9C73 /* BreakIteratorTest.m */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.objc; name = BreakIteratorTest.m; path = org/apache/harmony/tests/java/text/BreakIteratorTest.m; sourceTree = ""; }; FA2EECB421EFB54F00B88A06 /* CompletableFutureTest.m */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.objc; name = CompletableFutureTest.m; path = jsr166/CompletableFutureTest.m; sourceTree = ""; }; FA3ACD802229A27A00ECCC8F /* IosHttpURLConnectionTest.m */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.objc; name = IosHttpURLConnectionTest.m; path = 
com/google/j2objc/net/IosHttpURLConnectionTest.m; sourceTree = ""; }; FA3ACD822229A94300ECCC8F /* CompletableFuture.m */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.objc; name = CompletableFuture.m; path = build_result/Classes/java/util/concurrent/CompletableFuture.m; sourceTree = ""; }; @@ -4320,8 +4328,6 @@ 067219221559B9D200645E1B /* Transpiled Classes */ = { isa = PBXGroup; children = ( - FA3ACD832229A94300ECCC8F /* CompletableFuture.h */, - FA3ACD822229A94300ECCC8F /* CompletableFuture.m */, 06721AC51559CB4200645E1B /* AbstractCollection.h */, 06721A291559C91600645E1B /* AbstractCollection.m */, 06F290BD18972DF800676F1D /* AbstractExecutorService.h */, @@ -4512,6 +4518,8 @@ 06F290C418972DF800676F1D /* BlockingQueue.m */, 06721B341559CC0300645E1B /* Boolean.h */, 067219D11559C87100645E1B /* Boolean.m */, + FA1F0231229748EB00AE9C73 /* BreakIterator.h */, + FA1F0233229748EC00AE9C73 /* BreakIterator.m */, 06F290C518972DF800676F1D /* BrokenBarrierException.h */, 06F290C618972DF800676F1D /* BrokenBarrierException.m */, 06DD329716A13FA000AD7ACE /* Buffer.h */, @@ -4706,6 +4714,8 @@ 06F2908218972DA200676F1D /* Comparator.m */, 0671E4E421BDCEF4002E0681 /* Comparators.h */, 0671E4CD21BDCEF0002E0681 /* Comparators.m */, + FA3ACD832229A94300ECCC8F /* CompletableFuture.h */, + FA3ACD822229A94300ECCC8F /* CompletableFuture.m */, 06F290CB18972DF800676F1D /* CompletionService.h */, 06F290CC18972DF800676F1D /* CompletionService.m */, 06721ADF1559CB9200645E1B /* ConcurrentHashMap.h */, @@ -5137,6 +5147,8 @@ FA6C098D2187C1B6006C646A /* icu */, 06ED3DE217D1666A0014750E /* ICU.h */, 06ED3DE317D1666A0014750E /* ICU.m */, + FA1F0234229748EC00AE9C73 /* IcuIteratorWrapper.h */, + FA1F0232229748EC00AE9C73 /* IcuIteratorWrapper.m */, 06721AE91559CB9200645E1B /* IdentityHashMap.h */, 06721A4F1559C99E00645E1B /* IdentityHashMap.m */, 0671E52821BDEE4F002E0681 /* IDN.m */, @@ -6529,6 +6541,8 @@ 067B13141EEDC3DC0013CBAF /* BlockingQueueTest.m */, FA5C0C7E21BFFE67000B6275 /* 
BooleanTest.m */, 062028931D512E78001EE455 /* BoundedGenericMethodsTests.m */, + FA1F02352297499F00AE9C73 /* BreakIteratorTest.m */, + FA1F0237229749FB00AE9C73 /* BreakIteratorTest.m */, 063A62CC1F67267C001DC971 /* BrokenInputStream.m */, 0620267F1D512D08001EE455 /* BufferTest.m */, FA5C0C6D21BFFE64000B6275 /* ByteTest.m */, @@ -8036,6 +8050,7 @@ 062029441D512F25001EE455 /* InvalidParameterSpecExceptionTest.m in Sources */, 062028AC1D512E78001EE455 /* AccessibleObjectTest.m in Sources */, FA5C0CAA21BFFE6B000B6275 /* ErrorTest.m in Sources */, + FA1F0238229749FC00AE9C73 /* BreakIteratorTest.m in Sources */, 062026A01D512D08001EE455 /* CookiesTest.m in Sources */, 063A62E81F67269E001DC971 /* DefaultHandler2Test.m in Sources */, 062026BF1D512D08001EE455 /* DecimalFormatTest.m in Sources */, @@ -8291,6 +8306,7 @@ 062B4F291ED76C1200D4E7D8 /* OldJarFileTest.m in Sources */, 062024F41D512AA2001EE455 /* AllJreTests.m in Sources */, 062026A21D512D08001EE455 /* InetAddressTest.m in Sources */, + FA1F0236229749AF00AE9C73 /* BreakIteratorTest.m in Sources */, 062028AE1D512E78001EE455 /* BoundedGenericMethodsTests.m in Sources */, 062029411D512F25001EE455 /* ECPublicKeySpecTest.m in Sources */, FA5C0CC521BFFFAA000B6275 /* SpliteratorTester.m in Sources */, diff --git a/jre_emul/android/platform/external/icu/android_icu4j/src/main/java/android/icu/impl/CSCharacterIterator.java b/jre_emul/android/platform/external/icu/android_icu4j/src/main/java/android/icu/impl/CSCharacterIterator.java new file mode 100644 index 0000000000..4b234dbd97 --- /dev/null +++ b/jre_emul/android/platform/external/icu/android_icu4j/src/main/java/android/icu/impl/CSCharacterIterator.java @@ -0,0 +1,108 @@ +/* GENERATED SOURCE. DO NOT MODIFY. */ +// © 2017 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html#License +package android.icu.impl; + +import java.text.CharacterIterator; + +/** + * Implement the Java CharacterIterator interface on a CharSequence. 
+ * Intended for internal use by ICU only. + * @hide Only a subset of ICU is exposed in Android + */ +public class CSCharacterIterator implements CharacterIterator { + + private int index; + private CharSequence seq; + + + /** + * Constructor. + * @param text The CharSequence to iterate over. + */ + public CSCharacterIterator(CharSequence text) { + if (text == null) { + throw new NullPointerException(); + } + seq = text; + index = 0; + } + + /** @{inheritDoc} */ + @Override + public char first() { + index = 0; + return current(); + } + + /** @{inheritDoc} */ + @Override + public char last() { + index = seq.length(); + return previous(); + } + + /** @{inheritDoc} */ + @Override + public char current() { + if (index == seq.length()) { + return DONE; + } + return seq.charAt(index); + } + + /** @{inheritDoc} */ + @Override + public char next() { + if (index < seq.length()) { + ++index; + } + return current(); + } + + /** @{inheritDoc} */ + @Override + public char previous() { + if (index == 0) { + return DONE; + } + --index; + return current(); + } + + /** @{inheritDoc} */ + @Override + public char setIndex(int position) { + if (position < 0 || position > seq.length()) { + throw new IllegalArgumentException(); + } + index = position; + return current(); + } + + /** @{inheritDoc} */ + @Override + public int getBeginIndex() { + return 0; + } + + /** @{inheritDoc} */ + @Override + public int getEndIndex() { + return seq.length(); + } + + /** @{inheritDoc} */ + @Override + public int getIndex() { + return index; + } + + /** @{inheritDoc} */ + @Override + public Object clone() { + CSCharacterIterator copy = new CSCharacterIterator(seq); + copy.setIndex(index); + return copy; + } +} diff --git a/jre_emul/android/platform/external/icu/android_icu4j/src/main/java/android/icu/impl/SimpleFilteredSentenceBreakIterator.java b/jre_emul/android/platform/external/icu/android_icu4j/src/main/java/android/icu/impl/SimpleFilteredSentenceBreakIterator.java index 8ed7d8724b..8521f03a76 
100644 --- a/jre_emul/android/platform/external/icu/android_icu4j/src/main/java/android/icu/impl/SimpleFilteredSentenceBreakIterator.java +++ b/jre_emul/android/platform/external/icu/android_icu4j/src/main/java/android/icu/impl/SimpleFilteredSentenceBreakIterator.java @@ -11,6 +11,7 @@ import java.text.CharacterIterator; import java.util.HashSet; +import java.util.Locale; import android.icu.impl.ICUResourceBundle.OpenType; import android.icu.text.BreakIterator; @@ -59,7 +60,7 @@ private final void resetState() { /** * Is there an exception at this point? * - * @param n + * @param n the location of the possible break * @return */ private final boolean breakExceptionAt(int n) { @@ -73,6 +74,8 @@ private final boolean breakExceptionAt(int n) { backwardsTrie.reset(); int uch; + + // Assume a space is following the '.' (so we handle the case: "Mr. /Brown") if ((uch = text.previousCodePoint()) == ' ') { // TODO: skip a class of chars here?? // TODO only do this the 1st time? @@ -272,13 +275,16 @@ public static class Builder extends FilteredBreakIteratorBuilder { /** * filter set to store all exceptions */ - private HashSet<String> filterSet = new HashSet<String>(); + private HashSet<CharSequence> filterSet = new HashSet<CharSequence>(); static final int PARTIAL = (1 << 0); // < partial - need to run through forward trie static final int MATCH = (1 << 1); // < exact match - skip this one. 
static final int SuppressInReverse = (1 << 0); static final int AddToForward = (1 << 1); + public Builder(Locale loc) { + this(ULocale.forLocale(loc)); + } /** * Create SimpleFilteredBreakIteratorBuilder using given locale * @param loc the locale to get filtered iterators @@ -302,28 +308,20 @@ public Builder(ULocale loc) { * Create SimpleFilteredBreakIteratorBuilder with no exception */ public Builder() { - filterSet = new HashSet(); } @Override - public boolean suppressBreakAfter(String str) { - if (filterSet == null) { - filterSet = new HashSet(); - } + public boolean suppressBreakAfter(CharSequence str) { return filterSet.add(str); } @Override - public boolean unsuppressBreakAfter(String str) { - if (filterSet == null) { - return false; - } else { - return filterSet.remove(str); - } + public boolean unsuppressBreakAfter(CharSequence str) { + return filterSet.remove(str); } @Override - public BreakIterator build(BreakIterator adoptBreakIterator) { + public BreakIterator wrapIteratorWithFilter(BreakIterator adoptBreakIterator) { if( filterSet.isEmpty() ) { // Short circuit - nothing to except. return adoptBreakIterator; @@ -336,29 +334,30 @@ public BreakIterator build(BreakIterator adoptBreakIterator) { int fwdCount = 0; int subCount = filterSet.size(); - String[] ustrs = new String[subCount]; + CharSequence[] ustrs = new CharSequence[subCount]; int[] partials = new int[subCount]; CharsTrie backwardsTrie = null; // i.e. ".srM" for Mrs. CharsTrie forwardsPartialTrie = null; // Has ".a" for "a.M." int i = 0; - for (String s : filterSet) { + for (CharSequence s : filterSet) { ustrs[i] = s; // copy by value? partials[i] = 0; // default: no partial i++; } for (i = 0; i < subCount; i++) { - int nn = ustrs[i].indexOf('.'); // TODO: non-'.' abbreviations - if (nn > -1 && (nn + 1) != ustrs[i].length()) { + String thisStr = ustrs[i].toString(); // TODO: don't cast to String? + int nn = thisStr.indexOf('.'); // TODO: non-'.' 
abbreviations + if (nn > -1 && (nn + 1) != thisStr.length()) { // is partial. // is it unique? int sameAs = -1; for (int j = 0; j < subCount; j++) { if (j == i) continue; - if (ustrs[i].regionMatches(0, ustrs[j], 0, nn + 1)) { + if (thisStr.regionMatches(0, ustrs[j].toString() /* TODO */, 0, nn + 1)) { if (partials[j] == 0) { // hasn't been processed yet partials[j] = SuppressInReverse | AddToForward; } else if ((partials[j] & SuppressInReverse) != 0) { @@ -368,7 +367,7 @@ public BreakIterator build(BreakIterator adoptBreakIterator) { } if ((sameAs == -1) && (partials[i] == 0)) { - StringBuilder prefix = new StringBuilder(ustrs[i].substring(0, nn + 1)); + StringBuilder prefix = new StringBuilder(thisStr.substring(0, nn + 1)); // first one - add the prefix to the reverse table. prefix.reverse(); builder.add(prefix, PARTIAL); @@ -379,8 +378,9 @@ public BreakIterator build(BreakIterator adoptBreakIterator) { } for (i = 0; i < subCount; i++) { + final String thisStr = ustrs[i].toString(); // TODO if (partials[i] == 0) { - StringBuilder reversed = new StringBuilder(ustrs[i]).reverse(); + StringBuilder reversed = new StringBuilder(thisStr).reverse(); builder.add(reversed, MATCH); revCount++; } else { @@ -389,7 +389,7 @@ public BreakIterator build(BreakIterator adoptBreakIterator) { // forward, // instead of "Ph.D." since we already know the "Ph." part is a match. // would need the trie to be able to hold 0-length strings, though. 
- builder2.add(ustrs[i], MATCH); // forward + builder2.add(thisStr, MATCH); // forward fwdCount++; } } diff --git a/jre_emul/android/platform/external/icu/android_icu4j/src/main/java/android/icu/text/BreakIterator.java b/jre_emul/android/platform/external/icu/android_icu4j/src/main/java/android/icu/text/BreakIterator.java index 691593450f..8a3c9db634 100644 --- a/jre_emul/android/platform/external/icu/android_icu4j/src/main/java/android/icu/text/BreakIterator.java +++ b/jre_emul/android/platform/external/icu/android_icu4j/src/main/java/android/icu/text/BreakIterator.java @@ -15,6 +15,7 @@ import java.util.Locale; import java.util.MissingResourceException; +import android.icu.impl.CSCharacterIterator; import android.icu.impl.CacheValue; import android.icu.impl.ICUDebug; import android.icu.util.ICUCloneNotSupportedException; @@ -81,7 +82,10 @@ * * BreakIterator accesses the text it analyzes through a CharacterIterator, which makes * it possible to use BreakIterator to analyze text in any text-storage vehicle that - * provides a CharacterIterator interface. + * provides a CharacterIterator interface. When BreakIterator.setText(CharacterIterator) or + * getText() was called, the CharacterIterator must not be modified, or else the BreakIterator + * behavior is undefined. In particular, call BreakIterator.setText(), + * not CharacterIterator.setText(). * * Note: Some types of BreakIterator can take a long time to create, and * instances of BreakIterator are not currently cached by the system. 
For @@ -181,30 +185,28 @@ * public static int nextWordStartAfter(int pos, String text) { * BreakIterator wb = BreakIterator.getWordInstance(); * wb.setText(text); - * int last = wb.following(pos); - * int current = wb.next(); - * while (current != BreakIterator.DONE) { - * for (int p = last; p < current; p++) { - * if (Character.isLetter(text.charAt(p))) - * return last; + * int wordStart = wb.following(pos); + * for (;;) { + * int wordLimit = wb.next(); + * if (wordLimit == BreakIterator.DONE) { + * return BreakIterator.DONE; * } - * last = current; - * current = wb.next(); - * } - * return BreakIterator.DONE; + * int wordStatus = wb.getRuleStatus(); + * if (wordStatus != BreakIterator.WORD_NONE) { + * return wordStart; + * } + * wordStart = wordLimit; + * } * } * - * (The iterator returned by BreakIterator.getWordInstance() is unique in that + * The iterator returned by {@link #getWordInstance} is unique in that * the break positions it returns don't represent both the start and end of the * thing being iterated over. That is, a sentence-break iterator returns breaks * that each represent the end of one sentence and the beginning of the next. * With the word-break iterator, the characters between two boundaries might be a * word, or they might be the punctuation or whitespace between two words. The - * above code uses a simple heuristic to determine which boundary is the beginning - * of a word: If the characters between this boundary and the next boundary - * include at least one letter (this can be an alphabetical letter, a CJK ideograph, - * a Hangul syllable, a Kana character, etc.), then the text between this boundary - * and the next is a word; otherwise, it's the material between words.) + * above code uses {@link #getRuleStatus} to identify and ignore boundaries associated + * with punctuation or other non-word characters. 
* * * @see CharacterIterator @@ -488,6 +490,19 @@ public void setText(String newText) setText(new StringCharacterIterator(newText)); } + /** + * Sets the iterator to analyze a new piece of text. The new + * piece of text is passed in as a CharSequence, and the current + * iteration position is reset to the beginning of the text. + * (The old text is dropped.) + * @param newText A CharSequence containing the text to analyze with + * this BreakIterator. + * @hide draft / provisional / internal are hidden on Android + */ + public void setText(CharSequence newText) { + setText(new CSCharacterIterator(newText)); + } + /** * Sets the iterator to analyze a new piece of text. The * BreakIterator is passed a CharacterIterator through which diff --git a/jre_emul/android/platform/external/icu/android_icu4j/src/main/java/android/icu/text/BreakIteratorFactory.java b/jre_emul/android/platform/external/icu/android_icu4j/src/main/java/android/icu/text/BreakIteratorFactory.java index 8e89dfcac5..c8a022e01b 100644 --- a/jre_emul/android/platform/external/icu/android_icu4j/src/main/java/android/icu/text/BreakIteratorFactory.java +++ b/jre_emul/android/platform/external/icu/android_icu4j/src/main/java/android/icu/text/BreakIteratorFactory.java @@ -169,7 +169,7 @@ private static BreakIterator createBreakInstance(ULocale locale, int kind) { final String ssKeyword = locale.getKeywordValue("ss"); if (ssKeyword != null && ssKeyword.equals("standard")) { final ULocale base = new ULocale(locale.getBaseName()); - return FilteredBreakIteratorBuilder.createInstance(base).build(iter); + return FilteredBreakIteratorBuilder.getInstance(base).wrapIteratorWithFilter(iter); } } diff --git a/jre_emul/android/platform/external/icu/android_icu4j/src/main/java/android/icu/text/DictionaryBreakEngine.java b/jre_emul/android/platform/external/icu/android_icu4j/src/main/java/android/icu/text/DictionaryBreakEngine.java index 463c112b59..9d7d2a5df5 100644 --- 
a/jre_emul/android/platform/external/icu/android_icu4j/src/main/java/android/icu/text/DictionaryBreakEngine.java +++ b/jre_emul/android/platform/external/icu/android_icu4j/src/main/java/android/icu/text/DictionaryBreakEngine.java @@ -90,11 +90,18 @@ public void markCurrent() { * For internal use only. * @hide draft / provisional / internal are hidden on Android */ - static class DequeI { + static class DequeI implements Cloneable { private int[] data = new int[50]; private int lastIdx = 4; // or base of stack. Index of element. private int firstIdx = 4; // or Top of Stack. Index of element + 1. + @Override + public Object clone() throws CloneNotSupportedException { + DequeI result = (DequeI)super.clone(); + result.data = data.clone(); + return result; + } + int size() { return firstIdx - lastIdx; } @@ -151,6 +158,15 @@ boolean contains(int v) { } return false; } + + int elementAt(int i) { + assert i < size(); + return data[lastIdx + i]; + } + + void removeAllElements() { + lastIdx = firstIdx = 4; + } } UnicodeSet fSet = new UnicodeSet(); @@ -174,8 +190,8 @@ public boolean handles(int c, int breakType) { @Override public int findBreaks(CharacterIterator text, int startPos, int endPos, - boolean reverse, int breakType, DequeI foundBreaks) { - int result = 0; + int breakType, DequeI foundBreaks) { + int result = 0; // Find the span of characters included in the set. // The span to break begins at the current position int the text, and @@ -186,24 +202,15 @@ public int findBreaks(CharacterIterator text, int startPos, int endPos, int rangeStart; int rangeEnd; int c = CharacterIteration.current32(text); - if (reverse) { - boolean isDict = fSet.contains(c); - while ((current = text.getIndex()) > startPos && isDict) { - c = CharacterIteration.previous32(text); - isDict = fSet.contains(c); - } - rangeStart = (current < startPos) ? startPos : - current + (isDict ? 
0 : 1); - rangeEnd = start + 1; - } else { - while ((current = text.getIndex()) < endPos && fSet.contains(c)) { - CharacterIteration.next32(text); - c = CharacterIteration.current32(text); - } - rangeStart = start; - rangeEnd = current; + while ((current = text.getIndex()) < endPos && fSet.contains(c)) { + CharacterIteration.next32(text); + c = CharacterIteration.current32(text); } + rangeStart = start; + rangeEnd = current; + // if (breakType >= 0 && breakType < 32 && (((uint32_t)1 << breakType) & fTypes)) { + // TODO: Why does icu4c have this? result = divideUpDictionaryRange(text, rangeStart, rangeEnd, foundBreaks); text.setIndex(current); diff --git a/jre_emul/android/platform/external/icu/android_icu4j/src/main/java/android/icu/text/FilteredBreakIteratorBuilder.java b/jre_emul/android/platform/external/icu/android_icu4j/src/main/java/android/icu/text/FilteredBreakIteratorBuilder.java index d6456c3749..6a6c09fcf8 100644 --- a/jre_emul/android/platform/external/icu/android_icu4j/src/main/java/android/icu/text/FilteredBreakIteratorBuilder.java +++ b/jre_emul/android/platform/external/icu/android_icu4j/src/main/java/android/icu/text/FilteredBreakIteratorBuilder.java @@ -9,6 +9,8 @@ */ package android.icu.text; +import java.util.Locale; + import android.icu.impl.SimpleFilteredSentenceBreakIterator; import android.icu.util.ULocale; @@ -20,30 +22,16 @@ * in the string "Mr. Smith" (resulting in two segments), * but with "Mr." as an exception, a filtered break iterator * would consider the string "Mr. Smith" to be a single segment. - * - *

<p><b>Note:</b> An instance of {@link BreakIterator} returned by this builder
- * class currently does not support following operations in this technology preview
- * version:
- * <ul>
- *   <li>{@link BreakIterator#next(int) next(int n)}</li>
- *   <li>{@link BreakIterator#previous() previous()}</li>
- *   <li>{@link BreakIterator#following(int) following(int offset)}</li>
- *   <li>{@link BreakIterator#preceding(int) preceding(int offset)}</li>
- * </ul>
- * When one of above methods is called, {@link UnsupportedOperationException} will be - * thrown. * - * @author tomzhang - * - * @deprecated This API might change or be removed in a future release. + * <p>
This class is not intended for public subclassing. + * * @hide Only a subset of ICU is exposed in Android * @hide draft / provisional / internal are hidden on Android */ -@Deprecated public abstract class FilteredBreakIteratorBuilder { /** - * Construct a FilteredBreakIteratorBuilder based on rules in a locale. + * Construct a FilteredBreakIteratorBuilder based on sentence break exception rules in a locale. * The rules are taken from CLDR exception data for the locale, * see http://www.unicode.org/reports/tr35/tr35-general.html#Segmentation_Exceptions * This is the equivalent of calling createInstance(UErrorCode&) @@ -51,26 +39,35 @@ public abstract class FilteredBreakIteratorBuilder { * of the CLDR exception data. * @param where the locale. * @return the new builder - * @deprecated This API might change or be removed in a future release. * @hide draft / provisional / internal are hidden on Android */ - @Deprecated - public static FilteredBreakIteratorBuilder createInstance(ULocale where) { - FilteredBreakIteratorBuilder ret = new SimpleFilteredSentenceBreakIterator.Builder(where); - return ret; + public static final FilteredBreakIteratorBuilder getInstance(Locale where) { + return new SimpleFilteredSentenceBreakIterator.Builder(where); + } + + /** + * Construct a FilteredBreakIteratorBuilder based on sentence break exception rules in a locale. + * The rules are taken from CLDR exception data for the locale, + * see http://www.unicode.org/reports/tr35/tr35-general.html#Segmentation_Exceptions + * This is the equivalent of calling createInstance(UErrorCode&) + * and then repeatedly calling addNoBreakAfter(...) with the contents + * of the CLDR exception data. + * @param where the locale. 
+ * @return the new builder + * @hide draft / provisional / internal are hidden on Android + */ + public static final FilteredBreakIteratorBuilder getInstance(ULocale where) { + return new SimpleFilteredSentenceBreakIterator.Builder(where); } /** * Construct an empty FilteredBreakIteratorBuilder. * In this state, it will not suppress any segment boundaries. * @return the new builder - * @deprecated This API might change or be removed in a future release. * @hide draft / provisional / internal are hidden on Android */ - @Deprecated - public static FilteredBreakIteratorBuilder createInstance() { - FilteredBreakIteratorBuilder ret = new SimpleFilteredSentenceBreakIterator.Builder(); - return ret; + public static final FilteredBreakIteratorBuilder getEmptyInstance() { + return new SimpleFilteredSentenceBreakIterator.Builder(); } /** @@ -78,13 +75,11 @@ public static FilteredBreakIteratorBuilder createInstance() { * For example, suppressing "Mr.", then segments ending in "Mr." will not be returned * by the iterator. * @param str the string to suppress, such as "Mr." - * @return returns true if the string was not present and now added, + * @return true if the string was not present and now added, * false if the call was a no-op because the string was already being suppressed. - * @deprecated This API might change or be removed in a future release. * @hide draft / provisional / internal are hidden on Android */ - @Deprecated - public abstract boolean suppressBreakAfter(String str); + public abstract boolean suppressBreakAfter(CharSequence str); /** * Stop suppressing a certain string from being the end of the segment. @@ -92,34 +87,29 @@ public static FilteredBreakIteratorBuilder createInstance() { * the effect of earlier calls to suppressBreakAfter, or to un-do the effect of * locale data which may be suppressing certain strings. * @param str the str the string to unsuppress, such as "Mr." 
- * @return returns true if the string was present and now removed, + * @return true if the string was present and now removed, * false if the call was a no-op because the string was not being suppressed. - * @deprecated This API might change or be removed in a future release. * @hide draft / provisional / internal are hidden on Android */ - @Deprecated - public abstract boolean unsuppressBreakAfter(String str); + public abstract boolean unsuppressBreakAfter(CharSequence str); /** * Wrap (adopt) an existing break iterator in a new filtered instance. - * The resulting BreakIterator is owned by the caller. - * The BreakIteratorFilter may be destroyed before the BreakIterator is destroyed. - * Note that the adoptBreakIterator is adopted by the new BreakIterator + * Note that the wrappedBreakIterator is adopted by the new BreakIterator * and should no longer be used by the caller. * The FilteredBreakIteratorBuilder may be reused. - * @param adoptBreakIterator the break iterator to adopt - * @return the new BreakIterator, owned by the caller. - * @deprecated This API might change or be removed in a future release. + * @param wrappedBreakIterator the break iterator to wrap + * @return the new BreakIterator * @hide draft / provisional / internal are hidden on Android */ - @Deprecated - public abstract BreakIterator build(BreakIterator adoptBreakIterator); + public abstract BreakIterator wrapIteratorWithFilter(BreakIterator wrappedBreakIterator); /** * For subclass use - * @deprecated This API might change or be removed in a future release. 
+ * @deprecated internal to ICU * @hide draft / provisional / internal are hidden on Android */ @Deprecated - protected FilteredBreakIteratorBuilder() {} + protected FilteredBreakIteratorBuilder() { + } } \ No newline at end of file diff --git a/jre_emul/android/platform/external/icu/android_icu4j/src/main/java/android/icu/text/LanguageBreakEngine.java b/jre_emul/android/platform/external/icu/android_icu4j/src/main/java/android/icu/text/LanguageBreakEngine.java index affe098acb..20e7391a81 100644 --- a/jre_emul/android/platform/external/icu/android_icu4j/src/main/java/android/icu/text/LanguageBreakEngine.java +++ b/jre_emul/android/platform/external/icu/android_icu4j/src/main/java/android/icu/text/LanguageBreakEngine.java @@ -12,7 +12,7 @@ import java.text.CharacterIterator; /** - * The LanguageBreakEngine interface is to be used to implement any + * The LanguageBreakEngine interface is to be used to implement any * language-specific logic for break iteration. */ interface LanguageBreakEngine { @@ -25,21 +25,20 @@ interface LanguageBreakEngine { boolean handles(int c, int breakType); /** - * Implements the actual breaking logic. - * @param text The text to break over - * @param startPos The index of the beginning of our range + * Implements the actual breaking logic. Find any breaks within a run in the supplied text. + * @param text The text to break over. The iterator is left at + * the end of the run of characters which the engine has handled. + * @param startPos The index of the beginning of the range * @param endPos The index of the possible end of our range. 
It is possible, - * however, that our range ends earlier - * @param reverse true iff we are iterating backwards (in a call to - * previous(), for example) + * however, that the range ends earlier * @param breakType The kind of break iterator that is wanting to make use * of this engine - character, word, line, sentence - * @param foundBreaks A Stack that the breaks found will be added to - * @return the number of words found + * @param foundBreaks A data structure to receive the break positions. + * @return the number of breaks found */ int findBreaks(CharacterIterator text, int startPos, int endPos, - boolean reverse, int breakType, DictionaryBreakEngine.DequeI foundBreaks); + int breakType, DictionaryBreakEngine.DequeI foundBreaks); } - - - + + + diff --git a/jre_emul/android/platform/external/icu/android_icu4j/src/main/java/android/icu/text/RBBIDataWrapper.java b/jre_emul/android/platform/external/icu/android_icu4j/src/main/java/android/icu/text/RBBIDataWrapper.java index 4fa14a58ae..dcdde54d74 100644 --- a/jre_emul/android/platform/external/icu/android_icu4j/src/main/java/android/icu/text/RBBIDataWrapper.java +++ b/jre_emul/android/platform/external/icu/android_icu4j/src/main/java/android/icu/text/RBBIDataWrapper.java @@ -14,10 +14,9 @@ import java.nio.ByteBuffer; import java.nio.ByteOrder; -import android.icu.impl.CharTrie; import android.icu.impl.ICUBinary; import android.icu.impl.ICUBinary.Authenticate; -import android.icu.impl.Trie; +import android.icu.impl.Trie2; /** *

<p>Internal class used for Rule Based Break Iterators</p>

@@ -34,20 +33,20 @@ final class RBBIDataWrapper { short fRTable[]; short fSFTable[]; short fSRTable[]; - CharTrie fTrie; + Trie2 fTrie; String fRuleSource; int fStatusTable[]; private boolean isBigEndian; - static final int DATA_FORMAT = 0x42726b20; // "Brk " - static final int FORMAT_VERSION = 0x03010000; // 3.1 + static final int DATA_FORMAT = 0x42726b20; // "Brk " + static final int FORMAT_VERSION = 0x04000000; // 4.0.0.0 private static final class IsAcceptable implements Authenticate { - // @Override when we switch to Java 6 @Override public boolean isDataVersionAcceptable(byte version[]) { - return version[0] == (FORMAT_VERSION >>> 24); + int intVersion = (version[0] << 24) + (version[1] << 16) + (version[2] << 8) + version[3]; + return intVersion == FORMAT_VERSION; } } private static final IsAcceptable IS_ACCEPTABLE = new IsAcceptable(); @@ -106,7 +105,6 @@ public boolean isDataVersionAcceptable(byte version[]) { */ final static class RBBIDataHeader { int fMagic; // == 0xbla0 - int fVersion; // == 1 (for ICU 3.2 and earlier. byte[] fFormatVersion; // For ICU 3.4 and later. int fLength; // Total length in bytes of this RBBI Data, // including all sections, not just the header. @@ -148,19 +146,6 @@ int getRowIndex(int state){ return ROW_DATA + state * (fHeader.fCatCount + 4); } - static class TrieFoldingFunc implements Trie.DataManipulate { - @Override - public int getFoldingOffset(int data) { - if ((data & 0x8000) != 0) { - return data & 0x7fff; - } else { - return 0; - } - } - } - static TrieFoldingFunc fTrieFoldingFunc = new TrieFoldingFunc(); - - RBBIDataWrapper() { } @@ -177,10 +162,6 @@ static RBBIDataWrapper get(ByteBuffer bytes) throws IOException { // Read in the RBBI data header... This.fHeader = new RBBIDataHeader(); This.fHeader.fMagic = bytes.getInt(); - // Read the same 4 bytes as an int and as a byte array: The data format could be - // the old fVersion=1 (TODO: probably not with a real ICU data header?) - // or the new fFormatVersion=3.x. 
- This.fHeader.fVersion = bytes.getInt(bytes.position()); This.fHeader.fFormatVersion[0] = bytes.get(); This.fHeader.fFormatVersion[1] = bytes.get(); This.fHeader.fFormatVersion[2] = bytes.get(); @@ -204,10 +185,7 @@ static RBBIDataWrapper get(ByteBuffer bytes) throws IOException { ICUBinary.skipBytes(bytes, 6 * 4); // uint32_t fReserved[6]; - if (This.fHeader.fMagic != 0xb1a0 || - ! (This.fHeader.fVersion == 1 || // ICU 3.2 and earlier - This.fHeader.fFormatVersion[0] == 3) // ICU 3.4 - ) { + if (This.fHeader.fMagic != 0xb1a0 || !IS_ACCEPTABLE.isDataVersionAcceptable(This.fHeader.fFormatVersion)) { throw new IOException("Break Iterator Rule Data Magic Number Incorrect, or unsupported data version."); } @@ -272,6 +250,15 @@ static RBBIDataWrapper get(ByteBuffer bytes) throws IOException { pos += This.fHeader.fSRTableLen; } + // Rule Compatibility Hacks + // If a rule set includes reverse rules but does not explicitly include safe reverse rules, + // the reverse rules are to be treated as safe reverse rules. + + if (This.fSRTable == null && This.fRTable != null) { + This.fSRTable = This.fRTable; + This.fRTable = null; + } + // // Unserialize the Character categories TRIE // Because we can't be absolutely certain where the Trie deserialize will @@ -287,7 +274,7 @@ static RBBIDataWrapper get(ByteBuffer bytes) throws IOException { // as we don't go more than 100 bytes past the // past the end of the TRIE. - This.fTrie = new CharTrie(bytes, fTrieFoldingFunc); // Deserialize the TRIE, leaving buffer + This.fTrie = Trie2.createFromSerialized(bytes); // Deserialize the TRIE, leaving buffer // at an unknown position, preceding the // padding between TRIE and following section. @@ -393,7 +380,7 @@ static public String intToHexString(int n, int width) { ///CLOVER:OFF /** Dump a state table. (A full set of RBBI rules has 4 state tables.) 
*/ private void dumpTable(java.io.PrintStream out, short table[]) { - if (table == null) { + if (table == null || table.length == 0) { out.println(" -- null -- "); } else { int n; @@ -462,7 +449,7 @@ private void dumpCharCategories(java.io.PrintStream out) { out.println("\nCharacter Categories"); out.println("--------------------"); for (char32 = 0; char32<=0x10ffff; char32++) { - category = fTrie.getCodePointValue(char32); + category = fTrie.get(char32); category &= ~0x4000; // Mask off dictionary bit. if (category < 0 || category > fHeader.fCatCount) { out.println("Error, bad category " + Integer.toHexString(category) + diff --git a/jre_emul/android/platform/external/icu/android_icu4j/src/main/java/android/icu/text/RuleBasedBreakIterator.java b/jre_emul/android/platform/external/icu/android_icu4j/src/main/java/android/icu/text/RuleBasedBreakIterator.java index 46d2294bf2..76c03f369e 100644 --- a/jre_emul/android/platform/external/icu/android_icu4j/src/main/java/android/icu/text/RuleBasedBreakIterator.java +++ b/jre_emul/android/platform/external/icu/android_icu4j/src/main/java/android/icu/text/RuleBasedBreakIterator.java @@ -15,19 +15,21 @@ import static android.icu.impl.CharacterIteration.nextTrail32; import static android.icu.impl.CharacterIteration.previous32; +import com.google.j2objc.annotations.WeakOuter; + import java.io.ByteArrayOutputStream; import java.io.IOException; import java.io.InputStream; import java.io.OutputStream; import java.nio.ByteBuffer; import java.text.CharacterIterator; -import java.util.concurrent.ConcurrentHashMap; +import java.util.ArrayList; +import java.util.List; -import android.icu.impl.Assert; -import android.icu.impl.CharTrie; import android.icu.impl.CharacterIteration; import android.icu.impl.ICUBinary; import android.icu.impl.ICUDebug; +import android.icu.impl.Trie2; import android.icu.lang.UCharacter; import android.icu.lang.UProperty; import android.icu.lang.UScript; @@ -47,9 +49,10 @@ public class RuleBasedBreakIterator 
extends BreakIterator { * private constructor */ private RuleBasedBreakIterator() { - fLastStatusIndexValid = true; fDictionaryCharCount = 0; - fBreakEngines.put(-1, fUnhandledBreakEngine); + synchronized(gAllBreakEngines) { + fBreakEngines = new ArrayList(gAllBreakEngines); + } } /** @@ -126,15 +129,22 @@ public RuleBasedBreakIterator(String rules) { * behavior as this one. */ @Override - public Object clone() - { - RuleBasedBreakIterator result = (RuleBasedBreakIterator)super.clone(); + public Object clone() { + RuleBasedBreakIterator result; + result = (RuleBasedBreakIterator)super.clone(); if (fText != null) { result.fText = (CharacterIterator)(fText.clone()); } + synchronized (gAllBreakEngines) { + result.fBreakEngines = new ArrayList(gAllBreakEngines); + } + result.fLookAheadMatches = new LookAheadResults(); + result.fBreakCache = result.new BreakCache(fBreakCache); + result.fDictionaryCache = result.new DictionaryCache(fDictionaryCache); return result; } + /** * Returns true if both BreakIterators are of the same class, have the same * rules, and iterate over the same text. @@ -159,10 +169,10 @@ public boolean equals(Object that) { if (fText == null && other.fText == null) { return true; } - if (fText == null || other.fText == null) { + if (fText == null || other.fText == null || !fText.equals(other.fText)) { return false; } - return fText.equals(other.fText); + return fPosition == other.fPosition; } catch(ClassCastException e) { return false; @@ -213,17 +223,32 @@ public int hashCode() */ RBBIDataWrapper fRData; - /* + /** + * The iteration state - current position, rule status for the current position, + * and whether the iterator ran off the end, yielding UBRK_DONE. + * Current position is pinned to be 0 < position <= text.length. + * Current position is always set to a boundary. + * + * The current position of the iterator. Pinned, 0 < fPosition <= text.length. + * Never has the value UBRK_DONE (-1). 
+ */ + private int fPosition; + + /** * Index of the Rule {tag} values for the most recent match. */ - private int fLastRuleStatusIndex; + private int fRuleStatusIndex; - /* - * Rule tag value valid flag. - * Some iterator operations don't intrinsically set the correct tag value. - * This flag lets us lazily compute the value if we are ever asked for it. + /** + * True when iteration has run off the end, and iterator functions should return UBRK_DONE. */ - private boolean fLastStatusIndexValid; + private boolean fDone; + + /** + * Cache of previously determined boundary positions. + */ + private BreakCache fBreakCache = new BreakCache(); + /** * Counter for the number of characters encountered with the "dictionary" @@ -234,6 +259,8 @@ public int hashCode() */ private int fDictionaryCharCount; + private DictionaryCache fDictionaryCache = new DictionaryCache(); + /* * ICU debug argument name for RBBI */ @@ -246,46 +273,43 @@ public int hashCode() && ICUDebug.value(RBBI_DEBUG_ARG).indexOf("trace") >= 0; /** - * What kind of break iterator this is. Set to KIND_LINE by default, - * since this produces sensible output. + * What kind of break iterator this is. + * Defaulting BreakType to word gives reasonable dictionary behavior for + * Break Iterators that are built from rules. */ - private int fBreakType = KIND_LINE; + private int fBreakType = KIND_WORD; /** * The "default" break engine - just skips over ranges of dictionary words, * producing no breaks. Should only be used if characters need to be handled * by a dictionary but we have no dictionary implementation for them. + * + * Only one instance; shared by all break iterators. 
*/ - private final UnhandledBreakEngine fUnhandledBreakEngine = new UnhandledBreakEngine(); - - /** - * when a range of characters is divided up using the dictionary, the break - * positions that are discovered are stored here, preventing us from having - * to use either the dictionary or the state table again until the iterator - * leaves this range of text - */ - private int[] fCachedBreakPositions; + private static final UnhandledBreakEngine gUnhandledBreakEngine; /** - * if fCachedBreakPositions is not null, this indicates which item in the - * cache the current iteration position refers to + * List of all known break engines, common for all break iterators. + * Lazily updated as break engines are needed, because instantiation of + * break engines is expensive. + * + * Because gAllBreakEngines can be referenced concurrently from different + * BreakIterator instances, all access is synchronized. */ - private int fPositionInCache; + private static final List gAllBreakEngines; + static { + gUnhandledBreakEngine = new UnhandledBreakEngine(); + gAllBreakEngines = new ArrayList(); + gAllBreakEngines.add(gUnhandledBreakEngine); + } - private final ConcurrentHashMap fBreakEngines = - new ConcurrentHashMap(); /** - * Dumps caches and performs other actions associated with a complete change - * in text or iteration position. + * List of all known break engines. Similar to gAllBreakEngines, but local to a + * break iterator, allowing it to be used without synchronization. */ - private void reset() { - fCachedBreakPositions = null; - // fNumCachedBreakPositions = 0; - fDictionaryCharCount = 0; - fPositionInCache = 0; + private List fBreakEngines; - } /** * Dump the contents of the state table and character classes for this break iterator. * For debugging only. 
@@ -329,16 +353,17 @@ public static void compileRules(String rules, OutputStream ruleBinary) throws IO */ @Override public int first() { - fCachedBreakPositions = null; - fDictionaryCharCount = 0; - fPositionInCache = 0; - fLastRuleStatusIndex = 0; - fLastStatusIndexValid = true; if (fText == null) { return BreakIterator.DONE; } fText.first(); - return fText.getIndex(); + int start = fText.getIndex(); + if (!fBreakCache.seek(start)) { + fBreakCache.populateNear(start); + } + fBreakCache.current(); + assert(fPosition == start); + return fPosition; } /** @@ -348,24 +373,16 @@ public int first() { */ @Override public int last() { - fCachedBreakPositions = null; - fDictionaryCharCount = 0; - fPositionInCache = 0; - if (fText == null) { - fLastRuleStatusIndex = 0; - fLastStatusIndexValid = true; return BreakIterator.DONE; } - - // t.last() returns the offset of the last character, - // rather than the past-the-end offset - // so a loop like for(p=it.last(); p!=DONE; p=it.previous()) ... - // will work correctly. - fLastStatusIndexValid = false; - int pos = fText.getEndIndex(); - fText.setIndex(pos); - return pos; + int endPos = fText.getEndIndex(); + boolean endShouldBeBoundary = isBoundary(endPos); // Has side effect of setting iterator position. 
+ assert(endShouldBeBoundary); + if (fPosition != endPos) { + assert(fPosition == endPos); + } + return endPos; } /** @@ -379,14 +396,17 @@ public int last() { */ @Override public int next(int n) { - int result = current(); - while (n > 0) { - result = next(); - --n; - } - while (n < 0) { - result = previous(); - ++n; + int result = 0; + if (n > 0) { + for (; n > 0 && result != DONE; --n) { + result = next(); + } + } else if (n < 0) { + for (; n < 0 && result != DONE; ++n) { + result = previous(); + } + } else { + result = current(); } return result; } @@ -397,399 +417,42 @@ public int next(int n) { */ @Override public int next() { - // if we have cached break positions and we're still in the range - // covered by them, just move one step forward in the cache - if (fCachedBreakPositions != null) { - if (fPositionInCache < fCachedBreakPositions.length - 1) { - ++fPositionInCache; - int pos = fCachedBreakPositions[fPositionInCache]; - fText.setIndex(pos); - return pos; - } - else { - reset(); - } - } - - int startPos = current(); - fDictionaryCharCount = 0; - int result = handleNext(fRData.fFTable); - if (fDictionaryCharCount > 0) { - result = checkDictionary(startPos, result, false); - } - return result; + fBreakCache.next(); + return fDone ? DONE : fPosition; } /** - * checkDictionary This function handles all processing of characters in - * the "dictionary" set. It will determine the appropriate - * course of action, and possibly set up a cache in the - * process. - */ - private int checkDictionary(int startPos, int endPos, boolean reverse) { - - // Reset the old break cache first. - reset(); - - // note: code segment below assumes that dictionary chars are in the - // startPos-endPos range - // value returned should be next character in sequence - if ((endPos - startPos) <= 1) { - return (reverse ? 
startPos : endPos); - } - - // Starting from the starting point, scan towards the proposed result, - // looking for the first dictionary character (which may be the one - // we're on, if we're starting in the middle of a range). - fText.setIndex(reverse ? endPos : startPos); - if (reverse) { - CharacterIteration.previous32(fText); - } - - int rangeStart = startPos; - int rangeEnd = endPos; - - int category; - int current; - DictionaryBreakEngine.DequeI breaks = new DictionaryBreakEngine.DequeI(); - int foundBreakCount = 0; - int c = CharacterIteration.current32(fText); - category = (short)fRData.fTrie.getCodePointValue(c); - - // Is the character we're starting on a dictionary character? If so, we - // need to back up to include the entire run; otherwise the results of - // the break algorithm will differ depending on where we start. Since - // the result is cached and there is typically a non-dictionary break - // within a small number of words, there should be little performance impact. - if ((category & 0x4000) != 0) { - if (reverse) { - do { - CharacterIteration.next32(fText); - c = CharacterIteration.current32(fText); - category = (short)fRData.fTrie.getCodePointValue(c); - } while (c != CharacterIteration.DONE32 && ((category & 0x4000)) != 0); - - // Back up to the last dictionary character - rangeEnd = fText.getIndex(); - if (c == CharacterIteration.DONE32) { - // c = fText->last32(); - // TODO: why was this if needed? 
- c = CharacterIteration.previous32(fText); - } - else { - c = CharacterIteration.previous32(fText); - } - } - else { - do { - c = CharacterIteration.previous32(fText); - category = (short)fRData.fTrie.getCodePointValue(c); - } - while (c != CharacterIteration.DONE32 && ((category & 0x4000) != 0)); - // Back up to the last dictionary character - if (c == CharacterIteration.DONE32) { - // c = fText->first32(); - c = CharacterIteration.current32(fText); - } - else { - CharacterIteration.next32(fText); - c = CharacterIteration.current32(fText); - } - rangeStart = fText.getIndex(); - } - category = (short)fRData.fTrie.getCodePointValue(c); - } - - - // Loop through the text, looking for ranges of dictionary characters. - // For each span, find the appropriate break engine, and ask it to find - // any breaks within the span. - // Note: we always do this in the forward direction, so that the break - // cache is built in the right order. - if (reverse) { - fText.setIndex(rangeStart); - c = CharacterIteration.current32(fText); - category = (short)fRData.fTrie.getCodePointValue(c); - } - LanguageBreakEngine lbe = null; - while(true) { - while((current = fText.getIndex()) < rangeEnd && (category & 0x4000) == 0) { - CharacterIteration.next32(fText); - c = CharacterIteration.current32(fText); - category = (short)fRData.fTrie.getCodePointValue(c); - } - if (current >= rangeEnd) { - break; - } - - // We now have a dictionary character. Get the appropriate language object - // to deal with it. - lbe = getLanguageBreakEngine(c); - - // Ask the language object if there are any breaks. It will leave the text - // pointer on the other side of its range, ready to search for the next one. 
- if (lbe != null) { - int startingIdx = fText.getIndex(); - foundBreakCount += lbe.findBreaks(fText, rangeStart, rangeEnd, false, fBreakType, breaks); - assert fText.getIndex() > startingIdx; - } - - // Reload the loop variables for the next go-round - c = CharacterIteration.current32(fText); - category = (short)fRData.fTrie.getCodePointValue(c); - } - - // If we found breaks, build a new break cache. The first and last entries must - // be the original starting and ending position. - if (foundBreakCount > 0) { - if (foundBreakCount != breaks.size()) { - System.out.println("oops, foundBreakCount != breaks.size(). LBE = " + lbe.getClass()); - } - assert foundBreakCount == breaks.size(); - if (startPos < breaks.peekLast()) { - breaks.offer(startPos); - } - if (endPos > breaks.peek()) { - breaks.push(endPos); - } - - // TODO: get rid of this array, use results from the deque directly - fCachedBreakPositions = new int[breaks.size()]; - - int i = 0; - while (breaks.size() > 0) { - fCachedBreakPositions[i++] = breaks.pollLast(); - } - - // If there are breaks, then by definition, we are replacing the original - // proposed break by one of the breaks we found. Use following() and - // preceding() to do the work. They should never recurse in this case. - if (reverse) { - return preceding(endPos); - } - else { - return following(startPos); - } - } - - // If we get here, there were no language-based breaks. Set the text pointer - // to the original proposed break. - fText.setIndex(reverse ? startPos : endPos); - return (reverse ? startPos : endPos); - - } - - - /** - * Moves the iterator backwards, to the last boundary preceding this one. - * @return The position of the last boundary position preceding this one. + * Moves the iterator backwards, to the boundary preceding the current one. + * @return The position of the boundary position immediately preceding the starting position. 
*/ @Override public int previous() { - int result; - int startPos; - - CharacterIterator text = getText(); - - fLastStatusIndexValid = false; - - // if we have cached break positions and we're still in the range - // covered by them, just move one step backward in the cache - if (fCachedBreakPositions != null) { - if (fPositionInCache > 0) { - --fPositionInCache; - // If we're at the beginning of the cache, need to reevaluate the - // rule status - if (fPositionInCache <= 0) { - fLastStatusIndexValid = false; - } - int pos = fCachedBreakPositions[fPositionInCache]; - text.setIndex(pos); - return pos; - } else { - reset(); - } - } - - // if we're already sitting at the beginning of the text, return DONE - startPos = current(); - if (fText == null || startPos == fText.getBeginIndex()) { - fLastRuleStatusIndex = 0; - fLastStatusIndexValid = true; - return BreakIterator.DONE; - } - - // Rules with an exact reverse table are handled here. - if (fRData.fSRTable != null || fRData.fSFTable != null) { - result = handlePrevious(fRData.fRTable); - if (fDictionaryCharCount > 0) { - result = checkDictionary(result, startPos, true); - } - return result; - } - - // old rule syntax - // set things up. handlePrevious() will back us up to some valid - // break position before the current position (we back our internal - // iterator up one step to prevent handlePrevious() from returning - // the current position), but not necessarily the last one before - // where we started - - int start = current(); - - previous32(fText); - int lastResult = handlePrevious(fRData.fRTable); - if (lastResult == BreakIterator.DONE) { - lastResult = fText.getBeginIndex(); - fText.setIndex(lastResult); - } - result = lastResult; - int lastTag = 0; - boolean breakTagValid = false; - - // iterate forward from the known break position until we pass our - // starting point. 
The last break position before the starting - // point is our return value - - for (;;) { - result = next(); - if (result == BreakIterator.DONE || result >= start) { - break; - } - lastResult = result; - lastTag = fLastRuleStatusIndex; - breakTagValid = true; - } - - // fLastBreakTag wants to have the value for section of text preceding - // the result position that we are to return (in lastResult.) If - // the backwards rules overshot and the above loop had to do two or more - // handleNext()s to move up to the desired return position, we will have a valid - // tag value. But, if handlePrevious() took us to exactly the correct result position, - // we wont have a tag value for that position, which is only set by handleNext(). - - // Set the current iteration position to be the last break position - // before where we started, and then return that value. - fText.setIndex(lastResult); - fLastRuleStatusIndex = lastTag; // for use by getRuleStatus() - fLastStatusIndexValid = breakTagValid; - return lastResult; + fBreakCache.previous(); + return fDone ? DONE : fPosition; } /** * Sets the iterator to refer to the first boundary position following * the specified position. - * @param offset The position from which to begin searching for a break position. + * @param startPos The position from which to begin searching for a break position. * @return The position of the first break after the current position. */ @Override - public int following(int offset) { - CharacterIterator text = getText(); - - // if we have no cached break positions, or if "offset" is outside the - // range covered by the cache, then dump the cache and call our - // inherited following() method. This will call other methods in this - // class that may refresh the cache. 
- if (fCachedBreakPositions == null || offset < fCachedBreakPositions[0] || - offset >= fCachedBreakPositions[fCachedBreakPositions.length - 1]) { - fCachedBreakPositions = null; - return rulesFollowing(offset); - } - - // on the other hand, if "offset" is within the range covered by the - // cache, then just search the cache for the first break position - // after "offset" - else { - fPositionInCache = 0; - while (fPositionInCache < fCachedBreakPositions.length - && offset >= fCachedBreakPositions[fPositionInCache]) - ++fPositionInCache; - text.setIndex(fCachedBreakPositions[fPositionInCache]); - return text.getIndex(); - } - } - - private int rulesFollowing(int offset) { - // if the offset passed in is already past the end of the text, - // just return DONE; if it's before the beginning, return the + public int following(int startPos) { + // if the supplied position is before the beginning, return the // text's starting offset - fLastRuleStatusIndex = 0; - fLastStatusIndexValid = true; - if (fText == null || offset >= fText.getEndIndex()) { - last(); - return next(); - } - else if (offset < fText.getBeginIndex()) { + if (startPos < fText.getBeginIndex()) { return first(); } - // otherwise, set our internal iteration position (temporarily) - // to the position passed in. If this is the _beginning_ position, - // then we can just use next() to get our return value - - int result = 0; - - if (fRData.fSRTable != null) { - // Safe Point Reverse rules exist. - // This allows us to use the optimum algorithm. - fText.setIndex(offset); - // move forward one codepoint to prepare for moving back to a - // safe point. 
- // this handles offset being between a supplementary character - next32(fText); - // handlePrevious will move most of the time to < 1 boundary away - handlePrevious(fRData.fSRTable); - result = next(); - while (result <= offset) { - result = next(); - } - return result; - } - if (fRData.fSFTable != null) { - // No Safe point reverse table, but there is a safe pt forward table. - // - fText.setIndex(offset); - previous32(fText); - // handle next will give result >= offset - handleNext(fRData.fSFTable); - // previous will give result 0 or 1 boundary away from offset, - // most of the time - // we have to - int oldresult = previous(); - while (oldresult > offset) { - result = previous(); - if (result <= offset) { - return oldresult; - } - oldresult = result; - } - result = next(); - if (result <= offset) { - return next(); - } - return result; - } - // otherwise, we have to sync up first. Use handlePrevious() to back - // us up to a known break position before the specified position (if - // we can determine that the specified position is a break position, - // we don't back up at all). This may or may not be the last break - // position at or before our starting position. Advance forward - // from here until we've passed the starting position. The position - // we stop on will be the first break position after the specified one. - // old rule syntax - - fText.setIndex(offset); - if (offset == fText.getBeginIndex()) { - return next(); - } - result = previous(); + // Move requested offset to a code point start. It might be on a trail surrogate. + // Or it may be beyond the end of the text. + startPos = CISetIndex32(fText, startPos); + fBreakCache.following(startPos); + return fDone ? DONE : fPosition; + } - while (result != BreakIterator.DONE && result <= offset) { - result = next(); - } - return result; - } /** * Sets the iterator to refer to the last boundary position before the * specified position. 
@@ -798,95 +461,21 @@ else if (offset < fText.getBeginIndex()) { */ @Override public int preceding(int offset) { - CharacterIterator text = getText(); - - // if we have no cached break positions, or "offset" is outside the - // range covered by the cache, we can just call the inherited routine - // (which will eventually call other routines in this class that may - // refresh the cache) - if (fCachedBreakPositions == null || offset <= fCachedBreakPositions[0] || - offset > fCachedBreakPositions[fCachedBreakPositions.length - 1]) { - fCachedBreakPositions = null; - return rulesPreceding(offset); - } - - // on the other hand, if "offset" is within the range covered by the cache, - // then all we have to do is search the cache for the last break position - // before "offset" - else { - fPositionInCache = 0; - while (fPositionInCache < fCachedBreakPositions.length - && offset > fCachedBreakPositions[fPositionInCache]) - ++fPositionInCache; - --fPositionInCache; - text.setIndex(fCachedBreakPositions[fPositionInCache]); - return text.getIndex(); - } - } - - private int rulesPreceding(int offset) { - // if the offset passed in is already past the end of the text, - // just return DONE; if it's before the beginning, return the - - // text's starting offset if (fText == null || offset > fText.getEndIndex()) { - // return BreakIterator::DONE; return last(); - } - else if (offset < fText.getBeginIndex()) { + } else if (offset < fText.getBeginIndex()) { return first(); } - // if we start by updating the current iteration position to the - // position specified by the caller, we can just use previous() - // to carry out this operation + // Move requested offset to a code point start. It might be on a trail surrogate. + // int adjustedOffset = CISetIndex32(fText, offset); // TODO: restore to match ICU4C behavior. + int adjustedOffset = offset; + fBreakCache.preceding(adjustedOffset); + return fDone ? 
DONE : fPosition; - int result; - if (fRData.fSFTable != null) { - /// todo synwee - // new rule syntax - fText.setIndex(offset); - // move backwards one codepoint to prepare for moving forwards to a - // safe point. - // this handles offset being between a supplementary character - previous32(fText); - handleNext(fRData.fSFTable); - result = previous(); - while (result >= offset) { - result = previous(); - } - return result; - } - if (fRData.fSRTable != null) { - // backup plan if forward safe table is not available - fText.setIndex(offset); - next32(fText); - // handle previous will give result <= offset - handlePrevious(fRData.fSRTable); - - // next will give result 0 or 1 boundary away from offset, - // most of the time - // we have to - int oldresult = next(); - while (oldresult < offset) { - result = next(); - if (result >= offset) { - return oldresult; - } - oldresult = result; - } - result = previous(); - if (result >= offset) { - return previous(); - } - return result; - } - - // old rule syntax - fText.setIndex(offset); - return previous(); } + /** * Throw IllegalArgumentException unless begin <= offset < end. */ @@ -906,64 +495,41 @@ protected static final void checkOffset(int offset, CharacterIterator text) { */ @Override public boolean isBoundary(int offset) { + // TODO: behavior difference with ICU4C, which considers out-of-range offsets + // to not be boundaries, and to not be errors. checkOffset(offset, fText); - // the beginning index of the iterator is always a boundary position by definition - if (offset == fText.getBeginIndex()) { - first(); // For side effects on current position, tag values. - return true; - } + // Adjust offset to be on a code point boundary and not beyond the end of the text. + // Note that isBoundary() is always be false for offsets that are not on code point boundaries. + // But we still need the side effect of leaving iteration at the following boundary. 
+ int adjustedOffset = CISetIndex32(fText, offset); - if (offset == fText.getEndIndex()) { - last(); // For side effects on current position, tag values. - return true; + boolean result = false; + if (fBreakCache.seek(adjustedOffset) || fBreakCache.populateNear(adjustedOffset)) { + result = (fBreakCache.current() == offset); } - // otherwise, we can use following() on the position before the specified - // one and return true if the position we get back is the one the user - // specified - - // return following(offset - 1) == offset; - // TODO: check whether it is safe to revert to the simpler offset-1 code - // The safe rules may take care of unpaired surrogates ok. - fText.setIndex(offset); - previous32(fText); - int pos = fText.getIndex(); - boolean result = following(pos) == offset; + if (!result) { + // Not on a boundary. isBoundary() must leave iterator on the following boundary. + // fBreakCache.seek(), above, left us on the preceding boundary, so advance one. + next(); + } return result; + } /** - * Returns the current iteration position. + * Returns the current iteration position. Note that UBRK_DONE is never + * returned from this function; if iteration has run to the end of a + * string, current() will return the length of the string while + * next() will return BreakIterator.DONE). * @return The current iteration position. */ @Override public int current() { - return (fText != null) ? fText.getIndex() : BreakIterator.DONE; + return (fText != null) ? fPosition : BreakIterator.DONE; } - private void makeRuleStatusValid() { - if (fLastStatusIndexValid == false) { - // No cached status is available. - int curr = current(); - if (curr == BreakIterator.DONE || curr == fText.getBeginIndex()) { - // At start of text, or there is no text. Status is always zero. - fLastRuleStatusIndex = 0; - fLastStatusIndexValid = true; - } else { - // Not at start of text. Find status the tedious way. 
- int pa = fText.getIndex(); - first(); - int pb = current(); - while (fText.getIndex() < pa) { - pb = next(); - } - Assert.assrt(pa == pb); - } - Assert.assrt(fLastStatusIndexValid == true); - Assert.assrt(fLastRuleStatusIndex >= 0 && fLastRuleStatusIndex < fRData.fStatusTable.length); - } - } /** * Return the status tag from the break rule that determined the most recently @@ -972,7 +538,7 @@ private void makeRuleStatusValid() { * status, a default value of 0 is returned. If more than one rule applies, * the numerically largest of the possible status values is returned. *

- * Of the standard types of ICU break iterators, only the word break + * Of the standard types of ICU break iterators, only the word and line break * iterator provides status values. The values are defined in * class RuleBasedBreakIterator, and allow distinguishing between words * that contain alphabetic letters, "words" that appear to be numbers, @@ -983,13 +549,10 @@ private void makeRuleStatusValid() { *

* @return the status from the break rule that determined the most recently * returned break position. - * - * @hide draft / provisional / internal are hidden on Android */ @Override public int getRuleStatus() { - makeRuleStatusValid(); // Status records have this form: // Count N <-- fLastRuleStatusIndex points here. // Status val 0 @@ -998,7 +561,7 @@ public int getRuleStatus() { // Status val N-1 <-- the value we need to return // The status values are sorted in ascending order. // This function returns the last (largest) of the array of status values. - int idx = fLastRuleStatusIndex + fRData.fStatusTable[fLastRuleStatusIndex]; + int idx = fRuleStatusIndex + fRData.fStatusTable[fRuleStatusIndex]; int tagVal = fRData.fStatusTable[idx]; return tagVal; } @@ -1022,16 +585,14 @@ public int getRuleStatus() { * In the event that the array is too small, the return value * is the total number of status values that were available, * not the reduced number that were actually returned. - * @hide draft / provisional / internal are hidden on Android */ @Override public int getRuleStatusVec(int[] fillInArray) { - makeRuleStatusValid(); - int numStatusVals = fRData.fStatusTable[fLastRuleStatusIndex]; + int numStatusVals = fRData.fStatusTable[fRuleStatusIndex]; if (fillInArray != null) { int numToCopy = Math.min(numStatusVals, fillInArray.length); for (int i=0; i engine. - script = UScript.HAN; - } + synchronized (gAllBreakEngines) { + // This break iterator's list of break engines didn't handle the character. + // Check the global list, another break iterator may have instantiated the + // desired engine. + for (LanguageBreakEngine candidate : gAllBreakEngines) { + if (candidate.handles(c, fBreakType)) { + fBreakEngines.add(candidate); + return candidate; + } + } + + // The global list doesn't have an existing engine, build one. 
+ int script = UCharacter.getIntPropertyValue(c, UProperty.SCRIPT); + if (script == UScript.KATAKANA || script == UScript.HIRAGANA) { + // Katakana, Hiragana and Han are handled by the same dictionary engine. + // Fold them together for mapping from script -> engine. + script = UScript.HAN; + } - LanguageBreakEngine eng = fBreakEngines.get(script); - /* - if (eng != null && !eng.handles(c, fBreakType)) { - fUnhandledBreakEngine.handleChar(c, getBreakType()); - eng = fUnhandledBreakEngine; - } else */ { + LanguageBreakEngine eng; /* J2ObjC removed: Only "gregorian" and "julian" calendars are supported. try { switch (script) { @@ -1127,40 +699,33 @@ private LanguageBreakEngine getLanguageBreakEngine(int c) { eng = new CjkBreakEngine(false); } else { - fUnhandledBreakEngine.handleChar(c, getBreakType()); - eng = fUnhandledBreakEngine; + gUnhandledBreakEngine.handleChar(c, getBreakType()); + eng = gUnhandledBreakEngine; } break; case UScript.HANGUL: if (getBreakType() == KIND_WORD) { eng = new CjkBreakEngine(true); } else { - fUnhandledBreakEngine.handleChar(c, getBreakType()); - eng = fUnhandledBreakEngine; + gUnhandledBreakEngine.handleChar(c, getBreakType()); + eng = gUnhandledBreakEngine; } break; - default: - fUnhandledBreakEngine.handleChar(c, getBreakType()); - eng = fUnhandledBreakEngine; - break; + default: */ + gUnhandledBreakEngine.handleChar(c, getBreakType()); + eng = gUnhandledBreakEngine; + /* break; } } catch (IOException e) { eng = null; } */ - fUnhandledBreakEngine.handleChar(c, getBreakType()); - eng = fUnhandledBreakEngine; - } - if (eng != null && eng != fUnhandledBreakEngine) { - LanguageBreakEngine existingEngine = fBreakEngines.putIfAbsent(script, eng); - if (existingEngine != null) { - // There was a race & another thread was first to register an engine for this script. - // Use theirs and discard the one we just created. 
- eng = existingEngine; + if (eng != null && eng != gUnhandledBreakEngine) { + gAllBreakEngines.add(eng); + fBreakEngines.add(eng); } - // assert eng.handles(c, fBreakType); - } - return eng; + return eng; + } // end synchronized(gAllBreakEngines) } private static final int kMaxLookaheads = 8; @@ -1214,7 +779,14 @@ void reset() { * The State Machine Engine for moving forward is here. * This function is the heart of the RBBI run time engine. * - * @param stateTable + * Input + * fPosition, the position in the text to begin from. + * Output + * fPosition: the boundary following the starting position. + * fDictionaryCharCount the number of dictionary characters encountered. + * If > 0, the segment will be further subdivided + * fRuleStatusIndex Info from the state table indicating which rules caused the boundary. + * * @return the new iterator position * * A note on supplementary characters and the position of underlying @@ -1225,29 +797,34 @@ void reset() { * This is different from everywhere else, where an iterator always * points at the lead surrogate of a supplementary. */ - private int handleNext(short stateTable[]) { + private int handleNext() { if (TRACE) { System.out.println("Handle Next pos char state category"); } - // No matter what, handleNext alway correctly sets the break tag value. - fLastStatusIndexValid = true; - fLastRuleStatusIndex = 0; + // handleNext always sets the break tag value. + // Set the default for it. 
+ fRuleStatusIndex = 0; + fDictionaryCharCount = 0; // caches for quicker access CharacterIterator text = fText; - CharTrie trie = fRData.fTrie; + Trie2 trie = fRData.fTrie; + + short[] stateTable = fRData.fFTable; + int initialPosition = fPosition; + text.setIndex(initialPosition); + int result = initialPosition; // Set up the starting char - int c = text.current(); + int c = text.current(); if (c >= UTF16.LEAD_SURROGATE_MIN_VALUE) { c = nextTrail32(text, c); if (c == DONE32) { + fDone = true; return BreakIterator.DONE; } } - int initialPosition = text.getIndex(); - int result = initialPosition; // Set the initial state for the state machine int state = START_STATE; @@ -1289,7 +866,7 @@ else if (mode == RBBI_RUN) { // look up the current character's character category, which tells us // which column in the state table to look at. // - category = (short) trie.getCodePointValue(c); + category = (short) trie.get(c); // Check the dictionary bit in the character's category. // Counter is only used by dictionary based iterators (subclasses). @@ -1334,7 +911,7 @@ else if (mode == RBBI_RUN) { } // Remember the break status (tag) values. - fLastRuleStatusIndex = stateTable[row + RBBIDataWrapper.TAGIDX]; + fRuleStatusIndex = stateTable[row + RBBIDataWrapper.TAGIDX]; } int completedRule = stateTable[row + RBBIDataWrapper.ACCEPTING]; @@ -1342,8 +919,8 @@ else if (mode == RBBI_RUN) { // Lookahead match is completed int lookaheadResult = fLookAheadMatches.getPosition(completedRule); if (lookaheadResult >= 0) { - fLastRuleStatusIndex = stateTable[row + RBBIDataWrapper.TAGIDX]; - text.setIndex(lookaheadResult); + fRuleStatusIndex = stateTable[row + RBBIDataWrapper.TAGIDX]; + fPosition = lookaheadResult; return lookaheadResult; } } @@ -1376,21 +953,32 @@ else if (mode == RBBI_RUN) { text.setIndex(initialPosition); next32(text); result = text.getIndex(); + fRuleStatusIndex = 0; } - else { - // Leave the iterator at our result position. 
- // (we may have advanced beyond the last accepting position chasing after - // longer matches that never completed.) - text.setIndex(result); - } + + // Leave the iterator at our result position. + // (we may have advanced beyond the last accepting position chasing after + // longer matches that never completed.) + fPosition = result; + if (TRACE) { System.out.println("result = " + result); } return result; } - private int handlePrevious(short stateTable[]) { - if (fText == null || stateTable == null) { + /** + * Iterate backwards from an arbitrary position in the input text using the Safe Reverse rules. + * This locates a "Safe Position" from which the forward break rules + * will operate correctly. A Safe Position is not necessarily a boundary itself. + * + * The logic of this function is very similar to handleNext(), above. + * + * @param fromPosition the position in the input text to begin the iteration. + * @hide draft / provisional / internal are hidden on Android + */ + private int handlePrevious(int fromPosition) { + if (fText == null) { return 0; } @@ -1400,18 +988,15 @@ private int handlePrevious(short stateTable[]) { int row; int c; int result = 0; - int initialPosition = 0; + int initialPosition = fromPosition; fLookAheadMatches.reset(); - - // handlePrevious() never gets the rule status. - // Flag the status as invalid; if the user ever asks for status, we will need - // to back up, then re-find the break position using handleNext(), which does - // get the status value. - fLastStatusIndexValid = false; - fLastRuleStatusIndex = 0; + short[] stateTable = fRData.fSRTable; + CISetIndex32(fText, fromPosition); + if (fromPosition == fText.getBeginIndex()) { + return BreakIterator.DONE; + } // set up the starting char - initialPosition = fText.getIndex(); result = initialPosition; c = previous32(fText); @@ -1434,17 +1019,9 @@ private int handlePrevious(short stateTable[]) { mainLoop: for (;;) { if (c == DONE32) { // Reached end of input string. 
- if (mode == RBBI_END || fRData.fHeader.fVersion == 1) { - // Either this is the old (ICU 3.2 and earlier) format data which - // does not support explicit support for matching {eof}, or - // we have already done the {eof} iteration. Now is the time + if (mode == RBBI_END) { + // We have already done the {eof} iteration. Now is the time // to unconditionally bail out. - if (result == initialPosition) { - // Ran off start, no match found. - // Move one position (towards the start, since we are doing previous.) - fText.setIndex(initialPosition); - previous32(fText); - } break mainLoop; } mode = RBBI_END; @@ -1455,21 +1032,11 @@ private int handlePrevious(short stateTable[]) { // look up the current character's category, which tells us // which column in the state table to look at. // - category = (short) fRData.fTrie.getCodePointValue(c); - - // Check the dictionary bit in the character's category. - // Counter is only used by dictionary based iterators (subclasses). - // Chars that need to be handled by a dictionary have a flag bit set - // in their category values. - // - if ((category & 0x4000) != 0) { - fDictionaryCharCount++; - // And off the dictionary flag bit. - category &= ~0x4000; - } + // And off the dictionary flag bit. For reverse iteration it is not used. + category = (short) fRData.fTrie.get(c); + category &= ~0x4000; } - if (TRACE) { System.out.print(" " + fText.getIndex() + " "); if (0x20 <= c && c < 0x7f) { @@ -1528,21 +1095,775 @@ private int handlePrevious(short stateTable[]) { // The state machine is done. Check whether it found a match... // - // If the iterator failed to advance in the match engine, force it ahead by one. + // If the iterator failed to move in the match engine, force it back by one code point. // (This really indicates a defect in the break rules. They should always match // at least one character.) 
if (result == initialPosition) { - result = fText.setIndex(initialPosition); + CISetIndex32(fText, initialPosition); previous32(fText); result = fText.getIndex(); } - fText.setIndex(result); if (TRACE) { System.out.println("Result = " + result); } return result; } + + /** + * Set the index of a CharacterIterator. + * Pin the index to the valid range range of BeginIndex <= index <= EndIndex. + * If the index points to a trail surrogate of a supplementary character, adjust it + * to the start (lead surrogate) index. + * + * @param ci A CharacterIterator to set + * @param index the index to set + * @return the resulting index, possibly pinned or adjusted. + */ + private static int CISetIndex32(CharacterIterator ci, int index) { + if (index <= ci.getBeginIndex()) { + ci.first(); + } else if (index >= ci.getEndIndex()) { + ci.setIndex(ci.getEndIndex()); + } else if (Character.isLowSurrogate(ci.setIndex(index))) { + if (!Character.isHighSurrogate(ci.previous())) { + ci.next(); + } + } + return ci.getIndex(); + } + + /* DictionaryCache stores the boundaries obtained from a run of dictionary characters. + * Dictionary boundaries are moved first to this cache, then from here + * to the main BreakCache, where they may inter-leave with non-dictionary + * boundaries. The public BreakIterator API always fetches directly + * from the main BreakCache, not from here. + * + * In common situations, the number of boundaries in a single dictionary run + * should be quite small, it will be terminated by punctuation, spaces, + * or any other non-dictionary characters. The main BreakCache may end + * up with boundaries from multiple dictionary based runs. + * + * The boundaries are stored in a simple ArrayList (vector), with the + * assumption that they will be accessed sequentially. 
+ */ + @WeakOuter + class DictionaryCache { + + void reset() { + fPositionInCache = -1; + fStart = 0; + fLimit = 0; + fFirstRuleStatusIndex = 0; + fOtherRuleStatusIndex = 0; + fBreaks.removeAllElements(); + }; + + boolean following(int fromPos) { + if (fromPos >= fLimit || fromPos < fStart) { + fPositionInCache = -1; + return false; + } + + // Sequential iteration, move from previous boundary to the following + + int r = 0; + if (fPositionInCache >= 0 && fPositionInCache < fBreaks.size() && fBreaks.elementAt(fPositionInCache) == fromPos) { + ++fPositionInCache; + if (fPositionInCache >= fBreaks.size()) { + fPositionInCache = -1; + return false; + } + r = fBreaks.elementAt(fPositionInCache); + assert(r > fromPos); + fBoundary = r; + fStatusIndex = fOtherRuleStatusIndex; + return true; + } + + // Random indexing. Linear search for the boundary following the given position. + + for (fPositionInCache = 0; fPositionInCache < fBreaks.size(); ++fPositionInCache) { + r= fBreaks.elementAt(fPositionInCache); + if (r > fromPos) { + fBoundary = r; + fStatusIndex = fOtherRuleStatusIndex; + return true; + } + } + + // Internal error. fStart <= fromPos < fLimit, but no cached boundary. + assert(false); + fPositionInCache = -1; + return false; + }; + + boolean preceding(int fromPos) { + if (fromPos <= fStart || fromPos > fLimit) { + fPositionInCache = -1; + return false; + } + + if (fromPos == fLimit) { + fPositionInCache = fBreaks.size() - 1; + if (fPositionInCache >= 0) { + assert(fBreaks.elementAt(fPositionInCache) == fromPos); + } + } + + int r; + if (fPositionInCache > 0 && fPositionInCache < fBreaks.size() && fBreaks.elementAt(fPositionInCache) == fromPos) { + --fPositionInCache; + r = fBreaks.elementAt(fPositionInCache); + assert(r < fromPos); + fBoundary = r; + fStatusIndex = ( r== fStart) ? 
fFirstRuleStatusIndex : fOtherRuleStatusIndex; + return true; + } + + if (fPositionInCache == 0) { + fPositionInCache = -1; + return false; + } + + for (fPositionInCache = fBreaks.size()-1; fPositionInCache >= 0; --fPositionInCache) { + r = fBreaks.elementAt(fPositionInCache); + if (r < fromPos) { + fBoundary = r; + fStatusIndex = ( r == fStart) ? fFirstRuleStatusIndex : fOtherRuleStatusIndex; + return true; + } + } + assert(false); + fPositionInCache = -1; + return false; + }; + + /** + * Populate the cache with the dictionary based boundaries within a region of text. + * @param startPos The start position of a range of text + * @param endPos The end position of a range of text + * @param firstRuleStatus The rule status index that applies to the break at startPos + * @param otherRuleStatus The rule status index that applies to boundaries other than startPos + * @hide draft / provisional / internal are hidden on Android + */ + void populateDictionary(int startPos, int endPos, + int firstRuleStatus, int otherRuleStatus) { + if ((endPos - startPos) <= 1) { + return; + } + + reset(); + fFirstRuleStatusIndex = firstRuleStatus; + fOtherRuleStatusIndex = otherRuleStatus; + + int rangeStart = startPos; + int rangeEnd = endPos; + + int category; + int current; + int foundBreakCount = 0; + + // Loop through the text, looking for ranges of dictionary characters. + // For each span, find the appropriate break engine, and ask it to find + // any breaks within the span. + + fText.setIndex(rangeStart); + int c = CharacterIteration.current32(fText); + category = (short)fRData.fTrie.get(c); + + while(true) { + while((current = fText.getIndex()) < rangeEnd && (category & 0x4000) == 0) { + c = CharacterIteration.next32(fText); // pre-increment + category = (short)fRData.fTrie.get(c); + } + if (current >= rangeEnd) { + break; + } + + // We now have a dictionary character. Get the appropriate language object + // to deal with it. 
+ LanguageBreakEngine lbe = getLanguageBreakEngine(c); + + // Ask the language object if there are any breaks. It will add them to the cache and + // leave the text pointer on the other side of its range, ready to search for the next one. + if (lbe != null) { + foundBreakCount += lbe.findBreaks(fText, rangeStart, rangeEnd, fBreakType, fBreaks); + } + + // Reload the loop variables for the next go-round + c = CharacterIteration.current32(fText); + category = (short)fRData.fTrie.get(c); + } + + // If we found breaks, ensure that the first and last entries are + // the original starting and ending position. And initialize the + // cache iteration position to the first entry. + + // System.out.printf("foundBreakCount = %d%n", foundBreakCount); + if (foundBreakCount > 0) { + assert(foundBreakCount == fBreaks.size()); + if (startPos < fBreaks.elementAt(0)) { + // The dictionary did not place a boundary at the start of the segment of text. + // Add one now. This should not commonly happen, but it would be easy for interactions + // of the rules for dictionary segments and the break engine implementations to + // inadvertently cause it. Cover it here, just in case. + fBreaks.offer(startPos); + } + if (endPos > fBreaks.peek()) { + fBreaks.push(endPos); + } + fPositionInCache = 0; + // Note: Dictionary matching may extend beyond the original limit. + fStart = fBreaks.elementAt(0); + fLimit = fBreaks.peek(); + } else { + // there were no language-based breaks, even though the segment contained + // dictionary characters. Subsequent attempts to fetch boundaries from the dictionary cache + // for this range will fail, and the calling code will fall back to the rule based boundaries. + } + + }; + + + DictionaryCache() { + fPositionInCache = -1; + fBreaks = new DictionaryBreakEngine.DequeI(); + } + + /** + * copy constructor. Used by RuleBasedBreakIterator.clone(). + * + * @param src the source object to be copied. 
+ */ + DictionaryCache(DictionaryCache src) { + try { + fBreaks = (DictionaryBreakEngine.DequeI)src.fBreaks.clone(); + } + catch (CloneNotSupportedException e) { + throw new RuntimeException(e); + } + fPositionInCache = src.fPositionInCache; + fStart = src.fStart; + fLimit = src.fLimit; + fFirstRuleStatusIndex = src.fFirstRuleStatusIndex; + fOtherRuleStatusIndex = src.fOtherRuleStatusIndex; + fBoundary = src.fBoundary; + fStatusIndex = src.fStatusIndex; + } + + // A data structure containing the boundaries themselves. Essentially a vector of raw ints. + DictionaryBreakEngine.DequeI fBreaks; + int fPositionInCache; // Index in fBreaks of last boundary returned by following() + // // or preceding(). Optimizes sequential access. + int fStart; // Text position of first boundary in cache. + int fLimit; // Last boundary in cache. Which is the limit of the + // // text segment being handled by the dictionary. + int fFirstRuleStatusIndex; // Rule status info for first boundary. + int fOtherRuleStatusIndex; // Rule status info for 2nd through last boundaries. + int fBoundary; // Current boundary. Set by preceding(), following(). + int fStatusIndex; // Current rule status index. Set by preceding, following(). + }; + + + + +/* + * class BreakCache + * + * Cache of break boundary positions and rule status values. + * Break iterator API functions, next(), previous(), etc., will use cached results + * when possible, and otherwise cache new results as they are obtained. + * + * Uniformly caches both dictionary and rule based (non-dictionary) boundaries. + * + * The cache is implemented as a single circular buffer. + */ + +/* + * size of the circular cache buffer. 
+ */ +@WeakOuter +class BreakCache { + + BreakCache() { + reset(); + }; + + void reset(int pos, int ruleStatus) { + fStartBufIdx = 0; + fEndBufIdx = 0; + fTextIdx = pos; + fBufIdx = 0; + fBoundaries[0] = pos; + fStatuses[0] = (short)ruleStatus; + } + + void reset() {reset(0, 0); }; + + void next() { + if (fBufIdx == fEndBufIdx) { + fDone = !populateFollowing(); + fPosition = fTextIdx; + fRuleStatusIndex = fStatuses[fBufIdx]; + } else { + fBufIdx = modChunkSize(fBufIdx + 1); + fTextIdx = fPosition = fBoundaries[fBufIdx]; + fRuleStatusIndex = fStatuses[fBufIdx]; + } + }; + + void previous() { + int initialBufIdx = fBufIdx; + if (fBufIdx == fStartBufIdx) { + // At start of cache. Prepend to it. + populatePreceding(); + } else { + // Cache already holds the next boundary + fBufIdx = modChunkSize(fBufIdx - 1); + fTextIdx = fBoundaries[fBufIdx]; + } + fDone = (fBufIdx == initialBufIdx); + fPosition = fTextIdx; + fRuleStatusIndex = fStatuses[fBufIdx]; + return; + }; + + // Move the iteration state to the position following the startPosition. + // Input position must be pinned to the input length. + void following(int startPos) { + if (startPos == fTextIdx || seek(startPos) || populateNear(startPos)) { + // startPos is in the cache. Do a next() from that position. + // TODO: an awkward set of interactions with bi->fDone + // seek() does not clear it; it can't because of interactions with populateNear(). + // next() does not clear it in the fast-path case, where everything matters. Maybe it should. + // So clear it here, for the case where seek() succeeded on an iterator that had previously run off the end. + fDone = false; + next(); + } + + }; + + void preceding(int startPos) { + if (startPos == fTextIdx || seek(startPos) || populateNear(startPos)) { + if (startPos == fTextIdx) { + previous(); + } else { + // seek() leaves the BreakCache positioned at the preceding boundary + // if the requested position is between two bounaries. 
+ // current() pushes the BreakCache position out to the BreakIterator itself. + assert(startPos > fTextIdx); + current(); + } + } + return; + }; + + /* + * Update the state of the public BreakIterator (fBI) to reflect the + * current state of the break iterator cache (this). + */ + int current() { + fPosition = fTextIdx; + fRuleStatusIndex = fStatuses[fBufIdx]; + fDone = false; + return fTextIdx; + }; + + /** + * Add boundaries to the cache near the specified position. + * The given position need not be a boundary itself. + * The input position must be within the range of the text, and + * on a code point boundary. + * If the requested position is a break boundary, leave the iteration + * position on it. + * If the requested position is not a boundary, leave the iteration + * position on the preceding boundary and include both the the + * preceding and following boundaries in the cache. + * Additional boundaries, either preceding or following, may be added + * to the cache as a side effect. + * + * Return false if the operation failed. + */ + boolean populateNear(int position) { + assert(position < fBoundaries[fStartBufIdx] || position > fBoundaries[fEndBufIdx]); + + // Find a boundary somewhere in the vicinity of the requested position. + // Depending on the safe rules and the text data, it could be either before, at, or after + // the requested position. + + + // If the requested position is not near already cached positions, clear the existing cache, + // find a near-by boundary and begin new cache contents there. + + if ((position < fBoundaries[fStartBufIdx] - 15) || position > (fBoundaries[fEndBufIdx] + 15)) { + int aBoundary = fText.getBeginIndex(); + int ruleStatusIndex = 0; + // TODO: check for position == length of text. Although may still need to back up to get rule status. 
+ if (position > aBoundary + 20) { + int backupPos = handlePrevious(position); + fPosition = backupPos; + aBoundary = handleNext(); // Ignore dictionary, just finding a rule based boundary. + ruleStatusIndex = fRuleStatusIndex; + } + reset(aBoundary, ruleStatusIndex); // Reset cache to hold aBoundary as a single starting point. + } + + // Fill in boundaries between existing cache content and the new requested position. + + if (fBoundaries[fEndBufIdx] < position) { + // The last position in the cache precedes the requested position. + // Add following position(s) to the cache. + while (fBoundaries[fEndBufIdx] < position) { + if (!populateFollowing()) { + assert false; + return false; + } + } + fBufIdx = fEndBufIdx; // Set iterator position to the end of the buffer. + fTextIdx = fBoundaries[fBufIdx]; // Required because populateFollowing may add extra boundaries. + while (fTextIdx > position) { // Move backwards to a position at or preceding the requested pos. + previous(); + } + return true; + } + + if (fBoundaries[fStartBufIdx] > position) { + // The first position in the cache is beyond the requested position. + // back up more until we get a boundary <= the requested position. + while (fBoundaries[fStartBufIdx] > position) { + populatePreceding(); + } + fBufIdx = fStartBufIdx; // Set iterator position to the start of the buffer. + fTextIdx = fBoundaries[fBufIdx]; // Required because populatePreceding may add extra boundaries. + while (fTextIdx < position) { // Move forwards to a position at or following the requested pos. + next(); + } + if (fTextIdx > position) { + // If position is not itself a boundary, the next() loop above will overshoot. + // Back up one, leaving cache position at the boundary preceding the requested position. + previous(); + } + return true; + } + + assert fTextIdx == position; + return true; + + }; + + /** + * Add boundary(s) to the cache following the current last boundary. 
+ * Return false if at the end of the text, and no more boundaries can be added. + * Leave iteration position at the first newly added boundary, or unchanged if no boundary was added. + */ + boolean populateFollowing() { + int fromPosition = fBoundaries[fEndBufIdx]; + int fromRuleStatusIdx = fStatuses[fEndBufIdx]; + int pos = 0; + int ruleStatusIdx = 0; + + if (fDictionaryCache.following(fromPosition)) { + addFollowing(fDictionaryCache.fBoundary, fDictionaryCache.fStatusIndex, UpdateCachePosition); + return true; + } + + fPosition = fromPosition; + pos = handleNext(); + if (pos == BreakIterator.DONE) { + return false; + } + + ruleStatusIdx = fRuleStatusIndex; + if (fDictionaryCharCount > 0) { + // The text segment obtained from the rules includes dictionary characters. + // Subdivide it, with subdivided results going into the dictionary cache. + fDictionaryCache.populateDictionary(fromPosition, pos, fromRuleStatusIdx, ruleStatusIdx); + if (fDictionaryCache.following(fromPosition)) { + addFollowing(fDictionaryCache.fBoundary, fDictionaryCache.fStatusIndex, UpdateCachePosition); + return true; + // TODO: may want to move a sizable chunk of the dictionary cache to the break cache at this point. + // But be careful with interactions with populateNear(). + } + } + + // Rule based segment did not include dictionary characters. + // Or, it did contain dictionary chars, but the dictionary segmenter didn't handle them, + // meaning that we didn't take the return, above. + // Add its end point to the cache. + addFollowing(pos, ruleStatusIdx, UpdateCachePosition); + + // Add several non-dictionary boundaries at this point, to optimize straight forward iteration. + // (subsequent calls to BreakIterator::next() will take the fast path, getting cached results. 
+ // + for (int count=0; count<6; ++count) { + pos = handleNext(); + if (pos == BreakIterator.DONE || fDictionaryCharCount > 0) { + break; + } + addFollowing(pos, fRuleStatusIndex, RetainCachePosition); + } + return true; + }; + + /** + * Add one or more boundaries to the cache preceding the first currently cached boundary. + * Leave the iteration position on the first added boundary. + * Return false if no boundaries could be added (if at the start of the text.) + */ + boolean populatePreceding() { + int textBegin = fText.getBeginIndex(); + int fromPosition = fBoundaries[fStartBufIdx]; + if (fromPosition == textBegin) { + return false; + } + + int position = textBegin; + int positionStatusIdx = 0; + + if (fDictionaryCache.preceding(fromPosition)) { + addPreceding(fDictionaryCache.fBoundary, fDictionaryCache.fStatusIndex, UpdateCachePosition); + return true; + } + + int backupPosition = fromPosition; + + // Find a boundary somewhere preceding the first already-cached boundary + do { + backupPosition = backupPosition - 30; + if (backupPosition <= textBegin) { + backupPosition = textBegin; + } else { + backupPosition = handlePrevious(backupPosition); + } + if (backupPosition == BreakIterator.DONE || backupPosition == textBegin) { + position = textBegin; + positionStatusIdx = 0; + } else { + fPosition = backupPosition; // TODO: pass starting position in a clearer way. + position = handleNext(); + positionStatusIdx = fRuleStatusIndex; + + } + } while (position >= fromPosition); + + // Find boundaries between the one we just located and the first already-cached boundary + // Put them in a side buffer, because we don't yet know where they will fall in the circular cache buffer.. 
+ + fSideBuffer.removeAllElements(); + fSideBuffer.push(position); + fSideBuffer.push(positionStatusIdx); + + do { + int prevPosition = fPosition = position; + int prevStatusIdx = positionStatusIdx; + position = handleNext(); + positionStatusIdx = fRuleStatusIndex; + if (position == BreakIterator.DONE) { + break; + } + + boolean segmentHandledByDictionary = false; + if (fDictionaryCharCount != 0) { + // Segment from the rules includes dictionary characters. + // Subdivide it, with subdivided results going into the dictionary cache. + int dictSegEndPosition = position; + fDictionaryCache.populateDictionary(prevPosition, dictSegEndPosition, prevStatusIdx, positionStatusIdx); + while (fDictionaryCache.following(prevPosition)) { + position = fDictionaryCache.fBoundary; + positionStatusIdx = fDictionaryCache.fStatusIndex; + segmentHandledByDictionary = true; + assert(position > prevPosition); + if (position >= fromPosition) { + break; + } + assert(position <= dictSegEndPosition); + fSideBuffer.push(position); + fSideBuffer.push(positionStatusIdx); + prevPosition = position; + } + assert(position==dictSegEndPosition || position>=fromPosition); + } + + if (!segmentHandledByDictionary && position < fromPosition) { + fSideBuffer.push(position); + fSideBuffer.push(positionStatusIdx); + } + } while (position < fromPosition); + + // Move boundaries from the side buffer to the main circular buffer. + boolean success = false; + if (!fSideBuffer.isEmpty()) { + positionStatusIdx = fSideBuffer.pop(); + position = fSideBuffer.pop(); + addPreceding(position, positionStatusIdx, UpdateCachePosition); + success = true; + } + + while (!fSideBuffer.isEmpty()) { + positionStatusIdx = fSideBuffer.pop(); + position = fSideBuffer.pop(); + if (!addPreceding(position, positionStatusIdx, RetainCachePosition)) { + // No space in circular buffer to hold a new preceding result while + // also retaining the current cache (iteration) position. 
+ // Bailing out is safe; the cache will refill again if needed. + break; + } + } + return success; + }; + + + static final boolean RetainCachePosition = false; + static final boolean UpdateCachePosition = true; + + /* + * Add the boundary following the current position. + * The current position can be left as it was, or changed to the newly added boundary, + * as specified by the update parameter. + */ + void addFollowing(int position, int ruleStatusIdx, boolean update) { + assert(position > fBoundaries[fEndBufIdx]); + assert(ruleStatusIdx <= Short.MAX_VALUE); + int nextIdx = modChunkSize(fEndBufIdx + 1); + if (nextIdx == fStartBufIdx) { + fStartBufIdx = modChunkSize(fStartBufIdx + 6); // TODO: experiment. Probably revert to 1. + } + fBoundaries[nextIdx] = position; + fStatuses[nextIdx] = (short)ruleStatusIdx; + fEndBufIdx = nextIdx; + if (update == UpdateCachePosition) { + // Set current position to the newly added boundary. + fBufIdx = nextIdx; + fTextIdx = position; + } else { + // Retaining the original cache position. + // Check if the added boundary wraps around the buffer, and would over-write the original position. + // It's the responsibility of callers of this function to not add too many. + assert(nextIdx != fBufIdx); + } + + }; + + + /* + * Add the boundary preceding the current position. + * The current position can be left as it was, or changed to the newly added boundary, + * as specified by the update parameter. + */ + boolean addPreceding(int position, int ruleStatusIdx, boolean update) { + assert(position < fBoundaries[fStartBufIdx]); + assert(ruleStatusIdx <= Short.MAX_VALUE); + int nextIdx = modChunkSize(fStartBufIdx - 1); + if (nextIdx == fEndBufIdx) { + if (fBufIdx == fEndBufIdx && update == RetainCachePosition) { + // Failure. The insertion of the new boundary would claim the buffer position that is the + // current iteration position. And we also want to retain the current iteration position. 
+ // (The buffer is already completely full of entries that precede the iteration position.) + return false; + } + fEndBufIdx = modChunkSize(fEndBufIdx - 1); + } + fBoundaries[nextIdx] = position; + fStatuses[nextIdx] = (short)ruleStatusIdx; + fStartBufIdx = nextIdx; + if (update == UpdateCachePosition) { + fBufIdx = nextIdx; + fTextIdx = position; + } + return true; + }; + + /** + * Set the cache position to the specified position, or, if the position + * falls between two cached boundaries, to the preceding boundary. + * Fails if the requested position is outside of the range of boundaries currently held by the cache. + * The requested position must be on a code point boundary. + * + * Return true if successful, false if the specified position is after + * the last cached boundary or before the first. + */ + boolean seek(int pos) { + if (pos < fBoundaries[fStartBufIdx] || pos > fBoundaries[fEndBufIdx]) { + return false; + } + if (pos == fBoundaries[fStartBufIdx]) { + // Common case: seek(0), from BreakIterator::first() + fBufIdx = fStartBufIdx; + fTextIdx = fBoundaries[fBufIdx]; + return true; + } + if (pos == fBoundaries[fEndBufIdx]) { + fBufIdx = fEndBufIdx; + fTextIdx = fBoundaries[fBufIdx]; + return true; + } + + int min = fStartBufIdx; + int max = fEndBufIdx; + while (min != max) { + int probe = (min + max + (min>max ? CACHE_SIZE : 0)) / 2; + probe = modChunkSize(probe); + if (fBoundaries[probe] > pos) { + max = probe; + } else { + min = modChunkSize(probe + 1); + } + } + assert(fBoundaries[max] > pos); + fBufIdx = modChunkSize(max - 1); + fTextIdx = fBoundaries[fBufIdx]; + assert(fTextIdx <= pos); + return true; + + }; + + + /** + * copy constructor, used from RuleBasedBreakIterator.clone(). 
+ * + * @param src + */ + BreakCache(BreakCache src) { + fStartBufIdx = src.fStartBufIdx; + fEndBufIdx = src.fEndBufIdx; + fTextIdx = src.fTextIdx; + fBufIdx = src.fBufIdx; + fBoundaries = src.fBoundaries.clone(); + fStatuses = src.fStatuses.clone(); + fSideBuffer = new DictionaryBreakEngine.DequeI(); // Transient, no need to clone contents. + } + + void dumpCache() { + System.out.printf("fTextIdx:%d fBufIdx:%d%n", fTextIdx, fBufIdx); + for (int i=fStartBufIdx; ; i=modChunkSize(i+1)) { + System.out.printf("%d %d%n", i, fBoundaries[i]); + if (i == fEndBufIdx) { + break; + } + } + }; + + private final int modChunkSize(int index) { return index & (CACHE_SIZE - 1); }; + + static final int CACHE_SIZE = 128; + // static_assert((CACHE_SIZE & (CACHE_SIZE-1)) == 0, "CACHE_SIZE must be power of two."); + + int fStartBufIdx; + int fEndBufIdx; // inclusive + + int fTextIdx; + int fBufIdx; + + int[] fBoundaries = new int[CACHE_SIZE]; + short[] fStatuses = new short[CACHE_SIZE]; + + DictionaryBreakEngine.DequeI fSideBuffer = new DictionaryBreakEngine.DequeI(); +}; + + + + } diff --git a/jre_emul/android/platform/external/icu/android_icu4j/src/main/java/android/icu/text/UnhandledBreakEngine.java b/jre_emul/android/platform/external/icu/android_icu4j/src/main/java/android/icu/text/UnhandledBreakEngine.java index 706cb014d2..a914bb57dc 100644 --- a/jre_emul/android/platform/external/icu/android_icu4j/src/main/java/android/icu/text/UnhandledBreakEngine.java +++ b/jre_emul/android/platform/external/icu/android_icu4j/src/main/java/android/icu/text/UnhandledBreakEngine.java @@ -12,6 +12,7 @@ import static android.icu.impl.CharacterIteration.DONE32; import java.text.CharacterIterator; +import java.util.concurrent.atomic.AtomicReferenceArray; import android.icu.impl.CharacterIteration; import android.icu.lang.UCharacter; @@ -20,42 +21,63 @@ final class UnhandledBreakEngine implements LanguageBreakEngine { // TODO: Use two arrays of UnicodeSet, one with all frozen sets, one with unfrozen. 
// in handleChar(), update the unfrozen version, clone, freeze, replace the frozen one. - private final UnicodeSet[] fHandled = new UnicodeSet[BreakIterator.KIND_TITLE + 1]; + + // Note on concurrency: A single instance of UnhandledBreakEngine is shared across all + // RuleBasedBreakIterators in a process. They may make arbitrary concurrent calls. + // If handleChar() is updating the set of unhandled characters at the same time + // findBreaks() or handles() is referencing it, the referencing functions must see + // a consistent set. It doesn't matter whether they see it before or after the update, + // but they should not see an inconsistent, changing set. + // + // To do this, an update is made by cloning the old set, updating the clone, then + // replacing the old with the new. Once made visible, each set remains constant. + + // TODO: it's odd that findBreaks() can produce different results, depending + // on which scripts have been previously seen by handleChar(). (This is not a + // threading specific issue). Possibly stop on script boundaries? 
+ + final AtomicReferenceArray fHandled = new AtomicReferenceArray(BreakIterator.KIND_TITLE + 1); public UnhandledBreakEngine() { - for (int i = 0; i < fHandled.length; i++) { - fHandled[i] = new UnicodeSet(); + for (int i = 0; i < fHandled.length(); i++) { + fHandled.set(i, new UnicodeSet()); } } - + + @Override public boolean handles(int c, int breakType) { - return (breakType >= 0 && breakType < fHandled.length) && - (fHandled[breakType].contains(c)); + return (breakType >= 0 && breakType < fHandled.length()) && + (fHandled.get(breakType).contains(c)); } + @Override public int findBreaks(CharacterIterator text, int startPos, int endPos, - boolean reverse, int breakType, DictionaryBreakEngine.DequeI foundBreaks) { - if (breakType >= 0 && breakType < fHandled.length) { - int c = CharacterIteration.current32(text); - if (reverse) { - while (text.getIndex() > startPos && fHandled[breakType].contains(c)) { - CharacterIteration.previous32(text); - c = CharacterIteration.current32(text); - } - } else { - while (text.getIndex() < endPos && fHandled[breakType].contains(c)) { - CharacterIteration.next32(text); - c = CharacterIteration.current32(text); - } - } - } + int breakType, DictionaryBreakEngine.DequeI foundBreaks) { + if (breakType >= 0 && breakType < fHandled.length()) { + UnicodeSet uniset = fHandled.get(breakType); + int c = CharacterIteration.current32(text); + while (text.getIndex() < endPos && uniset.contains(c)) { + CharacterIteration.next32(text); + c = CharacterIteration.current32(text); + } + } return 0; } - public synchronized void handleChar(int c, int breakType) { - if (breakType >= 0 && breakType < fHandled.length && c != DONE32) { - if (!fHandled[breakType].contains(c)) { + /** + * Update the set of unhandled characters for the specified breakType to include + * all that have the same script as c. + * May be called concurrently with handles() or findBreaks(). + * Must not be called concurrently with itself. 
+ */ + public void handleChar(int c, int breakType) { + if (breakType >= 0 && breakType < fHandled.length() && c != DONE32) { + UnicodeSet originalSet = fHandled.get(breakType); + if (!originalSet.contains(c)) { int script = UCharacter.getIntPropertyValue(c, UProperty.SCRIPT); - fHandled[breakType].applyIntPropertyValue(UProperty.SCRIPT, script); + UnicodeSet newSet = new UnicodeSet(); + newSet.applyIntPropertyValue(UProperty.SCRIPT, script); + newSet.addAll(originalSet); + fHandled.set(breakType, newSet); } } } diff --git a/jre_emul/android/platform/libcore/harmony-tests/src/test/java/org/apache/harmony/tests/java/text/BreakIteratorTest.java b/jre_emul/android/platform/libcore/harmony-tests/src/test/java/org/apache/harmony/tests/java/text/BreakIteratorTest.java new file mode 100644 index 0000000000..1210e4ee93 --- /dev/null +++ b/jre_emul/android/platform/libcore/harmony-tests/src/test/java/org/apache/harmony/tests/java/text/BreakIteratorTest.java @@ -0,0 +1,318 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. 
+ */ + +package org.apache.harmony.tests.java.text; + +import java.text.BreakIterator; +import java.text.CharacterIterator; +import java.text.StringCharacterIterator; +import java.util.Locale; + +import junit.framework.TestCase; + +public class BreakIteratorTest extends TestCase { + + private static final String TEXT = "a\u0308abc def, gh-12i?jkl.mno?"; + + BreakIterator iterator; + + /* + * @see TestCase#setUp() + */ + protected void setUp() throws Exception { + super.setUp(); + iterator = BreakIterator.getCharacterInstance(Locale.US); + } + + public void testConsts() { + assertEquals(-1, BreakIterator.DONE); + } + + public void testCache() { + BreakIterator newOne = BreakIterator.getCharacterInstance(Locale.US); + assertNotSame(newOne, iterator); + assertEquals(newOne, iterator); + + newOne = BreakIterator.getCharacterInstance(); + assertEquals(newOne, iterator); + + newOne = BreakIterator.getCharacterInstance(Locale.CHINA); + assertEquals(newOne, iterator); + + BreakIterator wordIterator = BreakIterator.getWordInstance(); + assertFalse(wordIterator.equals(iterator)); + + BreakIterator lineIterator = BreakIterator.getLineInstance(); + assertFalse(lineIterator.equals(iterator)); + + BreakIterator senteIterator = BreakIterator.getSentenceInstance(); + assertFalse(senteIterator.equals(iterator)); + } + + public void testClone() { + BreakIterator cloned = (BreakIterator) iterator.clone(); + assertNotSame(cloned, iterator); + assertEquals(cloned, iterator); + } + + public void testCurrent() { + assertEquals(0, iterator.current()); + iterator.setText(TEXT); + assertEquals(iterator.first(), iterator.current()); + } + + public void testFirst() { + assertEquals(0, iterator.first()); + iterator.setText(TEXT); + assertEquals(0, iterator.first()); + } + + public void testFollowing() { + try { + iterator.following(1); + fail("should throw illegal argument exception"); + } catch (IllegalArgumentException e) { + } + iterator.setText(TEXT); + assertEquals(2, 
iterator.following(1)); + try { + assertEquals(0, iterator.following(-1)); + fail("should throw illegal argument exception"); + } catch (IllegalArgumentException e) { + } + assertEquals(BreakIterator.DONE, iterator.following(TEXT.length())); + } + + public void testIsBoundary() { + try { + iterator.isBoundary(2); + fail("should throw illegal argument exception"); + } catch (IllegalArgumentException e) { + } + iterator.setText(TEXT); + assertTrue(iterator.isBoundary(2)); + assertFalse(iterator.isBoundary(1)); + assertTrue(iterator.isBoundary(0)); + try { + iterator.isBoundary(-1); + fail("should throw illegal argument exception"); + } catch (IllegalArgumentException e) { + } + assertTrue(iterator.isBoundary(TEXT.length())); + } + + public void testLast() { + assertEquals(0, iterator.last()); + iterator.setText(TEXT); + assertEquals(TEXT.length(), iterator.last()); + } + + /* + * Class under test for int next(int) + */ + public void testNextint() { + assertEquals(BreakIterator.DONE, iterator.next(3)); + iterator.setText(TEXT); + assertEquals(4, iterator.next(3)); + assertEquals(24, iterator.next(20)); + assertEquals(23, iterator.next(-1)); + assertEquals(-1, iterator.next(TEXT.length())); + } + + public void testPreceding() { + try { + iterator.preceding(2); + fail("should throw illegal argument exception"); + } catch (IllegalArgumentException e) { + } + iterator.setText(TEXT); + assertEquals(0, iterator.preceding(2)); + assertEquals(2, iterator.preceding(3)); + assertEquals(16, iterator.preceding(17)); + assertEquals(17, iterator.preceding(18)); + assertEquals(18, iterator.preceding(19)); + try { + iterator.preceding(-1); + fail("should throw illegal argument exception"); + } catch (IllegalArgumentException e) { + } + assertEquals(TEXT.length() - 1, iterator.preceding(TEXT.length())); + assertEquals(BreakIterator.DONE, iterator.preceding(0)); + } + + public void testPrevious() { + assertEquals(-1, iterator.previous()); + iterator.setText(TEXT); + assertEquals(-1, 
iterator.previous()); + iterator.last(); + assertEquals(TEXT.length() - 1, iterator.previous()); + } + + /* TODO(b/34460433): enable. + public void testGetAvailableLocales() { + Locale[] locales = BreakIterator.getAvailableLocales(); + assertTrue(locales.length > 0); + } */ + + /* + * Class under test for BreakIterator getCharacterInstance() + */ + public void testGetCharacterInstance() { + BreakIterator.getCharacterInstance(); + } + + /* + * Class under test for BreakIterator getCharacterInstance(Locale) + */ + public void testGetCharacterInstanceLocale() { + BreakIterator it = BreakIterator.getCharacterInstance(Locale.US); + BreakIterator it2 = BreakIterator.getCharacterInstance(Locale.CHINA); + assertEquals(it, it2); + } + + /* + * Class under test for BreakIterator getLineInstance() + */ + public void testGetLineInstance() { + BreakIterator it = BreakIterator.getLineInstance(); + assertNotNull(it); + } + + /* + * Class under test for BreakIterator getLineInstance(Locale) + */ + public void testGetLineInstanceLocale() { + BreakIterator it = BreakIterator.getLineInstance(Locale.US); + assertNotNull(it); + BreakIterator.getLineInstance(new Locale("bad locale")); + } + + /* + * Class under test for BreakIterator getSentenceInstance() + */ + public void testGetSentenceInstance() { + BreakIterator it = BreakIterator.getSentenceInstance(); + assertNotNull(it); + } + + /* + * Class under test for BreakIterator getSentenceInstance(Locale) + */ + public void testGetSentenceInstanceLocale() { + BreakIterator it = BreakIterator.getSentenceInstance(Locale.US); + assertNotNull(it); + } + + public void testGetText() { + assertEquals(new StringCharacterIterator(""), iterator.getText()); + iterator.setText(TEXT); + assertEquals(new StringCharacterIterator(TEXT), iterator.getText()); + } + + /* + * Class under test for BreakIterator getWordInstance() + */ + public void testGetWordInstance() { + BreakIterator it = BreakIterator.getWordInstance(); + assertNotNull(it); + } + + /* + 
* Class under test for BreakIterator getWordInstance(Locale) + */ + public void testGetWordInstanceLocale() { + BreakIterator it = BreakIterator.getWordInstance(Locale.US); + assertNotNull(it); + } + + /* + * Class under test for void setText(CharacterIterator) + */ + public void testSetTextCharacterIterator() { + try { + iterator.setText((CharacterIterator) null); + fail(); + } catch (NullPointerException e) { + } + CharacterIterator it = new StringCharacterIterator("abc"); + iterator.setText(it); + assertSame(it, iterator.getText()); + } + + /* + * Class under test for void setText(String) + */ + public void testSetTextString() { + try { + iterator.setText((String) null); + fail(); + } catch (NullPointerException e) { + } + iterator.setText("abc"); + CharacterIterator it = new StringCharacterIterator("abc"); + assertEquals(it, iterator.getText()); + } + + public void test_next() { + // Regression test for HARMONY-30 + BreakIterator bi = BreakIterator.getWordInstance(Locale.US); + bi.setText("This is the test, WordInstance"); + int n = bi.first(); + n = bi.next(); + assertEquals("Assert 0: next() returns incorrect value ", 4, n); + + assertEquals(BreakIterator.DONE, iterator.next()); + iterator.setText(TEXT); + assertEquals(2, iterator.next()); + } + + /** + * @tests java.text.BreakIterator#getCharacterInstance(Locale) + */ + public void testGetCharacterInstanceLocale_NPE() { + // Regression for HARMONY-265 + try { + BreakIterator.getCharacterInstance(null); + fail("BreakIterator.getCharacterInstance(null); should throw NullPointerException"); + } catch (NullPointerException e) { + } + } + + public void testGetLineInstanceLocale_NPE() { + try { + BreakIterator.getLineInstance(null); + fail("BreakIterator.getLineInstance(null); should throw NullPointerException"); + } catch (NullPointerException e) { + } + } + + public void testGetSentenceInstanceLocale_NPE() { + try { + BreakIterator.getSentenceInstance(null); + fail("BreakIterator.getSentenceInstance(null); 
should throw NullPointerException"); + } catch (NullPointerException e) { + } + } + + public void testGetWordInstanceLocale_NPE() { + try { + BreakIterator.getWordInstance(null); + fail("BreakIterator.getWordInstance(null); should throw NullPointerException"); + } catch (NullPointerException e) { + } + } +} diff --git a/jre_emul/android/platform/libcore/luni/src/test/java/libcore/java/text/BreakIteratorTest.java b/jre_emul/android/platform/libcore/luni/src/test/java/libcore/java/text/BreakIteratorTest.java new file mode 100644 index 0000000000..2df300a052 --- /dev/null +++ b/jre_emul/android/platform/libcore/luni/src/test/java/libcore/java/text/BreakIteratorTest.java @@ -0,0 +1,176 @@ +/* + * Copyright (C) 2010 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package libcore.java.text; + +import java.text.BreakIterator; +import java.util.ArrayList; +import java.util.Locale; + +public class BreakIteratorTest extends junit.framework.TestCase { + BreakIterator iterator; + + @Override + protected void setUp() throws Exception { + super.setUp(); + iterator = BreakIterator.getCharacterInstance(Locale.US); + } + + /* TODO(b/34460433): enable. 
+ public void testGetAvailableLocales() { + Locale[] locales = BreakIterator.getAvailableLocales(); + assertTrue("Array available locales is null", locales != null); + assertTrue("Array available locales is 0-length", + (locales != null && locales.length != 0)); + boolean found = false; + for (Locale l : locales) { + if (l.equals(Locale.US)) { + // expected + found = true; + } + } + assertTrue("At least locale " + Locale.US + " must be presented", found); + } */ + + public void testGetWordInstanceLocale() { + BreakIterator it1 = BreakIterator.getWordInstance(Locale.CANADA_FRENCH); + assertTrue("Incorrect BreakIterator", it1 != BreakIterator.getWordInstance()); + BreakIterator it2 = BreakIterator.getWordInstance(new Locale("bad locale")); + assertTrue("Incorrect BreakIterator", it2 != BreakIterator.getWordInstance()); + } + + // http://b/7307154 - we used to pin an unbounded number of char[]s, relying on finalization. + public void testStress() throws Exception { + char[] cs = { 'a' }; + for (int i = 0; i < 4096; ++i) { + BreakIterator it = BreakIterator.getWordInstance(Locale.US); + it.setText(new String(cs)); + } + } + + public void testWordBoundaries() { + StringBuilder sb = new StringBuilder(); + for (int i = 0; i < 1024; ++i) { + if (i > 0) { + sb.append(' '); + } + sb.append("12345"); + } + String s = sb.toString(); + + BreakIterator it = BreakIterator.getWordInstance(Locale.US); + it.setText(s); + + // Check we're not leaking global references. 2048 would bust the VM's hard-coded limit. + for (int i = 0; i < 2048; ++i) { + it.setText(s); + } + + BreakIterator clone = (BreakIterator) it.clone(); + + assertExpectedWordBoundaries(it, s); + assertExpectedWordBoundaries(clone, s); + } + + private void assertExpectedWordBoundaries(BreakIterator it, String s) { + int expectedPos = 0; + int pos = it.first(); + assertEquals(expectedPos, pos); + while (pos != BreakIterator.DONE) { + expectedPos += 5; // The five characters until the end of this word. 
+ pos = it.next(); + assertEquals(expectedPos, pos); + + expectedPos += 1; // The space before the start of the next word... + if (expectedPos > s.length()) { + expectedPos = BreakIterator.DONE; // ...unless we're done. + } + pos = it.next(); + assertEquals(expectedPos, pos); + } + } + + public void testIsBoundary() { + BreakIterator it = BreakIterator.getCharacterInstance(Locale.US); + it.setText("hello"); + + try { + it.isBoundary(-1); + fail(); + } catch (IllegalArgumentException expected) { + // Note that this exception is not listed in the Java API documentation + } + + assertTrue(it.isBoundary(0)); + assertTrue(it.isBoundary(1)); + assertTrue(it.isBoundary(4)); + assertTrue(it.isBoundary(5)); + + try { + it.isBoundary(6); + fail(); + } catch (IllegalArgumentException expected) { + // Note that this exception is not listed in the Java API documentation + } + } + + public void testFollowing() { + BreakIterator it = BreakIterator.getCharacterInstance(Locale.US); + it.setText("hello"); + + try { + it.following(-1); + fail(); + } catch (IllegalArgumentException expected) { + // Expected exception + } + + assertEquals(1, it.following(0)); + assertEquals(2, it.following(1)); + assertEquals(5, it.following(4)); + assertEquals(BreakIterator.DONE, it.following(5)); + + try { + it.following(6); + fail(); + } catch (IllegalArgumentException expected) { + // Expected exception + } + } + + public void testPreceding() { + BreakIterator it = BreakIterator.getCharacterInstance(Locale.US); + it.setText("hello"); + + try { + it.preceding(-1); + fail(); + } catch (IllegalArgumentException expected) { + // Expected exception + } + + assertEquals(BreakIterator.DONE, it.preceding(0)); + assertEquals(0, it.preceding(1)); + assertEquals(4, it.preceding(5)); + + try { + it.preceding(6); + fail(); + } catch (IllegalArgumentException expected) { + // Expected exception + } + } +} diff --git a/jre_emul/android/platform/libcore/ojluni/src/main/java/java/text/BreakIterator.java 
b/jre_emul/android/platform/libcore/ojluni/src/main/java/java/text/BreakIterator.java new file mode 100644 index 0000000000..a875ec1491 --- /dev/null +++ b/jre_emul/android/platform/libcore/ojluni/src/main/java/java/text/BreakIterator.java @@ -0,0 +1,554 @@ +/* + * Copyright (C) 2014 The Android Open Source Project + * Copyright (c) 1996, 2013, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. Oracle designates this + * particular file as subject to the "Classpath" exception as provided + * by Oracle in the LICENSE file that accompanied this code. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + */ + +/* + * (C) Copyright Taligent, Inc. 1996, 1997 - All Rights Reserved + * (C) Copyright IBM Corp. 1996 - 1998 - All Rights Reserved + * + * The original version of this source code and documentation + * is copyrighted and owned by Taligent, Inc., a wholly-owned + * subsidiary of IBM. These materials are provided under terms + * of a License Agreement between Taligent and Sun. 
This technology + * is protected by multiple US and International patents. + * + * This notice and attribution to Taligent may not be removed. + * Taligent is a registered trademark of Taligent, Inc. + * + */ + +package java.text; + +import java.util.Locale; + + +// Android-changed: Discourage modification on CharacterIterator after setText. http://b/80456574 +/** + * The BreakIterator class implements methods for finding + * the location of boundaries in text. Instances of BreakIterator + * maintain a current position and scan over text + * returning the index of characters where boundaries occur. + * Internally, BreakIterator scans text using a + * CharacterIterator, and is thus able to scan text held + * by any object implementing that protocol. A StringCharacterIterator + * is used to scan String objects passed to setText. + * The CharacterIterator object must not be modified after having been + * passed to setText. If the text in the CharacterIterator object + * is changed, the caller must reset BreakIterator by calling + * setText. + * + *

+ * You use the factory methods provided by this class to create + * instances of various types of break iterators. In particular, + * use getWordInstance, getLineInstance, + * getSentenceInstance, and getCharacterInstance + * to create BreakIterators that perform + * word, line, sentence, and character boundary analysis respectively. + * A single BreakIterator can work only on one unit + * (word, line, sentence, and so on). You must use a different iterator + * for each unit boundary analysis you wish to perform. + * + *

+ * Line boundary analysis determines where a text string can be + * broken when line-wrapping. The mechanism correctly handles + * punctuation and hyphenated words. Actual line breaking needs + * to also consider the available line width and is handled by + * higher-level software. + * + *

+ * Sentence boundary analysis allows selection with correct interpretation + * of periods within numbers and abbreviations, and trailing punctuation + * marks such as quotation marks and parentheses. + * + *

+ * Word boundary analysis is used by search and replace functions, as + * well as within text editing applications that allow the user to + * select words with a double click. Word selection provides correct + * interpretation of punctuation marks within and following + * words. Characters that are not part of a word, such as symbols + * or punctuation marks, have word-breaks on both sides. + * + *

+ * Character boundary analysis allows users to interact with characters + * as they expect to, for example, when moving the cursor through a text + * string. Character boundary analysis provides correct navigation + * through character strings, regardless of how the character is stored. + * The boundaries returned may be those of supplementary characters, + * combining character sequences, or ligature clusters. + * For example, an accented character might be stored as a base character + * and a diacritical mark. What users consider to be a character can + * differ between languages. + * + *

+ * The BreakIterator instances returned by the factory methods + * of this class are intended for use with natural languages only, not for + * programming language text. It is however possible to define subclasses + * that tokenize a programming language. + * + *

+ * Examples:

+ * Creating and using text boundaries: + *

+ *
+ * public static void main(String args[]) {
+ *      if (args.length == 1) {
+ *          String stringToExamine = args[0];
+ *          //print each word in order
+ *          BreakIterator boundary = BreakIterator.getWordInstance();
+ *          boundary.setText(stringToExamine);
+ *          printEachForward(boundary, stringToExamine);
+ *          //print each sentence in reverse order
+ *          boundary = BreakIterator.getSentenceInstance(Locale.US);
+ *          boundary.setText(stringToExamine);
+ *          printEachBackward(boundary, stringToExamine);
+ *          printFirst(boundary, stringToExamine);
+ *          printLast(boundary, stringToExamine);
+ *      }
+ * }
+ * 
+ *
+ * + * Print each element in order: + *
+ *
+ * public static void printEachForward(BreakIterator boundary, String source) {
+ *     int start = boundary.first();
+ *     for (int end = boundary.next();
+ *          end != BreakIterator.DONE;
+ *          start = end, end = boundary.next()) {
+ *          System.out.println(source.substring(start,end));
+ *     }
+ * }
+ * 
+ *
+ * + * Print each element in reverse order: + *
+ *
+ * public static void printEachBackward(BreakIterator boundary, String source) {
+ *     int end = boundary.last();
+ *     for (int start = boundary.previous();
+ *          start != BreakIterator.DONE;
+ *          end = start, start = boundary.previous()) {
+ *         System.out.println(source.substring(start,end));
+ *     }
+ * }
+ * 
+ *
+ * + * Print first element: + *
+ *
+ * public static void printFirst(BreakIterator boundary, String source) {
+ *     int start = boundary.first();
+ *     int end = boundary.next();
+ *     System.out.println(source.substring(start,end));
+ * }
+ * 
+ *
+ * + * Print last element: + *
+ *
+ * public static void printLast(BreakIterator boundary, String source) {
+ *     int end = boundary.last();
+ *     int start = boundary.previous();
+ *     System.out.println(source.substring(start,end));
+ * }
+ * 
+ *
+ * + * Print the element at a specified position: + *
+ *
+ * public static void printAt(BreakIterator boundary, int pos, String source) {
+ *     int end = boundary.following(pos);
+ *     int start = boundary.previous();
+ *     System.out.println(source.substring(start,end));
+ * }
+ * 
+ *
+ * + * Find the next word: + *
+ *
{@code
+ * public static int nextWordStartAfter(int pos, String text) {
+ *     BreakIterator wb = BreakIterator.getWordInstance();
+ *     wb.setText(text);
+ *     int last = wb.following(pos);
+ *     int current = wb.next();
+ *     while (current != BreakIterator.DONE) {
+ *         for (int p = last; p < current; p++) {
+ *             if (Character.isLetter(text.codePointAt(p)))
+ *                 return last;
+ *         }
+ *         last = current;
+ *         current = wb.next();
+ *     }
+ *     return BreakIterator.DONE;
+ * }
+ * }
+ * (The iterator returned by BreakIterator.getWordInstance() is unique in that + * the break positions it returns don't represent both the start and end of the + * thing being iterated over. That is, a sentence-break iterator returns breaks + * that each represent the end of one sentence and the beginning of the next. + * With the word-break iterator, the characters between two boundaries might be a + * word, or they might be the punctuation or whitespace between two words. The + * above code uses a simple heuristic to determine which boundary is the beginning + * of a word: If the characters between this boundary and the next boundary + * include at least one letter (this can be an alphabetical letter, a CJK ideograph, + * a Hangul syllable, a Kana character, etc.), then the text between this boundary + * and the next is a word; otherwise, it's the material between words.) + *
+ * + * @see CharacterIterator + * + */ + +public abstract class BreakIterator implements Cloneable +{ + /** + * Constructor. BreakIterator is stateless and has no default behavior. + */ + protected BreakIterator() + { + } + + /** + * Create a copy of this iterator + * @return A copy of this + */ + @Override + public Object clone() + { + try { + return super.clone(); + } + catch (CloneNotSupportedException e) { + throw new InternalError(e); + } + } + + /** + * DONE is returned by previous(), next(), next(int), preceding(int) + * and following(int) when either the first or last text boundary has been + * reached. + */ + public static final int DONE = -1; + + /** + * Returns the first boundary. The iterator's current position is set + * to the first text boundary. + * @return The character index of the first text boundary. + */ + public abstract int first(); + + /** + * Returns the last boundary. The iterator's current position is set + * to the last text boundary. + * @return The character index of the last text boundary. + */ + public abstract int last(); + + /** + * Returns the nth boundary from the current boundary. If either + * the first or last text boundary has been reached, it returns + * BreakIterator.DONE and the current position is set to either + * the first or last text boundary depending on which one is reached. Otherwise, + * the iterator's current position is set to the new boundary. + * For example, if the iterator's current position is the mth text boundary + * and three more boundaries exist from the current boundary to the last text + * boundary, the next(2) call will return m + 2. The new text position is set + * to the (m + 2)th text boundary. A next(4) call would return + * BreakIterator.DONE and the last text boundary would become the + * new text position. + * @param n which boundary to return. A value of 0 + * does nothing. Negative values move to previous boundaries + * and positive values move to later boundaries. 
+ * @return The character index of the nth boundary from the current position + * or BreakIterator.DONE if either first or last text boundary + * has been reached. + */ + public abstract int next(int n); + + /** + * Returns the boundary following the current boundary. If the current boundary + * is the last text boundary, it returns BreakIterator.DONE and + * the iterator's current position is unchanged. Otherwise, the iterator's + * current position is set to the boundary following the current boundary. + * @return The character index of the next text boundary or + * BreakIterator.DONE if the current boundary is the last text + * boundary. + * Equivalent to next(1). + * @see #next(int) + */ + public abstract int next(); + + /** + * Returns the boundary preceding the current boundary. If the current boundary + * is the first text boundary, it returns BreakIterator.DONE and + * the iterator's current position is unchanged. Otherwise, the iterator's + * current position is set to the boundary preceding the current boundary. + * @return The character index of the previous text boundary or + * BreakIterator.DONE if the current boundary is the first text + * boundary. + */ + public abstract int previous(); + + /** + * Returns the first boundary following the specified character offset. If the + * specified offset equals to the last text boundary, it returns + * BreakIterator.DONE and the iterator's current position is unchanged. + * Otherwise, the iterator's current position is set to the returned boundary. + * The value returned is always greater than the offset or the value + * BreakIterator.DONE. + * @param offset the character offset to begin scanning. + * @return The first boundary after the specified offset or + * BreakIterator.DONE if the last text boundary is passed in + * as the offset. + * @exception IllegalArgumentException if the specified offset is less than + * the first text boundary or greater than the last text boundary. 
+ */ + public abstract int following(int offset); + + /** + * Returns the last boundary preceding the specified character offset. If the + * specified offset equals to the first text boundary, it returns + * BreakIterator.DONE and the iterator's current position is unchanged. + * Otherwise, the iterator's current position is set to the returned boundary. + * The value returned is always less than the offset or the value + * BreakIterator.DONE. + * @param offset the character offset to begin scanning. + * @return The last boundary before the specified offset or + * BreakIterator.DONE if the first text boundary is passed in + * as the offset. + * @exception IllegalArgumentException if the specified offset is less than + * the first text boundary or greater than the last text boundary. + * @since 1.2 + */ + public int preceding(int offset) { + // NOTE: This implementation is here solely because we can't add new + // abstract methods to an existing class. There is almost ALWAYS a + // better, faster way to do this. + int pos = following(offset); + while (pos >= offset && pos != DONE) { + pos = previous(); + } + return pos; + } + + /** + * Returns true if the specified character offset is a text boundary. + * @param offset the character offset to check. + * @return true if "offset" is a boundary position, + * false otherwise. + * @exception IllegalArgumentException if the specified offset is less than + * the first text boundary or greater than the last text boundary. + * @since 1.2 + */ + public boolean isBoundary(int offset) { + // NOTE: This implementation probably is wrong for most situations + // because it fails to take into account the possibility that a + // CharacterIterator passed to setText() may not have a begin offset + // of 0. But since the abstract BreakIterator doesn't have that + // knowledge, it assumes the begin offset is 0. If you subclass + // BreakIterator, copy the SimpleTextBoundary implementation of this + // function into your subclass. 
[This should have been abstract at + // this level, but it's too late to fix that now.] + if (offset == 0) { + return true; + } + int boundary = following(offset - 1); + if (boundary == DONE) { + throw new IllegalArgumentException(); + } + return boundary == offset; + } + + /** + * Returns character index of the text boundary that was most + * recently returned by next(), next(int), previous(), first(), last(), + * following(int) or preceding(int). If any of these methods returns + * BreakIterator.DONE because either first or last text boundary + * has been reached, it returns the first or last text boundary depending on + * which one is reached. + * @return The text boundary returned from the above methods, first or last + * text boundary. + * @see #next() + * @see #next(int) + * @see #previous() + * @see #first() + * @see #last() + * @see #following(int) + * @see #preceding(int) + */ + public abstract int current(); + + /** + * Get the text being scanned + * @return the text being scanned + */ + public abstract CharacterIterator getText(); + + /** + * Set a new text string to be scanned. The current scan + * position is reset to first(). + * @param newText new text to scan. + */ + public void setText(String newText) + { + setText(new StringCharacterIterator(newText)); + } + + /** + * Set a new text for scanning. The current scan + * position is reset to first(). + * @param newText new text to scan. + */ + public abstract void setText(CharacterIterator newText); + + // Android-removed: Removed code related to BreakIteratorProvider support. + + /** + * Returns a new BreakIterator instance + * for word breaks + * for the {@linkplain Locale#getDefault() default locale}. + * @return A break iterator for word breaks + */ + public static BreakIterator getWordInstance() + { + return getWordInstance(Locale.getDefault()); + } + + /** + * Returns a new BreakIterator instance + * for word breaks + * for the given locale. 
+ * @param locale the desired locale + * @return A break iterator for word breaks + * @exception NullPointerException if locale is null + */ + public static BreakIterator getWordInstance(Locale locale) + { + // Android-changed: Switched to ICU. + return new IcuIteratorWrapper( + android.icu.text.BreakIterator.getWordInstance(locale)); + } + + /** + * Returns a new BreakIterator instance + * for line breaks + * for the {@linkplain Locale#getDefault() default locale}. + * @return A break iterator for line breaks + */ + public static BreakIterator getLineInstance() + { + return getLineInstance(Locale.getDefault()); + } + + /** + * Returns a new BreakIterator instance + * for line breaks + * for the given locale. + * @param locale the desired locale + * @return A break iterator for line breaks + * @exception NullPointerException if locale is null + */ + public static BreakIterator getLineInstance(Locale locale) + { + // Android-changed: Switched to ICU. + return new IcuIteratorWrapper( + android.icu.text.BreakIterator.getLineInstance(locale)); + } + + /** + * Returns a new BreakIterator instance + * for character breaks + * for the {@linkplain Locale#getDefault() default locale}. + * @return A break iterator for character breaks + */ + public static BreakIterator getCharacterInstance() + { + return getCharacterInstance(Locale.getDefault()); + } + + /** + * Returns a new BreakIterator instance + * for character breaks + * for the given locale. + * @param locale the desired locale + * @return A break iterator for character breaks + * @exception NullPointerException if locale is null + */ + public static BreakIterator getCharacterInstance(Locale locale) + { + // Android-changed: Switched to ICU. + return new IcuIteratorWrapper( + android.icu.text.BreakIterator.getCharacterInstance(locale)); + } + + /** + * Returns a new BreakIterator instance + * for sentence breaks + * for the {@linkplain Locale#getDefault() default locale}. 
+ * @return A break iterator for sentence breaks + */ + public static BreakIterator getSentenceInstance() + { + return getSentenceInstance(Locale.getDefault()); + } + + /** + * Returns a new BreakIterator instance + * for sentence breaks + * for the given locale. + * @param locale the desired locale + * @return A break iterator for sentence breaks + * @exception NullPointerException if locale is null + */ + public static BreakIterator getSentenceInstance(Locale locale) + { + // Android-changed: Switched to ICU. + return new IcuIteratorWrapper( + android.icu.text.BreakIterator.getSentenceInstance(locale)); + } + + // Android-removed: Removed code related to BreakIteratorProvider support. + + // Android-changed: Removed references to BreakIteratorProvider from JavaDoc. + /** + * Returns an array of all locales for which the + * get*Instance methods of this class can return + * localized instances. + * + * @return An array of locales for which localized + * BreakIterator instances are available. + */ + public static synchronized Locale[] getAvailableLocales() + { + // Android-changed: Switched to ICU. + return android.icu.text.BreakIterator.getAvailableLocales(); + } +} diff --git a/jre_emul/android/platform/libcore/ojluni/src/main/java/java/text/IcuIteratorWrapper.java b/jre_emul/android/platform/libcore/ojluni/src/main/java/java/text/IcuIteratorWrapper.java new file mode 100644 index 0000000000..024d6d1f10 --- /dev/null +++ b/jre_emul/android/platform/libcore/ojluni/src/main/java/java/text/IcuIteratorWrapper.java @@ -0,0 +1,404 @@ +/* + * Copyright (c) 1999, 2010, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. 
Oracle designates this + * particular file as subject to the "Classpath" exception as provided + * by Oracle in the LICENSE file that accompanied this code. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + */ + +/* + * + * (C) Copyright Taligent, Inc. 1996, 1997 - All Rights Reserved + * (C) Copyright IBM Corp. 1996 - 2002 - All Rights Reserved + * + * The original version of this source code and documentation + * is copyrighted and owned by Taligent, Inc., a wholly-owned + * subsidiary of IBM. These materials are provided under terms + * of a License Agreement between Taligent and Sun. This technology + * is protected by multiple US and International patents. + * + * This notice and attribution to Taligent may not be removed. + * Taligent is a registered trademark of Taligent, Inc. + */ + + +package java.text; + +/** + *

A subclass of BreakIterator whose behavior is specified using a list of rules.

+ * + *

There are two kinds of rules, which are separated by semicolons: substitutions + * and regular expressions.

+ * + *

A substitution rule defines a name that can be used in place of an expression. It + * consists of a name, which is a string of characters contained in angle brackets, an equals + * sign, and an expression. (There can be no whitespace on either side of the equals sign.) + * To keep its syntactic meaning intact, the expression must be enclosed in parentheses or + * square brackets. A substitution is visible after its definition, and is filled in using + * simple textual substitution. Substitution definitions can contain other substitutions, as + * long as those substitutions have been defined first. Substitutions are generally used to + * make the regular expressions (which can get quite complex) shorter and easier to read. + * They typically define either character categories or commonly-used subexpressions.

+ * + *

There is one special substitution.  If the description defines a substitution + * called "<ignore>", the expression must be a [] expression, and the + * expression defines a set of characters (the "ignore characters") that + * will be transparent to the BreakIterator.  A sequence of characters will break the + * same way it would if any ignore characters it contains are taken out.  Break + * positions never occur before ignore characters.

+ * + *

A regular expression uses a subset of the normal Unix regular-expression syntax, and + * defines a sequence of characters to be kept together. With one significant exception, the + * iterator uses a longest-possible-match algorithm when matching text to regular + * expressions. The iterator also treats descriptions containing multiple regular expressions + * as if they were ORed together (i.e., as if they were separated by |).

+ * + *

The special characters recognized by the regular-expression parser are as follows:

+ * + *
+ * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + *
*Specifies that the expression preceding the asterisk may occur any number + * of times (including not at all).
{}Encloses a sequence of characters that is optional.
()Encloses a sequence of characters.  If followed by *, the sequence + * repeats.  Otherwise, the parentheses are just a grouping device and a way to delimit + * the ends of expressions containing |.
|Separates two alternative sequences of characters.  Either one + * sequence or the other, but not both, matches this expression.  The | character can + * only occur inside ().
.Matches any character.
*?Specifies a non-greedy asterisk.  *? works the same way as *, except + * when there is overlap between the last group of characters in the expression preceding the + * * and the first group of characters following the *.  When there is this kind of + * overlap, * will match the longest sequence of characters that match the expression before + * the *, and *? will match the shortest sequence of characters matching the expression + * before the *?.  For example, if you have "xxyxyyyxyxyxxyxyxyy" in the text, + * "x[xy]*x" will match through to the last x (i.e., "xxyxyyyxyxyxxyxyxyy", + * but "x[xy]*?x" will only match the first two xes ("xxyxyyyxyxyxxyxyxyy").
[]Specifies a group of alternative characters.  A [] expression will + * match any single character that is specified in the [] expression.  For more on the + * syntax of [] expressions, see below.
/Specifies where the break position should go if text matches this + * expression.  (e.g., "[a-z]*/[:Zs:]*[1-0]" will match if the iterator sees a + * run + * of letters, followed by a run of whitespace, followed by a digit, but the break position + * will actually go before the whitespace).  Expressions that don't contain / put the + * break position at the end of the matching text.
\Escape character.  The \ itself is ignored, but causes the next + * character to be treated as literal character.  This has no effect for many + * characters, but for the characters listed above, this deprives them of their special + * meaning.  (There are no special escape sequences for Unicode characters, or tabs and + * newlines; these are all handled by a higher-level protocol.  In a Java string, + * "\n" will be converted to a literal newline character by the time the + * regular-expression parser sees it.  Of course, this means that \ sequences that are + * visible to the regexp parser must be written as \\ when inside a Java string.)  All + * characters in the ASCII range except for letters, digits, and control characters are + * reserved characters to the parser and must be preceded by \ even if they currently don't + * mean anything.
!If ! appears at the beginning of a regular expression, it tells the regexp + * parser that this expression specifies the backwards-iteration behavior of the iterator, + * and not its normal iteration behavior.  This is generally only used in situations + * where the automatically-generated backwards-iteration behavior doesn't produce + * satisfactory results and must be supplemented with extra client-specified rules.
(all others)All other characters are treated as literal characters, which must match + * the corresponding character(s) in the text exactly.
+ *
+ * + *

Within a [] expression, a number of other special characters can be used to specify + * groups of characters:

+ * + *
+ * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + *
-Specifies a range of matching characters.  For example + * "[a-p]" matches all lowercase Latin letters from a to p (inclusive).  The - + * sign specifies ranges of continuous Unicode numeric values, not ranges of characters in a + * language's alphabetical order: "[a-z]" doesn't include capital letters, nor does + * it include accented letters such as a-umlaut.
::A pair of colons containing a one- or two-letter code matches all + * characters in the corresponding Unicode category.  The two-letter codes are the same + * as the two-letter codes in the Unicode database (for example, "[:Sc::Sm:]" + * matches all currency symbols and all math symbols).  Specifying a one-letter code is + * the same as specifying all two-letter codes that begin with that letter (for example, + * "[:L:]" matches all letters, and is equivalent to + * "[:Lu::Ll::Lo::Lm::Lt:]").  Anything other than a valid two-letter Unicode + * category code or a single letter that begins a Unicode category code is illegal within + * colons.
[][] expressions can nest.  This has no effect, except when used in + * conjunction with the ^ token.
^Excludes the character (or the characters in the [] expression) following + * it from the group of characters.  For example, "[a-z^p]" matches all Latin + * lowercase letters except p.  "[:L:^[\u4e00-\u9fff]]" matches all letters + * except the Han ideographs.
(all others)All other characters are treated as literal characters.  (For + * example, "[aeiou]" specifies just the letters a, e, i, o, and u.)
+ *
+ * + *

For a more complete explanation, see http://www.ibm.com/java/education/boundaries/boundaries.html. + *   For examples, see the resource data (which is annotated).

+ * + * @author Richard Gillam + */ +class IcuIteratorWrapper extends BreakIterator { + + /* The wrapped ICU implementation. Non-final for #clone() */ + private android.icu.text.BreakIterator wrapped; + + /** + * Constructs an IcuIteratorWrapper that delegates to the given + * ICU break iterator. + */ + IcuIteratorWrapper(android.icu.text.BreakIterator iterator) { + wrapped = iterator; + } + + /** + * Clones this iterator. + * + * @return A newly-constructed IcuIteratorWrapper with the same + * behavior as this one. + */ + public Object clone() { + IcuIteratorWrapper result = (IcuIteratorWrapper) super.clone(); + result.wrapped = (android.icu.text.BreakIterator) wrapped.clone(); + return result; + } + + /** + * Returns true if that is also an IcuIteratorWrapper and the two wrapped + * ICU iterators compare equal. + */ + public boolean equals(Object that) { + if (!(that instanceof IcuIteratorWrapper)) { + return false; + } + return wrapped.equals(((IcuIteratorWrapper) that).wrapped); + } + + //======================================================================= + // BreakIterator overrides + //======================================================================= + + /** + * Returns the wrapped iterator's string representation + */ + public String toString() { + return wrapped.toString(); + } + + /** + * Compute a hashcode for this BreakIterator + * + * @return A hash code + */ + public int hashCode() { + return wrapped.hashCode(); + } + + /** + * Sets the current iteration position to the beginning of the text. + * (i.e., the CharacterIterator's starting offset). + * + * @return The offset of the beginning of the text. + */ + public int first() { + return wrapped.first(); + } + + /** + * Sets the current iteration position to the end of the text. + * (i.e., the CharacterIterator's ending offset). + * + * @return The text's past-the-end offset. 
+ */ + public int last() { + return wrapped.last(); + } + + /** + * Advances the iterator either forward or backward the specified number of steps.
+ * Negative values move backward, and positive values move forward. This is + * equivalent to repeatedly calling next() or previous(). + * + * @param n The number of steps to move. The sign indicates the direction + * (negative is backwards, and positive is forwards). + * @return The character offset of the boundary position n boundaries away from + * the current one. + */ + public int next(int n) { + return wrapped.next(n); + } + + /** + * Advances the iterator to the next boundary position. + * + * @return The position of the first boundary after this one. + */ + public int next() { + return wrapped.next(); + } + + /** + * Advances the iterator backwards, to the last boundary preceding this one. + * + * @return The position of the last boundary position preceding this one. + */ + public int previous() { + return wrapped.previous(); + } + + /** + * Throw IllegalArgumentException unless begin <= offset <= end. + */ + protected static final void checkOffset(int offset, CharacterIterator text) { + if (offset < text.getBeginIndex() || offset > text.getEndIndex()) { + throw new IllegalArgumentException("offset out of bounds"); + } + } + + /** + * Sets the iterator to refer to the first boundary position following + * the specified position. + * + * @return The position of the first break after the specified position. + * @param offset The position from which to begin searching for a break position. + */ + public int following(int offset) { + CharacterIterator text = getText(); + checkOffset(offset, text); + return wrapped.following(offset); + } + + /** + * Sets the iterator to refer to the last boundary position before the + * specified position. + * + * @return The position of the last boundary before the starting position. + * @param offset The position to begin searching for a break from. 
+ */ + public int preceding(int offset) { + // Validate the offset against the bounds of the current text, + // then delegate the search to the wrapped ICU iterator rather + // than stepping backwards with previous() ourselves. + CharacterIterator text = getText(); + checkOffset(offset, text); + return wrapped.preceding(offset); + } + + /** + * Returns true if the specified position is a boundary position. As a side + * effect, leaves the iterator pointing to the first boundary position at + * or after "offset". + * + * @param offset the offset to check. + * @return True if "offset" is a boundary position. + */ + public boolean isBoundary(int offset) { + CharacterIterator text = getText(); + checkOffset(offset, text); + return wrapped.isBoundary(offset); + } + + /** + * Returns the current iteration position. + * + * @return The current iteration position. + */ + public int current() { + return wrapped.current(); + } + + /** + * Return a CharacterIterator over the text being analyzed. This version + * of this method returns the actual CharacterIterator we're using internally. + * Changing the state of this iterator can have undefined consequences. If + * you need to change it, clone it first. + * + * @return An iterator over the text being analyzed. + */ + public CharacterIterator getText() { + return wrapped.getText(); + } + + public void setText(String newText) { + wrapped.setText(newText); + } + + /** + * Set the iterator to analyze a new piece of text. This function resets + * the current iteration position to the beginning of the text. + * + * @param newText An iterator over the text to analyze. 
+ */ + public void setText(CharacterIterator newText) { + newText.current(); /* NOTE(review): result is discarded; presumably a fail-fast null check on newText - confirm. */ + wrapped.setText(newText); + } +} diff --git a/jre_emul/icu_data.mk b/jre_emul/icu_data.mk index 11ca277a92..67bc209ea7 100644 --- a/jre_emul/icu_data.mk +++ b/jre_emul/icu_data.mk @@ -1080,7 +1080,48 @@ ICU_ZONE_RES = \ $(ICU_RES_PACKAGE)/zone/zh_Hant_TW.res \ $(ICU_RES_PACKAGE)/zone/zu.res \ -ICU_DATA_RES = $(ICU_COMMON_RES) $(ICU_LOCALE_RES) $(ICU_ZONE_RES) +ICU_BRKITR_RES = \ + $(ICU_RES_PACKAGE)/brkitr/burmesedict.dict \ + $(ICU_RES_PACKAGE)/brkitr/char.brk \ + $(ICU_RES_PACKAGE)/brkitr/cjdict.dict \ + $(ICU_RES_PACKAGE)/brkitr/de.res \ + $(ICU_RES_PACKAGE)/brkitr/el.res \ + $(ICU_RES_PACKAGE)/brkitr/en.res \ + $(ICU_RES_PACKAGE)/brkitr/en_US.res \ + $(ICU_RES_PACKAGE)/brkitr/en_US_POSIX.res \ + $(ICU_RES_PACKAGE)/brkitr/es.res \ + $(ICU_RES_PACKAGE)/brkitr/fi.res \ + $(ICU_RES_PACKAGE)/brkitr/fr.res \ + $(ICU_RES_PACKAGE)/brkitr/it.res \ + $(ICU_RES_PACKAGE)/brkitr/ja.res \ + $(ICU_RES_PACKAGE)/brkitr/khmerdict.dict \ + $(ICU_RES_PACKAGE)/brkitr/laodict.dict \ + $(ICU_RES_PACKAGE)/brkitr/line.brk \ + $(ICU_RES_PACKAGE)/brkitr/line_fi.brk \ + $(ICU_RES_PACKAGE)/brkitr/line_loose.brk \ + $(ICU_RES_PACKAGE)/brkitr/line_loose_cj.brk \ + $(ICU_RES_PACKAGE)/brkitr/line_loose_fi.brk \ + $(ICU_RES_PACKAGE)/brkitr/line_normal.brk \ + $(ICU_RES_PACKAGE)/brkitr/line_normal_cj.brk \ + $(ICU_RES_PACKAGE)/brkitr/line_normal_fi.brk \ + $(ICU_RES_PACKAGE)/brkitr/pt.res \ + $(ICU_RES_PACKAGE)/brkitr/res_index.res \ + $(ICU_RES_PACKAGE)/brkitr/root.res \ + $(ICU_RES_PACKAGE)/brkitr/ru.res \ + $(ICU_RES_PACKAGE)/brkitr/sent.brk \ + $(ICU_RES_PACKAGE)/brkitr/sent_el.brk \ + $(ICU_RES_PACKAGE)/brkitr/thaidict.dict \ + $(ICU_RES_PACKAGE)/brkitr/title.brk \ + $(ICU_RES_PACKAGE)/brkitr/word.brk \ + $(ICU_RES_PACKAGE)/brkitr/word_POSIX.brk \ + $(ICU_RES_PACKAGE)/brkitr/zh.res \ + $(ICU_RES_PACKAGE)/brkitr/zh_Hant.res \ + +ICU_DATA_RES = \ + $(ICU_COMMON_RES) \ + $(ICU_LOCALE_RES) \ + $(ICU_ZONE_RES) \ +
$(ICU_BRKITR_RES) \ ICU_TZ_DATA_ZIP = \ diff --git a/jre_emul/jre_sources.mk b/jre_emul/jre_sources.mk index aa8ebf0c78..9ab482e409 100644 --- a/jre_emul/jre_sources.mk +++ b/jre_emul/jre_sources.mk @@ -1871,10 +1871,12 @@ JAVA_PUBLIC_SOURCES_ICU = \ android/icu/util/ULocale.java \ android/icu/util/ValueIterator.java \ android/icu/util/VersionInfo.java \ + java/text/BreakIterator.java \ JAVA_PRIVATE_SOURCES_ICU = \ android/icu/impl/Assert.java \ android/icu/impl/BMPSet.java \ + android/icu/impl/CSCharacterIterator.java \ android/icu/impl/CacheBase.java \ android/icu/impl/CacheValue.java \ android/icu/impl/CalendarCache.java \ @@ -1921,6 +1923,7 @@ JAVA_PRIVATE_SOURCES_ICU = \ android/icu/impl/ResourceBundleWrapper.java \ android/icu/impl/RuleCharacterIterator.java \ android/icu/impl/SimpleCache.java \ + android/icu/impl/SimpleFilteredSentenceBreakIterator.java \ android/icu/impl/SimpleFormatterImpl.java \ android/icu/impl/SoftCache.java \ android/icu/impl/SortedSetRelation.java \ @@ -1967,12 +1970,14 @@ JAVA_PRIVATE_SOURCES_ICU = \ android/icu/impl/locale/UnicodeLocaleExtension.java \ android/icu/lang/CharSequences.java \ android/icu/lang/UCharacterNameIterator.java \ + android/icu/text/BreakIteratorFactory.java \ android/icu/text/CurrencyDisplayNames.java \ android/icu/text/CurrencyMetaInfo.java \ android/icu/text/DictionaryBreakEngine.java \ android/icu/text/DictionaryMatcher.java \ android/icu/text/DigitList.java \ android/icu/text/Edits.java \ + android/icu/text/FilteredBreakIteratorBuilder.java \ android/icu/text/FilteredNormalizer2.java \ android/icu/text/LanguageBreakEngine.java \ android/icu/text/NFRule.java \ @@ -2001,6 +2006,8 @@ JAVA_PRIVATE_SOURCES_ICU = \ android/icu/util/AnnualTimeZoneRule.java \ android/icu/util/BasicTimeZone.java \ android/icu/util/BytesTrie.java \ + android/icu/util/CharsTrie.java \ + android/icu/util/CharsTrieBuilder.java \ android/icu/util/DateTimeRule.java \ android/icu/util/ICUCloneNotSupportedException.java \ 
android/icu/util/ICUException.java \ @@ -2009,6 +2016,7 @@ JAVA_PRIVATE_SOURCES_ICU = \ android/icu/util/RuleBasedTimeZone.java \ android/icu/util/STZInfo.java \ android/icu/util/SimpleTimeZone.java \ + android/icu/util/StringTrieBuilder.java \ android/icu/util/TimeArrayTimeZoneRule.java \ android/icu/util/TimeZoneRule.java \ android/icu/util/TimeZoneTransition.java \ @@ -2016,6 +2024,7 @@ JAVA_PRIVATE_SOURCES_ICU = \ android/icu/util/UResourceBundleIterator.java \ android/icu/util/UResourceTypeMismatchException.java \ android/icu/util/VTimeZone.java \ + java/text/IcuIteratorWrapper.java \ NATIVE_JRE_ICU_EMBEDDED_DATA = ICUData.m diff --git a/jre_emul/test_sources.mk b/jre_emul/test_sources.mk index b9aef8e327..54589c8097 100644 --- a/jre_emul/test_sources.mk +++ b/jre_emul/test_sources.mk @@ -465,6 +465,7 @@ TEST_SOURCES := \ libcore/java/nio/charset/CharsetDecoderTest.java \ libcore/java/nio/charset/CharsetEncoderTest.java \ libcore/java/text/AttributedCharacterIteratorAttributeTest.java \ + libcore/java/text/BreakIteratorTest.java \ libcore/java/text/CollatorTest.java \ libcore/java/text/DateFormatSymbolsTest.java \ libcore/java/text/DecimalFormatSymbolsTest.java \ @@ -708,6 +709,7 @@ TEST_SOURCES := \ org/apache/harmony/tests/java/nio/channels/spi/AbstractSelectionKeyTest.java \ org/apache/harmony/tests/java/nio/channels/spi/AbstractSelectorTest.java \ org/apache/harmony/tests/java/text/AttributedStringTest.java \ + org/apache/harmony/tests/java/text/BreakIteratorTest.java \ org/apache/harmony/tests/java/text/ChoiceFormatTest.java \ org/apache/harmony/tests/java/text/DateFormatSymbolsTest.java \ org/apache/harmony/tests/java/text/DateFormatTest.java \