-
Notifications
You must be signed in to change notification settings - Fork 8
/
TextDecoder.java
274 lines (248 loc) · 9.41 KB
/
TextDecoder.java
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
/*
* Copyright (c) 2014, Francis Galiegue (fgaliegue@gmail.com)
*
* This software is dual-licensed under:
*
* - the Lesser General Public License (LGPL) version 3.0 or, at your option, any
* later version;
* - the Apache Software License (ASL) version 2.0.
*
* The text of both licenses is available under the src/resources/ directory of
* this project (under the names LGPL-3.0.txt and ASL-2.0.txt respectively).
*
* Direct link to the sources:
*
* - LGPL 3.0: https://www.gnu.org/licenses/lgpl-3.0.txt
* - ASL 2.0: http://www.apache.org/licenses/LICENSE-2.0.txt
*/
package com.github.fge.largetext.load;
import com.github.fge.largetext.LargeText;
import com.github.fge.largetext.LargeTextException;
import com.github.fge.largetext.LargeTextFactory;
import com.github.fge.largetext.range.IntRange;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.RangeMap;
import com.google.common.collect.TreeRangeMap;
import com.google.common.util.concurrent.ThreadFactoryBuilder;
import javax.annotation.concurrent.GuardedBy;
import javax.annotation.concurrent.ThreadSafe;
import java.io.Closeable;
import java.io.IOException;
import java.nio.CharBuffer;
import java.nio.MappedByteBuffer;
import java.nio.channels.FileChannel;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CoderResult;
import java.nio.charset.CodingErrorAction;
import java.util.Collection;
import java.util.List;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.ThreadFactory;
/**
 * Text file decoder
 *
 * <p>This is the first core class of this package (the second is {@link
 * TextCache}). Its role is to decode a text file chunk by chunk. The size of
 * chunks to use is determined when you build your {@link LargeTextFactory}.</p>
 *
 * <p>{@link LargeText} will call upon this class to obtain a {@link TextRange}
 * (or a list of them) containing the character at a given index (or the range
 * of characters), by using the methods {@link #getRange(int)} and {@link
 * #getRanges(IntRange)} respectively.</p>
 *
 * <p>These methods are blocking, but they <em>do not</em> throw {@link
 * InterruptedException}: if the calling thread is interrupted while waiting,
 * they restore the thread interruption status and throw a {@link
 * LargeTextException}. Other failures are reported with unchecked exceptions
 * as well (for instance, an {@link IndexOutOfBoundsException} if the requested
 * offset exceeds the number of characters in the file).</p>
 *
 * <p>Implementation note: this class uses a <em>single threaded</em> {@link
 * ExecutorService} to perform the decoding operation. Decoding is not done in
 * parallel, and cannot be, since it is not guaranteed that a byte mapping can
 * be decoded exactly to a character sequence (for instance, using UTF-8, the
 * end of the mapping may contain one byte only of a three-byte sequence).</p>
 *
 * @see DecodingStatus
 */
@ThreadSafe
public final class TextDecoder
    implements Closeable
{
    private static final ThreadFactory THREAD_FACTORY
        = new ThreadFactoryBuilder().setNameFormat("text-decoder").build();

    private final ExecutorService executor
        = Executors.newSingleThreadExecutor(THREAD_FACTORY);

    private final DecodingStatus status = new DecodingStatus();

    @GuardedBy("ranges")
    private final RangeMap<Integer, TextRange> ranges = TreeRangeMap.create();

    private final FileChannel channel;
    private final Charset charset;
    private final long fileSize;
    private final long targetMapSize;

    /**
     * Constructor; don't use directly!
     *
     * <p>The decoding task is submitted to the executor as the last step of
     * construction, once all fields it reads have been assigned.</p>
     *
     * @param channel the {@link FileChannel} to the target file
     * @param charset the character encoding to use
     * @param targetMapSize the target byte mapping size
     * @throws IOException error obtaining information on the channel
     */
    public TextDecoder(final FileChannel channel, final Charset charset,
        final long targetMapSize)
        throws IOException
    {
        this.channel = channel;
        fileSize = channel.size();
        this.targetMapSize = targetMapSize;
        this.charset = charset;
        executor.submit(decodingTask());
    }

    /**
     * Return the appropriate text range containing the character at the given
     * offset
     *
     * @param charOffset the offset
     * @return the appropriate {@link TextRange}
     * @throws LargeTextException method has been interrupted, or a decoding
     * error has occurred
     * @throws IndexOutOfBoundsException offset requested is out of range
     */
    public TextRange getRange(final int charOffset)
    {
        /*
         * A negative offset can never be mapped to a range (the lookup below
         * would silently return null), and Integer.MAX_VALUE cannot be a valid
         * index either since the character count itself is an int; the latter
         * would also overflow the "charOffset + 1" computation below.
         */
        if (charOffset < 0 || charOffset == Integer.MAX_VALUE)
            throw new IndexOutOfBoundsException("invalid character offset: "
                + charOffset);

        try {
            needChars(charOffset + 1);
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
            throw new LargeTextException("Interrupted", e);
        }

        synchronized (ranges) {
            return ranges.get(charOffset);
        }
    }

    /**
     * Return an ordered iterable of text ranges covering the requested range
     *
     * @param range the range
     * @return the appropriate list of text ranges
     * @throws LargeTextException method has been interrupted, or a decoding
     * error has occurred
     * @throws IndexOutOfBoundsException range is out of bounds for this decoder
     */
    public List<TextRange> getRanges(final IntRange range)
    {
        try {
            needChars(range.getUpperBound());
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
            throw new LargeTextException("Interrupted", e);
        }

        final Collection<TextRange> ret;

        synchronized (ranges) {
            ret = ranges.subRangeMap(range.asGuavaRange())
                .asMapOfRanges().values();
        }

        // Snapshot so that the caller does not hold a live view of the map
        return ImmutableList.copyOf(ret);
    }

    /**
     * Return the total number of characters in this decoder
     *
     * <p>This method sleeps until the decoding operation finishes (either
     * successfully or with an error).</p>
     *
     * @return the total number of characters
     * @throws LargeTextException method has been interrupted, or a decoding
     * error has occurred
     *
     * @see DecodingStatus#getTotalSize()
     */
    public int getTotalChars()
    {
        return status.getTotalSize();
    }

    /**
     * Stop the decoding executor
     *
     * <p>Only the executor is shut down here (which interrupts a decoding
     * task still in progress); the channel given to the constructor is not
     * closed by this method.</p>
     */
    @Override
    public void close()
        throws IOException
    {
        executor.shutdownNow();
    }

    /**
     * Block, if necessary, until at least the given number of characters is
     * available
     *
     * @param needed the number of characters required
     * @throws InterruptedException the calling thread was interrupted while
     * waiting
     */
    private void needChars(final int needed)
        throws InterruptedException
    {
        final CharWaiter waiter = new CharWaiter(needed);
        if (status.addWaiter(waiter))
            waiter.await();
    }

    // TODO: move to another class?
    /**
     * Build the task which decodes the whole file, one byte mapping at a time
     *
     * <p>Every decoded range is first recorded into {@link #ranges} and only
     * then is the new character count published to {@link #status}: a caller
     * woken up by the new count is therefore guaranteed to find the matching
     * range in the map.</p>
     *
     * @return the decoding task
     */
    private Runnable decodingTask()
    {
        return new Runnable()
        {
            @Override
            public void run()
            {
                long byteOffset = 0L;
                int charOffset = 0;

                try {
                    final CharsetDecoder decoder = charset.newDecoder()
                        .onMalformedInput(CodingErrorAction.REPORT)
                        .onUnmappableCharacter(CodingErrorAction.REPORT);
                    final CharBuffer charMap
                        = CharBuffer.allocate((int) targetMapSize);

                    TextRange textRange;

                    while (byteOffset < fileSize) {
                        if (Thread.currentThread().isInterrupted()) {
                            status.setFailed(new IOException("interrupted!"));
                            break;
                        }

                        try {
                            textRange = nextRange(byteOffset, charOffset,
                                decoder, charMap);
                            // No byte consumed: we would loop forever
                            if (textRange.getByteRange().isEmpty())
                                throw new IOException("unable to read file as "
                                    + "text starting from byte offset "
                                    + byteOffset);
                        } catch (IOException e) {
                            status.setFailed(e);
                            break;
                        }

                        /*
                         * Make the range visible BEFORE announcing the new
                         * character count, see method description
                         */
                        synchronized (ranges) {
                            ranges.put(textRange.getCharRange().asGuavaRange(),
                                textRange);
                        }

                        byteOffset = textRange.getByteRange().getUpperBound();
                        charOffset = textRange.getCharRange().getUpperBound();
                        status.setNrChars(charOffset);
                    }
                } catch (RuntimeException e) {
                    /*
                     * The Future returned by the executor is discarded, so an
                     * unchecked exception would otherwise be lost and never be
                     * reported to the decoding status.
                     */
                    status.setFailed(new IOException("unexpected error while "
                        + "decoding", e));
                }

                status.setFinished(charOffset);
            }
        };
    }

    /**
     * Decode the next chunk of the file
     *
     * @param byteOffset offset, in the file, of the first byte to decode
     * @param charOffset offset of the first character this chunk will produce
     * @param decoder the decoder to use (reset before use)
     * @param charMap the buffer receiving decoded characters (rewound before
     * use)
     * @return a text range describing the bytes consumed and the characters
     * produced
     * @throws IOException failure to map the file, or unmappable character
     */
    private TextRange nextRange(final long byteOffset, final int charOffset,
        final CharsetDecoder decoder, final CharBuffer charMap)
        throws IOException
    {
        final long mapSize = Math.min(targetMapSize, fileSize - byteOffset);
        final MappedByteBuffer byteMap
            = channel.map(FileChannel.MapMode.READ_ONLY, byteOffset, mapSize);

        charMap.rewind();
        decoder.reset();

        final CoderResult result = decoder.decode(byteMap, charMap, true);

        /*
         * Unmappable character... It _can_ happen even with a decoder, see
         * http://stackoverflow.com/a/22902806/1093528
         */
        if (result.isUnmappable())
            result.throwException();

        /*
         * The mapping position reflects what was actually consumed. It equals
         * the mapping size when all input was decoded, and is smaller on an
         * incomplete byte sequence at the end of the mapping or if the
         * character buffer filled up before the input was exhausted; in both
         * latter cases the remaining bytes are left for the next chunk.
         */
        final long nrBytes = (long) byteMap.position();

        return new TextRange(byteOffset, nrBytes, charOffset,
            charMap.position());
    }
}