This repository has been archived by the owner on Feb 19, 2020. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 82
/
StreamSentenceSegmenter.scala
70 lines (60 loc) · 1.76 KB
/
StreamSentenceSegmenter.scala
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
package epic.preprocess
import java.io.{Reader, InputStream}
import breeze.util.Iterators
import java.nio.channels.Channels
/**
 * Splits the text of an `InputStream` into sentences incrementally: the stream
 * is decoded as UTF-8 in fixed-size chunks, each chunk is handed to
 * `baseSegmenter`, and the last (possibly incomplete) sentence of a chunk is
 * held back and prepended to the next chunk.
 *
 * @param baseSegmenter     segmenter applied to each piece of buffered text
 * @param segmentOnNewLines if true, buffered text is first split on "\n" and
 *                          every line is segmented on its own; a chunk that ends
 *                          with "\n" is then considered complete and nothing is
 *                          held back from it
 * @author dlwh
 **/
class StreamSentenceSegmenter(val baseSegmenter: SentenceSegmenter, segmentOnNewLines: Boolean = false) {

  /**
   * Lazily segments `stream` into sentences.
   *
   * The stream is neither fully buffered nor closed by this method.
   *
   * @param stream UTF-8 encoded input
   * @return an iterator over the sentences; the text still held back when the
   *         stream ends is emitted as the final element if it is non-empty
   */
  def sentences(stream: InputStream): Iterator[String] = {
    // addendum holds the trailing text of the previous chunk that has not been emitted yet.
    var addendum = ""

    // Keeps the last segment back for the next chunk and returns the rest.
    // lastOption (rather than last) keeps an empty segmentation from throwing.
    def holdBackLast(segmented: IndexedSeq[String]): IndexedSeq[String] = {
      addendum = segmented.lastOption.getOrElse("")
      segmented.dropRight(1)
    }

    val pieces = chunkInput(stream).flatMap { (s: String) =>
      if (segmentOnNewLines) {
        val segmented = (addendum + s).split("\n").flatMap(baseSegmenter(_)).toIndexedSeq
        if (s.endsWith("\n")) {
          // Everything buffered so far has been emitted. The held-back text must be
          // cleared, otherwise it would be prepended to the next chunk (and emitted
          // at end of stream) a second time.
          addendum = ""
          segmented
        } else {
          holdBackLast(segmented)
        }
      } else {
        // NOTE(review): the text is run through baseSegmenter twice here; this looks
        // redundant but is kept as-is to preserve the output - confirm and simplify.
        holdBackLast(baseSegmenter(addendum + s).flatMap(baseSegmenter(_)).toIndexedSeq)
      }
    }

    // Iterator.++ takes its argument by name, so addendum is read only after
    // every chunk has been consumed.
    pieces ++ Iterator(addendum).filter(_.nonEmpty)
  }

  /**
   * Decodes `stream` as UTF-8 and returns its content as an iterator of chunks
   * of at most 1024 * 1024 characters each.
   */
  private def chunkInput(stream: InputStream): Iterator[String] = {
    val cin = Channels.newChannel(stream)
    val reader = Channels.newReader(cin, "UTF-8")
    val buffer = new Array[Char](1024 * 1024)
    // Once end of stream has been seen, the reader is not touched again.
    var done = false
    Iterators.fromProducer {
      if (done) {
        None
      } else {
        val numRead = reader.read(buffer)
        if (numRead == -1) {
          done = true
          None
        } else {
          // Copy only the filled prefix of the (reused) buffer.
          Some(new String(buffer, 0, numRead))
        }
      }
    }
  }
}
object StreamSentenceSegmenter {

  /**
   * Command-line entry point: loads a sentence segmenter model from the relative
   * path "en-sent-segmenter.model.ser.gz", segments standard input, and prints
   * each sentence on its own line prefixed with ">>> ".
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val seg = MLSentenceSegmenter.loadModel(new java.io.File("en-sent-segmenter.model.ser.gz"))
    val ss = new StreamSentenceSegmenter(seg)
    for (s <- ss.sentences(System.in)) {
      println(">>> " + s)
    }
  }
}