This repository has been archived by the owner on Feb 19, 2020. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 82
/
CONLLSequenceReader.scala
75 lines (68 loc) · 2.47 KB
/
CONLLSequenceReader.scala
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
package epic.corpora
import epic.framework.{Observation, Example}
import io.Source
import collection.mutable.ArrayBuffer
import java.io.{File, FileInputStream, InputStream}
/**
 * Reads tag sequences in the conll shared task format. See http://mlcomp.org/faq/domains "Sequence Tagging" for the spec.
 *
 * As parsed here, every non-blank line describes one element of a sequence as columns separated by
 * `splitToken`, and blank (whitespace-only) lines separate consecutive sequences.
 *
 * @author dlwh
 */
object CONLLSequenceReader {

  /**
   * Lazily reads labeled sequences from `f`.
   *
   * The last column of every line is taken as the label of that element; all preceding columns are
   * kept as its features. Runs of blank lines count as a single separator, so no empty sequence is
   * ever produced.
   *
   * The stream is consumed as the returned iterator is advanced and is not closed by this method;
   * the caller should close it once the iterator has been exhausted.
   *
   * @param f stream containing the data, decoded with the implicit `scala.io.Codec` in scope
   * @param name prefix of the generated ids; the i-th sequence (counting from 0) gets the id `name-i`
   * @param splitToken column separator. It is handed to `String.split` and is therefore interpreted
   *                   as a regular expression.
   * @return one Example per sequence, built from (labels, feature rows, id)
   */
  def readTrain(f: InputStream, name: String = "sequence", splitToken: String = " "): Iterator[Example[IndexedSeq[String], IndexedSeq[IndexedSeq[String]]]] = {
    readBlocks(f, splitToken).zipWithIndex.map { case (rows, index) =>
      val inputs: IndexedSeq[IndexedSeq[String]] = rows.map(_.dropRight(1))
      val outputs: IndexedSeq[String] = rows.map(_.last)
      // The explicit type pins the type parameters of Example to the declared element type.
      val example: Example[IndexedSeq[String], IndexedSeq[IndexedSeq[String]]] =
        Example(outputs, inputs, name + "-" + index)
      example
    }
  }

  /**
   * This format reads a CONLL file with the last column (i.e. the label) missing. If you have the label,
   * use readTrain, even if you plan on testing with it. Silly, I know.
   *
   * Every column of a line is kept as a feature. Runs of blank lines count as a single separator,
   * so no empty sequence is ever produced. The stream is consumed lazily and is not closed by this
   * method.
   *
   * @param f stream containing the data, decoded with the implicit `scala.io.Codec` in scope
   * @param name prefix of the generated ids; the i-th sequence (counting from 0) gets the id `name-i`
   * @param splitToken column separator. It is handed to `String.split` and is therefore interpreted
   *                   as a regular expression.
   * @return one Observation per sequence, built from (feature rows, id)
   */
  def readTest(f: InputStream, name: String = "test-sequence", splitToken: String = " "): Iterator[Observation[IndexedSeq[IndexedSeq[String]]]] = {
    readBlocks(f, splitToken).zipWithIndex.map { case (rows, index) =>
      // The explicit type pins the type parameter of Observation to the declared element type.
      val observation: Observation[IndexedSeq[IndexedSeq[String]]] =
        Observation(rows, name + "-" + index)
      observation
    }
  }

  /**
   * Prints the number of sequences found in the labeled file `args(0)` and then in the unlabeled
   * file `args(1)`.
   */
  def main(args: Array[String]): Unit = {
    if (args.length < 2) {
      System.err.println("usage: CONLLSequenceReader <train-file> <test-file>")
      sys.exit(1)
    }
    println(withFile(args(0))(in => readTrain(in).length))
    println(withFile(args(1))(in => readTest(in).length))
  }

  /** True for the whitespace-only lines that separate two sequences. */
  private def isSeparator(line: String): Boolean = line.trim().isEmpty

  /**
   * Groups the lines of `f` into blank-line separated blocks and splits every line of a block into
   * its columns. Shared by readTrain and readTest so both treat separators identically.
   */
  private def readBlocks(f: InputStream, splitToken: String): Iterator[IndexedSeq[IndexedSeq[String]]] = {
    // A buffered iterator lets us look at the next line without consuming it.
    val lines = Source.fromInputStream(f).getLines().buffered
    new Iterator[IndexedSeq[IndexedSeq[String]]] {
      def hasNext: Boolean = {
        // Drop separators up front: leading, trailing and repeated blank lines must not
        // surface as empty sequences.
        while (lines.hasNext && isSeparator(lines.head)) lines.next()
        lines.hasNext
      }

      def next(): IndexedSeq[IndexedSeq[String]] = {
        if (!hasNext) throw new NoSuchElementException("no more sequences")
        val rows = new ArrayBuffer[IndexedSeq[String]]()
        // The separator that ends this block is left in place; the next hasNext call skips it.
        while (lines.hasNext && !isSeparator(lines.head)) {
          rows += lines.next().split(splitToken).toIndexedSeq
        }
        rows.toIndexedSeq
      }
    }
  }

  /** Opens `path`, applies `body` to the stream and closes the stream even if `body` throws. */
  private def withFile[A](path: String)(body: InputStream => A): A = {
    val in = new FileInputStream(new File(path))
    try body(in) finally in.close()
  }
}