This repository has been archived by the owner on Feb 19, 2020. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 82
/
NGramSpanFeaturizer.scala
110 lines (97 loc) · 3.98 KB
/
NGramSpanFeaturizer.scala
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
package epic.features
import epic.framework.Feature
import breeze.linalg.Counter
import breeze.util.Index
import epic.trees.TreeInstance
import scala.collection.mutable.ArrayBuffer
import breeze.util.SerializableLogging
/**
 * Surface featurizer that describes a span of a sentence by the n-grams it contains.
 *
 * For a span `[begin, end)` the anchoring emits, in this order:
 *  - one [[NGramUnigramFeature]] per word in the span,
 *  - one [[NGramBigramFeature]] per pair of adjacent words in the span,
 *  - if `useNot` is set, one [[NotFeature]] per word that comes after a negation word
 *    (see `NGramSpanFeaturizer.NotWords`) and before the next span-ending punctuation
 *    (see `NGramSpanFeaturizer.NotEndingPunc`); the negation state starts cleared at `begin`,
 *  - one [[NGramFeature]] per n-gram of each order from 3 up to `maxOrder`.
 *
 * Any n-gram whose count is below `ngramCountThreshold` is emitted with index -1 instead of
 * its own index.
 *
 * Construction counts the higher-order n-grams of `allSents` and prints a one-line summary of
 * the index sizes to standard output.
 *
 * @param wordCounts          unigram counts; also the source of the unigram index
 * @param bigramCounts        bigram counts; also the source of the bigram index
 * @param allSents            sentences from which n-grams of order 3..`maxOrder` are counted
 * @param ngramCountThreshold minimum count an n-gram needs to keep its own index
 * @param maxOrder            largest n-gram order to featurize; below 3 only unigrams and
 *                            bigrams (and negation features) are produced
 * @param useNot              whether to emit [[NotFeature]]s
 * @author dlwh
 */
@SerialVersionUID(1L)
class NGramSpanFeaturizer(wordCounts: Counter[String,Double],
                          bigramCounts: Counter[(String,String),Double],
                          allSents: Seq[Seq[String]],
                          ngramCountThreshold: Int,
                          maxOrder: Int,
                          useNot: Boolean) extends SurfaceFeaturizer[String] with Serializable {

  // Position k of the two higher-order collections below corresponds to n-gram order k + 3.
  private val higherOrderCounts = (3 to maxOrder).map(order => NGramSpanFeaturizer.countNgrams(allSents, order))

  private val wordIndex = Index(wordCounts.keysIterator)
  private val bigramIndex = Index(bigramCounts.keysIterator)
  private val higherOrderIndices = higherOrderCounts.map(counts => Index(counts.keysIterator))

  println(s"${wordIndex.size} unigrams, ${bigramIndex.size} bigrams, ${higherOrderIndices.map(_.size)} higher-order n-grams")

  /** Picks the index to emit: -1 when `count` is under the threshold, otherwise `index`. */
  private def thresholdedIndex(count: Double, index: => Int): Int =
    if (count < ngramCountThreshold) -1 else index

  /** Binds this featurizer to one sentence so that spans of it can be featurized. */
  def anchor(words: IndexedSeq[String]): SurfaceFeatureAnchoring[String] = {
    new SurfaceFeatureAnchoring[String] {

      /** Returns the unigram, bigram, negation and higher-order features of `[begin, end)`. */
      def featuresForSpan(begin: Int, end: Int): Array[Feature] = {
        val unigramFeats = (begin until end).map { pos =>
          val word = words(pos)
          NGramUnigramFeature(thresholdedIndex(wordCounts(word), wordIndex(word)))
        }

        val bigramFeats = (begin until end - 1).map { pos =>
          val pair = (words(pos), words(pos + 1))
          NGramBigramFeature(thresholdedIndex(bigramCounts(pair), bigramIndex(pair)))
        }

        val notFeats =
          if (!useNot) {
            Array[Feature]()
          } else {
            val negated = new ArrayBuffer[Feature]
            // The folded Boolean is "are we currently inside a negation span?".
            (begin until end).foldLeft(false) { (inNotSpan, pos) =>
              val word = words(pos)
              if (NGramSpanFeaturizer.NotWords.contains(word)) {
                true
              } else if (NGramSpanFeaturizer.NotEndingPunc.contains(word)) {
                false
              } else {
                if (inNotSpan) {
                  negated += NotFeature(thresholdedIndex(wordCounts(word), wordIndex(word)))
                }
                inNotSpan
              }
            }
            negated.toArray
          }

        // `3 to maxOrder` is empty when maxOrder < 3, so no separate branch is needed for it.
        val ngramFeats = for {
          order <- 3 to maxOrder
          start <- begin until end - order + 1
        } yield {
          val gram = words.slice(start, start + order)
          val slot = order - 3
          NGramFeature(order, thresholdedIndex(higherOrderCounts(slot)(gram), higherOrderIndices(slot)(gram)))
        }

        (unigramFeats ++ bigramFeats ++ notFeats ++ ngramFeats).toArray
      }
    }
  }
}
/** Negation vocabulary and n-gram counting helpers used by the `NGramSpanFeaturizer` class. */
object NGramSpanFeaturizer {

  /** Tokens that open a negation span. */
  val NotWords: Set[String] = Set("not", "n't")

  /** Punctuation tokens that close an open negation span. */
  val NotEndingPunc: Set[String] = Set(".", ",", ";", "--", ":")

  /**
   * Counts every pair of adjacent words over the sentences of `data`.
   *
   * @param data tree instances whose word sequences are scanned
   * @return a counter from `(word, nextWord)` to the number of times the pair occurs
   */
  def countBigrams[L, W](data: TraversableOnce[TreeInstance[L, W]]): Counter[(W, W), Double] = {
    val bigrams = Counter[(W,W), Double]()
    for (ti <- data) {
      val TreeInstance(_, _, words) = ti
      for (i <- 0 until words.size - 1) {
        bigrams((words(i), words(i+1))) += 1.0
      }
    }
    bigrams
  }

  /**
   * Counts every contiguous run of `n` words over `allSents`.
   *
   * @param allSents sentences to scan
   * @param n        n-gram order
   * @return a counter from each n-gram (as a sequence of `n` words) to its number of occurrences
   */
  def countNgrams(allSents: Seq[Seq[String]], n: Int): Counter[Seq[String], Double] = {
    val ngrams = Counter[Seq[String], Double]()
    for (sent <- allSents) {
      // A sentence of length s has s - n + 1 n-grams; the last one starts at s - n.
      // (The bound used to be `sent.size - n`, which skipped the final n-gram of every
      // sentence and disagreed with countBigrams and with the span featurizer's own loop.)
      for (i <- 0 until sent.size - n + 1) {
        ngrams(sent.slice(i, i+n)) += 1.0
      }
    }
    ngrams
  }
}
/**
 * Feature for a word that appears inside a negation span.
 *
 * @param idx the word's position in the featurizer's unigram index, or -1 when the word's
 *            count is below the featurizer's threshold
 */
case class NotFeature(idx: Int) extends Feature
/**
 * Feature for a single word of a span.
 *
 * @param idx the word's position in the featurizer's unigram index, or -1 when the word's
 *            count is below the featurizer's threshold
 */
case class NGramUnigramFeature(idx: Int) extends Feature
/**
 * Feature for a pair of adjacent words of a span.
 *
 * @param idx the pair's position in the featurizer's bigram index, or -1 when the pair's
 *            count is below the featurizer's threshold
 */
case class NGramBigramFeature(idx: Int) extends Feature
/**
 * Feature for an n-gram of order 3 or higher inside a span.
 *
 * @param n   the n-gram order
 * @param idx the n-gram's position in the featurizer's index for order `n`, or -1 when the
 *            n-gram's count is below the featurizer's threshold
 */
case class NGramFeature(n: Int, idx: Int) extends Feature