// SplitSpanFeaturizer.scala
// Note: the repository this file was retrieved from was archived by its owner on Feb 19, 2020 and is now read-only.
package epic.features
import epic.framework.Feature
import breeze.util.Index
import epic.trees.BinaryTree
import epic.trees.TreeInstance
import epic.features.WordFeaturizer.Modifier
import epic.features.SplitSpanFeaturizer.{ProductSplitSpanFeaturizer, SumSplitSpanFeaturizer}
import epic.util.{AlwaysSeenSet, ThreadLocalBloomFilter, Arrays}
import epic.features.SurfaceFeaturizer.MarkerPos
import scala.collection.mutable.ArrayBuffer
/**
 * A [[SurfaceFeaturizer]] whose anchoring can also produce features for a
 * (begin, split, end) triple, via [[SplitSpanFeatureAnchoring]].
 *
 * Featurizers are combined with `+` (yielding a `SumSplitSpanFeaturizer`) and `*`
 * (yielding a `ProductSplitSpanFeaturizer`). Adding `SplitSpanFeaturizer.unit` marks a
 * sum as "unitized"; `*` reads that flag to decide which operand's own features the
 * product should keep next to the cross-product features.
 *
 * @author dlwh
 */
trait SplitSpanFeaturizer[W] extends SurfaceFeaturizer[W] {

  /** Anchors this featurizer to the sentence `w`. */
  def anchor(w: IndexedSeq[W]):SplitSpanFeatureAnchoring[W]

  /**
   * Builds the product of this featurizer and `other`.
   *
   * When an operand is a `SumSplitSpanFeaturizer`, its `unitized` flag becomes the
   * "keep just the other operand" flag of the product: a unitized right operand sets
   * `keepJustA`, a unitized left operand sets `keepJustB`.
   */
  def *(other: SplitSpanFeaturizer[W]) = (this, other) match {
    case (lhs @ SumSplitSpanFeaturizer(_, lhsUnitized), rhs @ SumSplitSpanFeaturizer(_, rhsUnitized)) =>
      ProductSplitSpanFeaturizer(lhs, rhs, keepJustA = rhsUnitized, keepJustB = lhsUnitized)
    case (lhs, rhs @ SumSplitSpanFeaturizer(_, rhsUnitized)) =>
      ProductSplitSpanFeaturizer(lhs, rhs, keepJustA = rhsUnitized, keepJustB = false)
    case (lhs @ SumSplitSpanFeaturizer(_, lhsUnitized), rhs) =>
      ProductSplitSpanFeaturizer(lhs, rhs, keepJustA = false, keepJustB = lhsUnitized)
    case _ =>
      ProductSplitSpanFeaturizer(this, other)
  }

  /**
   * Builds the sum of this featurizer and `other`, flattening operands that are
   * already sums so the result is a single `SumSplitSpanFeaturizer`.
   *
   * The `unitized` flag of a sum operand is carried over (OR-ed when both are sums).
   */
  override def +(other: SurfaceFeaturizer[W]):SplitSpanFeaturizer[W] = (this, other) match {
    case (SumSplitSpanFeaturizer(mine, mineUnitized), SumSplitSpanFeaturizer(theirs, theirsUnitized)) =>
      SumSplitSpanFeaturizer(mine ++ theirs, mineUnitized || theirsUnitized)
    case (SumSplitSpanFeaturizer(mine, mineUnitized), single) =>
      // `single` is only known to be a SurfaceFeaturizer here, so lift it explicitly.
      SumSplitSpanFeaturizer(mine :+ SplitSpanFeaturizer.liftSurfaceFeaturizerToSplitSpan(single), mineUnitized)
    case (single, SumSplitSpanFeaturizer(theirs, theirsUnitized)) =>
      SumSplitSpanFeaturizer(single +: theirs, theirsUnitized)
    case _ =>
      SumSplitSpanFeaturizer(IndexedSeq(this, other))
  }

  /** Wraps this featurizer in a one-element sum whose `unitized` flag is set. */
  def +(unit: SplitSpanFeaturizer.unit.type) = new SumSplitSpanFeaturizer(IndexedSeq(this), unitized = true)
}
object SplitSpanFeaturizer {
// Singleton split-point marker; the DSL re-exports it as `split`.
object split extends SplitPointMarker
/**
 * Mix-in that brings the split-span building blocks into scope: the `split` and `unit`
 * markers, distance/length featurizers and the shape featurizer.
 */
trait DSL extends SurfaceFeaturizer.DSL {
  /** The split-point marker, re-exported so it can be written unqualified. */
  val split: SplitSpanFeaturizer.split.type = SplitSpanFeaturizer.split

  /** Modifier that pairs a word featurizer with `split` to produce a [[SplitFeaturizer]]. */
  implicit def splitWFModifier[W]: WordFeaturizer.Modifier[W, split.type, SplitFeaturizer[W]] =
    new Modifier[W, split.type, SplitFeaturizer[W]] {
      def apply(f: WordFeaturizer[W], t: split.type): SplitFeaturizer[W] = SplitFeaturizer(f)
    }

  /** A featurizer that never emits any feature. */
  def zeroSplit[W] = ZeroSplitSpanFeaturizer[W]()

  /** Binned distance from marker `b` to the split point, using the binner `db`. */
  def distance[W](b: MarkerPos, s: split.type, db: DistanceBinner = DistanceBinner()):SplitSpanFeaturizer[W] =
    new SplitSpanDistanceFeaturizer[W](b, s, db)

  /** Binned distance from the split point to marker `b`, using a default binner. */
  def distance[W](s: split.type, b: MarkerPos):SplitSpanFeaturizer[W] =
    new SplitSpanDistanceFeaturizer[W](s, b, DistanceBinner())

  /** Binned distance between two span markers, using a default binner. */
  def distance[W](m: MarkerPos, n: MarkerPos):SplitSpanFeaturizer[W] =
    new SplitSpanDistanceFeaturizer[W](m, n, DistanceBinner())

  /** Binned difference between the lengths of the two halves of a split span. */
  def relativeLength[W]:SplitSpanFeaturizer[W] =
    new RelativeLengthFeaturizer[W]()

  /** Shape featurizer over (begin, split, end) triples of String words. */
  val splitSpanShape = new SplitSpanShapeFeaturizer()

  /** Span featurizer built from a span's begin and end positions and the sentence length. */
  def distanceToSentenceBoundaries[W] = new DistanceToSentenceBoundariesFeaturizer[W]()

  /** The `unit` marker, re-exported so `featurizer + unit` can be written unqualified. */
  val unit: SplitSpanFeaturizer.unit.type = SplitSpanFeaturizer.unit
}
// Marker accepted by `SplitSpanFeaturizer.+`; adding it yields a sum whose `unitized` flag is true.
case object unit
/**
 * Views any [[SurfaceFeaturizer]] as a [[SplitSpanFeaturizer]].
 *
 * A value that already is a split-span featurizer is returned unchanged. Anything else is
 * wrapped so that span features are delegated to `surface` and split features are empty.
 */
implicit def liftSurfaceFeaturizerToSplitSpan[W](surface: SurfaceFeaturizer[W]):SplitSpanFeaturizer[W] = surface match {
  case alreadySplitSpan: SplitSpanFeaturizer[W] =>
    alreadySplitSpan
  case _ =>
    new SplitSpanFeaturizer[W] {
      def anchor(w: IndexedSeq[W]): SplitSpanFeatureAnchoring[W] = {
        // Anchor the wrapped featurizer once per sentence and reuse it for every span query.
        val spanAnchoring = surface.anchor(w)
        new SplitSpanFeatureAnchoring[W] {
          def featuresForSplit(begin: Int, split: Int, end: Int): Array[Feature] = emptyArray
          def featuresForSpan(begin: Int, end: Int): Array[Feature] = spanAnchoring.featuresForSpan(begin, end)
        }
      }
    }
}
/**
 * Emits one [[SplitShapeFeature]] per (begin, split, end) triple, carrying the shape
 * string computed by `SpanShapeGenerator.splitShapeFor`. Emits no span features.
 */
class SplitSpanShapeFeaturizer extends SplitSpanFeaturizer[String] {
  def anchor(w: IndexedSeq[String]): SplitSpanFeatureAnchoring[String] = new SplitSpanFeatureAnchoring[String] {
    def featuresForSplit(begin: Int, split: Int, end: Int): Array[Feature] = {
      val shape = SpanShapeGenerator.splitShapeFor(w, begin, split, end)
      Array[Feature](SplitShapeFeature(shape))
    }

    def featuresForSpan(begin: Int, end: Int): Array[Feature] = Array.empty[Feature]
  }
}
// Feature wrapping the shape string that SplitSpanShapeFeaturizer computes for a split span.
case class SplitShapeFeature(shape: String) extends Feature
/**
 * Emits a single `DistanceFeature` holding the binned distance between two markers.
 *
 * Each marker is either a `MarkerPos` (an offset from the span's begin or end) or a
 * `SplitPointMarker`. If either marker is the split point, the feature is emitted per
 * split; otherwise it is emitted per span.
 *
 * @param a  first marker
 * @param b  second marker
 * @param db binner applied to the pair of resolved positions
 */
class SplitSpanDistanceFeaturizer[W] private[SplitSpanFeaturizer](a: Any, b: Any, db: DistanceBinner = DistanceBinner()) extends SplitSpanFeaturizer[W] {
  val label = s"$a <-> $b"

  // Used when one of the markers is the split point: the feature lives on the split.
  private val theSplitNeedingAnchoring = new SplitSpanFeatureAnchoring[W] with Serializable {
    def featuresForSplit(begin: Int, split: Int, end: Int): Array[Feature] =
      Array[Feature](distanceFeature(begin, end, split))

    def featuresForSpan(begin: Int, end: Int): Array[Feature] = emptyArray
  }

  // Used when neither marker is the split point: the feature lives on the span.
  private val theNotSplitNeedingAnchoring = new SplitSpanFeatureAnchoring[W] with Serializable {
    def featuresForSplit(begin: Int, split: Int, end: Int): Array[Feature] = emptyArray

    // No split position exists here; -1 is passed as a placeholder that is never read,
    // because this anchoring is only selected when neither marker is a split marker.
    def featuresForSpan(begin: Int, end: Int): Array[Feature] =
      Array[Feature](distanceFeature(begin, end, -1))
  }

  /** Resolves both markers and wraps their binned distance in a labelled feature. */
  private def distanceFeature(begin: Int, end: Int, split: Int): Feature = {
    val from = markerToPos(a, begin, end, split)
    val to = markerToPos(b, begin, end, split)
    DistanceFeature(db.binnedDistance(from, to), label)
  }

  /** Turns a marker into an absolute position for the given span and split. */
  private def markerToPos(marker: Any, begin: Int, end: Int, split: Int): Int = marker match {
    case MarkerPos(offset, true) => begin + offset
    case MarkerPos(offset, false) => end + offset
    case _: SplitPointMarker => split
    case _ => ???
  }

  def anchor(w: IndexedSeq[W]): SplitSpanFeatureAnchoring[W] = {
    def isSplitMarker(marker: Any): Boolean = marker match {
      case _: SplitPointMarker => true
      case _ => false
    }

    if (isSplitMarker(a) || isSplitMarker(b)) theSplitNeedingAnchoring
    else theNotSplitNeedingAnchoring
  }
}
/**
 * Span featurizer emitting two features per span: a [[DistToBOSFeature]] binned from the
 * span's begin, and a [[DistToEOSFeature]] binned from the span's end and the sentence length.
 */
class DistanceToSentenceBoundariesFeaturizer[W] extends SurfaceFeaturizer[W] {
  val db: DistanceBinner = DistanceBinner()

  def anchor(words: IndexedSeq[W]): SurfaceFeatureAnchoring[W] = new SurfaceFeatureAnchoring[W] {
    def featuresForSpan(begin: Int, end: Int): Array[Feature] = {
      val fromBegin = DistToBOSFeature(db.binnedDistance(begin))
      val toSentenceEnd = DistToEOSFeature(db.binnedDistance(end, words.length))
      Array[Feature](fromBegin, toSentenceEnd)
    }
  }
}
// Binned value computed from a span's begin position (BOS presumably = beginning of sentence).
case class DistToBOSFeature(dist: Int) extends Feature
// Binned value computed from a span's end position and the sentence length (EOS presumably = end of sentence).
case class DistToEOSFeature(dist: Int) extends Feature
/**
 * Emits, for every split, one `DistanceFeature` holding the binned difference between the
 * lengths of the two halves: the length of [split,end) minus the length of [begin,split).
 * Emits no span features.
 *
 * @param db binner applied to the signed length difference
 * @tparam W word type; the words themselves are never inspected
 */
class RelativeLengthFeaturizer[W] private[SplitSpanFeaturizer](db: DistanceBinner = DistanceBinner()) extends SplitSpanFeaturizer[W] {
  val label = "RelativeDifference"

  // The anchoring does not depend on the sentence, so one instance serves every call.
  private val theSplitNeedingAnchoring = new SplitSpanFeatureAnchoring[W] with Serializable {
    def featuresForSplit(begin: Int, split: Int, end: Int): Array[Feature] = {
      val leftLength = split - begin
      val rightLength = end - split
      Array[Feature](DistanceFeature(db.binnedDistance(rightLength - leftLength), label))
    }

    def featuresForSpan(begin: Int, end: Int): Array[Feature] = emptyArray
  }

  def anchor(w: IndexedSeq[W]): SplitSpanFeatureAnchoring[W] = theSplitNeedingAnchoring
}
/**
 * Applies the word featurizer `f` to the word at the split index and wraps each resulting
 * feature in a [[SplitFeature]]. Emits no span features.
 */
case class SplitFeaturizer[W](f: WordFeaturizer[W]) extends SplitSpanFeaturizer[W] {
  def anchor(w: IndexedSeq[W]): SplitSpanFeatureAnchoring[W] = {
    val wordAnchoring = f.anchor(w)
    new SplitSpanFeatureAnchoring[W] {
      def featuresForSplit(begin: Int, split: Int, end: Int): Array[Feature] =
        wordAnchoring.featuresForWord(split).map(wordFeature => SplitFeature(wordFeature))

      def featuresForSpan(begin: Int, end: Int): Array[Feature] = emptyArray
    }
  }
}
/** Featurizer whose anchoring returns the shared empty array for every span and every split. */
case class ZeroSplitSpanFeaturizer[W]() extends SplitSpanFeaturizer[W] {
  def anchor(w: IndexedSeq[W]): SplitSpanFeatureAnchoring[W] = new SplitSpanFeatureAnchoring[W] {
    def featuresForSplit(begin: Int, split: Int, end: Int): Array[Feature] = emptyArray

    def featuresForSpan(begin: Int, end: Int): Array[Feature] = emptyArray
  }
}
/**
 * Sum of several featurizers: features for a span or split are the concatenation of each
 * summand's features, in the order of `prods`.
 *
 * @param prods    the summands
 * @param unitized whether `unit` was added to this sum; read by `SplitSpanFeaturizer.*`
 */
case class SumSplitSpanFeaturizer[W](prods: IndexedSeq[SplitSpanFeaturizer[W]], unitized: Boolean = false) extends SplitSpanFeaturizer[W] {
  def anchor(w: IndexedSeq[W]): SplitSpanFeatureAnchoring[W] = {
    // Anchor every summand once per sentence.
    val summands = prods.map(_.anchor(w)).toArray
    new SplitSpanFeatureAnchoring[W] {
      def featuresForSpan(begin: Int, end: Int): Array[Feature] =
        summands.flatMap(summand => summand.featuresForSpan(begin, end))

      def featuresForSplit(begin: Int, split: Int, end: Int): Array[Feature] =
        Arrays.concatenate(summands.map(summand => summand.featuresForSplit(begin, split, end)): _*)
    }
  }

  /** Returns a copy of this sum with the `unitized` flag set. */
  override def +(unit: SplitSpanFeaturizer.unit.type) = copy(unitized = true)
}
/**
 * Product of two featurizers: emits `CrossProductFeature`s pairing features of `a` with
 * features of `b`, optionally followed by the operands' own features.
 *
 * @param a         left operand
 * @param b         right operand
 * @param keepJustA also emit `a`'s own features alongside the cross products
 * @param keepJustB also emit `b`'s own features alongside the cross products
 */
case class ProductSplitSpanFeaturizer[W](a: SplitSpanFeaturizer[W], b: SplitSpanFeaturizer[W], keepJustA: Boolean = false, keepJustB: Boolean = false) extends SplitSpanFeaturizer[W] {
  def anchor(w: IndexedSeq[W]): SplitSpanFeatureAnchoring[W] = {
    val left = a.anchor(w)
    val right = b.anchor(w)

    new SplitSpanFeatureAnchoring[W] {
      /** Span features: a-span x b-span, then whichever operands are kept on their own. */
      def featuresForSpan(begin: Int, end: Int): Array[Feature] = {
        val afeats: Array[Feature] = left.featuresForSpan(begin, end)
        val bfeats: Array[Feature] = right.featuresForSpan(begin, end)
        val cross: Array[Feature] = Arrays.crossProduct(afeats, bfeats)((l, r) => CrossProductFeature(l, r))
        (keepJustA, keepJustB) match {
          case (true, true) => Arrays.concatenate[Feature](cross, afeats, bfeats)
          case (true, false) => Arrays.concatenate[Feature](cross, afeats)
          case (false, true) => Arrays.concatenate[Feature](cross, bfeats)
          case (false, false) => cross
        }
      }

      /**
       * Split features: every pairing that involves at least one split feature
       * (a-split x b-span, a-split x b-split, a-span x b-split), then the kept operands'
       * own split features.
       */
      def featuresForSplit(begin: Int, split: Int, end: Int): Array[Feature] = {
        val aSplit: Array[Feature] = left.featuresForSplit(begin, split, end)
        val bSplit: Array[Feature] = right.featuresForSplit(begin, split, end)
        val aSpan: Array[Feature] = left.featuresForSpan(begin, end)
        val bSpan: Array[Feature] = right.featuresForSpan(begin, end)

        val splitBySpan: Array[Feature] =
          Arrays.crossProduct(aSplit, bSpan)((l, r) => CrossProductFeature(l, r, "Split"))
        val splitBySplit: Array[Feature] =
          Arrays.crossProduct(aSplit, bSplit)((l, r) => CrossProductFeature(l, r, "Split"))
        val spanBySplit: Array[Feature] =
          Arrays.crossProduct(aSpan, bSplit)((l, r) => CrossProductFeature(l, r, "Split"))

        val justA: Seq[Array[Feature]] = if (keepJustA) Seq(aSplit) else Seq.empty
        val justB: Seq[Array[Feature]] = if (keepJustB) Seq(bSplit) else Seq.empty
        val pieces: Seq[Array[Feature]] = Seq(splitBySpan, splitBySplit, spanBySplit) ++ justA ++ justB

        Arrays.concatenate(pieces: _*)
      }
    }
  }
}
// Marker type for "the split point"; prints as "split", which is what appears in distance-feature labels.
sealed trait SplitPointMarker extends Serializable { override def toString = "split"}
// Single shared empty feature array, returned wherever an anchoring has nothing to emit.
private val emptyArray = Array.empty[Feature]
}
// A surface anchoring that can additionally produce features for a (begin, split, end) triple.
trait SplitSpanFeatureAnchoring[W] extends SurfaceFeatureAnchoring[W] {
// Features for the span [begin, end) divided at `split`.
def featuresForSplit(begin: Int, split: Int, end: Int):Array[Feature]
}
// A split-span featurizer whose anchorings return integer feature ids instead of Feature objects.
trait IndexedSplitSpanFeaturizer[W] {
// Anchors this featurizer to the sentence `w`.
def anchor(w: IndexedSeq[W]):IndexedSplitSpanFeatureAnchoring[W]
// The index that maps Feature objects to the ids returned by anchorings.
def featureIndex: Index[Feature]
}
// Indexed counterpart of SplitSpanFeatureAnchoring: split features are returned as integer ids.
trait IndexedSplitSpanFeatureAnchoring[W] extends IndexedSurfaceAnchoring[W] {
// Feature ids for the span [begin, end) divided at `split`.
def featuresForSplit(begin: Int, split: Int, end: Int):Array[Int]
}
object IndexedSplitSpanFeaturizer {

  /**
   * Builds an indexed featurizer by running `f` over every node of every training tree
   * and collecting the features it emits into an index.
   *
   * Binary nodes contribute both their span features and the features of their split
   * point; all other nodes contribute span features only.
   *
   * @param f                   the featurizer to index
   * @param trees               training trees with their words
   * @param hashFeatures        scale passed to `HashExtendingIndex`; when it yields zero
   *                            hash features for the built index, the plain index is used
   * @param bloomFilter         use a `ThreadLocalBloomFilter` as the seen-set of the hash
   *                            extension instead of `AlwaysSeenSet`
   * @param deduplicateFeatures build the index with `NonRedundantIndexBuilder` rather
   *                            than `NormalIndexBuilder`
   */
  def fromData[L, W](f: SplitSpanFeaturizer[W],
                     trees: IndexedSeq[TreeInstance[L, W]],
                     hashFeatures: HashFeature.Scale = HashFeature.Relative(1.0),
                     bloomFilter: Boolean = false,
                     deduplicateFeatures: Boolean = false):IndexedSplitSpanFeaturizer[W] = {
    // Kept as a `def` so the filter is only constructed if the hash extension is actually built.
    def seenSet = if (bloomFilter) new ThreadLocalBloomFilter[Long](8 * 1024 * 1024 * 50, 3) else AlwaysSeenSet
    val builder = if (deduplicateFeatures) new NonRedundantIndexBuilder[Feature] else new NormalIndexBuilder[Feature]

    trees.foreach { instance =>
      val anchoring = f.anchor(instance.words)
      instance.tree.allChildren.foreach {
        case binary @ BinaryTree(_, _, _, span) =>
          val spanFeatures = anchoring.featuresForSpan(span.begin, span.end)
          val splitFeatures = anchoring.featuresForSplit(span.begin, binary.splitPoint, span.end)
          builder.add(spanFeatures ++ splitFeatures)
        case node =>
          builder.add(anchoring.featuresForSpan(node.span.begin, node.span.end))
      }
    }

    val index = builder.result()
    val finalIndex: Index[Feature] =
      if (hashFeatures.numFeatures(index.size) != 0) new HashExtendingIndex(index, HashFeature(_), hashFeatures, seenSet)
      else index
    new BasicIndexedSplitSpanFeaturizer(f, finalIndex)
  }

  /**
   * Indexed featurizer that delegates to `f` and maps each emitted feature through
   * `featureIndex`.
   */
  class BasicIndexedSplitSpanFeaturizer[W](f: SplitSpanFeaturizer[W], val featureIndex: Index[Feature]) extends IndexedSplitSpanFeaturizer[W] with Serializable {
    def anchor(w: IndexedSeq[W]): IndexedSplitSpanFeatureAnchoring[W] = {
      val delegate = f.anchor(w)
      new IndexedSplitSpanFeatureAnchoring[W] {
        def words: IndexedSeq[W] = w

        def featuresForSpan(begin: Int, end: Int): Array[Int] =
          delegate.featuresForSpan(begin, end).map(feature => featureIndex(feature))

        def featuresForSplit(begin: Int, split: Int, end: Int): Array[Int] =
          delegate.featuresForSplit(begin, split, end).map(feature => featureIndex(feature))
      }
    }
  }
}
// Wraps another feature to mark it as computed at a split point (see SplitFeaturizer).
case class SplitFeature(x: Feature) extends Feature