This repository has been archived by the owner on Feb 19, 2020. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 82
/
BilexicalFeaturizer.scala
233 lines (193 loc) · 9.85 KB
/
BilexicalFeaturizer.scala
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
package epic.features
import epic.framework.Feature
import epic.trees.DependencyTree
import epic.features.BilexicalFeaturizer._
import epic.features.WordFeaturizer.Modifier
import epic.util.Arrays
import epic.features.BilexicalFeaturizer.HeadFeaturizer
import epic.features.BilexicalFeaturizer.DepFeaturizer
import epic.features.BilexicalFeaturizer.SumBilexicalFeaturizer
import epic.features.BilexicalFeaturizer.HeadDepFeaturizer
import breeze.util.Index
/**
 * A featurizer for (head, dependent) attachments between two positions of a sentence.
 *
 * A featurizer is first bound to a concrete sentence with `anchor`; the resulting
 * [[BilexicalFeatureAnchoring]] is then queried with pairs of word indices. Featurizers
 * can be combined with `*` (product) and `+` (sum).
 *
 * @tparam W the word type of the sentences being featurized
 * @author dlwh
 **/
trait BilexicalFeaturizer[W] {

  /** Binds this featurizer to the sentence `w`. */
  def anchor(w: IndexedSeq[W]):BilexicalFeatureAnchoring[W]

  /**
   * Product of this featurizer with `other`.
   *
   * A [[BilexicalFeaturizer.HeadFeaturizer]] combined with a
   * [[BilexicalFeaturizer.DepFeaturizer]] (in either order) collapses into a single
   * [[BilexicalFeaturizer.HeadDepFeaturizer]] over their base word featurizers. Every
   * other combination is wrapped in a [[BilexicalFeaturizer.ProductBilexicalFeaturizer]].
   * The result type is left to inference, as in the original declaration.
   */
  def *(other: BilexicalFeaturizer[W]) = (this, other) match {
    // The two head/dep cases are mutually exclusive, so their order is irrelevant.
    case (depSide: DepFeaturizer[W], headSide: HeadFeaturizer[W]) =>
      new HeadDepFeaturizer(headSide.base, depSide.base)
    case (headSide: HeadFeaturizer[W], depSide: DepFeaturizer[W]) =>
      new HeadDepFeaturizer(headSide.base, depSide.base)
    case _ =>
      new ProductBilexicalFeaturizer(this, other)
  }

  /**
   * Sum of this featurizer with `other`.
   *
   * Existing sums are flattened rather than nested: the members of either operand that
   * is already a [[BilexicalFeaturizer.SumBilexicalFeaturizer]] are spliced into the
   * result, keeping left-to-right order.
   */
  def +(other: BilexicalFeaturizer[W]):BilexicalFeaturizer[W] = this match {
    case SumBilexicalFeaturizer(mine) =>
      other match {
        case SumBilexicalFeaturizer(theirs) => SumBilexicalFeaturizer(mine ++ theirs)
        case single => SumBilexicalFeaturizer(mine :+ single)
      }
    case _ =>
      other match {
        case SumBilexicalFeaturizer(theirs) => SumBilexicalFeaturizer(this +: theirs)
        case _ => SumBilexicalFeaturizer(IndexedSeq(this, other))
      }
  }
}
object BilexicalFeaturizer {
/**
 * Mix-in offering a small vocabulary for assembling bilexical featurizers.
 */
trait DSL {

  /** Marker object; `headWFModifier` is the `WordFeaturizer.Modifier` keyed on `head.type`. */
  object head

  /** Marker object; `depWFModifier` is the `WordFeaturizer.Modifier` keyed on `dep.type`. */
  object dep

  /** Uses the same word featurizer for both the head and the dependent. */
  def bilex[W](f: WordFeaturizer[W]):BilexicalFeaturizer[W] = bilex(f, f)

  /** Pairs a head word featurizer with a dependent word featurizer. */
  def bilex[W](head: WordFeaturizer[W], dep: WordFeaturizer[W]):BilexicalFeaturizer[W] =
    HeadDepFeaturizer(headBase = head, depBase = dep)

  /** Distance featurizer with the default binner, fixed to `String` words. */
  lazy val distance = DistanceFeaturizer[String]()

  /** Lifts a span featurizer so that it can be queried with (head, dep) pairs. */
  def adaptSpanFeaturizer[W](f: SurfaceFeaturizer[W]) = AdaptedSurfaceFeaturizer[W](f)

  /** Combines `f` with a distance featurizer built on `db` through a [[BinomialFeaturizer]]. */
  def withDistance[W](f: BilexicalFeaturizer[W], db: DistanceBinner = new DistanceBinner()) =
    BinomialFeaturizer[W](f, DistanceFeaturizer[W](db))

  /** Modifier turning a word featurizer into a [[HeadFeaturizer]] when given `head`. */
  implicit def headWFModifier[W]: WordFeaturizer.Modifier[W, head.type, HeadFeaturizer[W]] =
    new WordFeaturizer.Modifier[W, head.type, HeadFeaturizer[W]] {
      def apply(f: WordFeaturizer[W], t: head.type): HeadFeaturizer[W] = HeadFeaturizer(f)
    }

  /** Modifier turning a word featurizer into a [[DepFeaturizer]] when given `dep`. */
  implicit def depWFModifier[W]: WordFeaturizer.Modifier[W, dep.type, DepFeaturizer[W]] =
    new WordFeaturizer.Modifier[W, dep.type, DepFeaturizer[W]] {
      def apply(f: WordFeaturizer[W], t: dep.type): DepFeaturizer[W] = DepFeaturizer(f)
    }
}
/**
 * Sum of several bilexical featurizers: the features of an attachment are the features
 * of every member, concatenated in member order.
 *
 * @param prods the member featurizers
 */
case class SumBilexicalFeaturizer[W](prods: IndexedSeq[BilexicalFeaturizer[W]]) extends BilexicalFeaturizer[W] {
  def anchor(w: IndexedSeq[W]): BilexicalFeatureAnchoring[W] = {
    // Anchor every member once, up front, when the sentence is bound.
    val members = prods.map(_.anchor(w)).toArray
    new BilexicalFeatureAnchoring[W] {
      def featuresForAttachment(head: Int, dep: Int): Array[Feature] =
        members.flatMap(member => member.featuresForAttachment(head, dep))
    }
  }
}
/**
 * Adapts a span featurizer to attachments: the span featurizer is queried with the two
 * attachment indices, smaller index first, regardless of which one is the head.
 *
 * @param base the span featurizer being adapted
 */
case class AdaptedSurfaceFeaturizer[W](base: SurfaceFeaturizer[W]) extends BilexicalFeaturizer[W] {
  def anchor(w: IndexedSeq[W]): BilexicalFeatureAnchoring[W] = {
    val spans = base.anchor(w)
    new BilexicalFeatureAnchoring[W] {
      def featuresForAttachment(head: Int, dep: Int): Array[Feature] =
        spans.featuresForSpan(math.min(head, dep), math.max(head, dep))
    }
  }
}
/**
 * Featurizes an attachment by looking only at its head: each feature the base word
 * featurizer emits for the head position is wrapped in a [[HeadFeature]].
 *
 * @param base the word featurizer applied to the head position
 */
case class HeadFeaturizer[W](base: WordFeaturizer[W]) extends BilexicalFeaturizer[W] {
  def anchor(w: IndexedSeq[W]): BilexicalFeatureAnchoring[W] = {
    val wordAnchoring = base.anchor(w)
    new BilexicalFeatureAnchoring[W] {
      // `dep` is ignored on purpose: only the head position contributes.
      def featuresForAttachment(head: Int, dep: Int): Array[Feature] =
        for (wordFeature <- wordAnchoring.featuresForWord(head)) yield (HeadFeature(wordFeature): Feature)
    }
  }
}
/**
 * Featurizes an attachment by looking only at its dependent: each feature the base word
 * featurizer emits for the dependent position is wrapped in a [[DepFeature]].
 *
 * @param base the word featurizer applied to the dependent position
 */
case class DepFeaturizer[W](base: WordFeaturizer[W]) extends BilexicalFeaturizer[W] {
  def anchor(w: IndexedSeq[W]): BilexicalFeatureAnchoring[W] = {
    val wordAnchoring = base.anchor(w)
    new BilexicalFeatureAnchoring[W] {
      // `head` is ignored on purpose: only the dependent position contributes.
      def featuresForAttachment(head: Int, dep: Int): Array[Feature] =
        for (wordFeature <- wordAnchoring.featuresForWord(dep)) yield (DepFeature(wordFeature): Feature)
    }
  }
}
/**
 * Combines two bilexical featurizers "binomially": for an attachment it emits the
 * features of the first, the features of the second, and every pairwise
 * `CrossProductFeature` of the two, in that order.
 *
 * Both components are queried with the same (head, dep) pair.
 *
 * @param headBase the first component featurizer
 * @param depBase  the second component featurizer
 */
case class BinomialFeaturizer[W](headBase: BilexicalFeaturizer[W], depBase: BilexicalFeaturizer[W]) extends BilexicalFeaturizer[W] {
  def anchor(w: IndexedSeq[W]): BilexicalFeatureAnchoring[W] = {
    val first = headBase.anchor(w)
    // When both components are the very same instance, anchor it only once.
    val second = if (depBase eq headBase) first else depBase.anchor(w)
    new BilexicalFeatureAnchoring[W] {
      def featuresForAttachment(head: Int, dep: Int): Array[Feature] = {
        val firstFeats = first.featuresForAttachment(head, dep)
        val secondFeats = second.featuresForAttachment(head, dep)
        val pairs = Arrays.crossProduct(firstFeats, secondFeats)((x, y) => CrossProductFeature(x, y): Feature)
        Arrays.concatenate(firstFeats, secondFeats, pairs)
      }
    }
  }
}
/**
 * Crosses word features of the head with word features of the dependent: every pair is
 * emitted as a [[HeadDepFeature]].
 *
 * @param headBase word featurizer applied to the head position
 * @param depBase  word featurizer applied to the dependent position
 */
case class HeadDepFeaturizer[W](headBase: WordFeaturizer[W], depBase: WordFeaturizer[W]) extends BilexicalFeaturizer[W] {
  def anchor(w: IndexedSeq[W]): BilexicalFeatureAnchoring[W] = {
    val headWords = headBase.anchor(w)
    // When both sides use the very same featurizer instance, anchor it only once.
    val depWords = if (depBase eq headBase) headWords else depBase.anchor(w)
    new BilexicalFeatureAnchoring[W] {
      def featuresForAttachment(head: Int, dep: Int): Array[Feature] = {
        val headFeats = headWords.featuresForWord(head)
        val depFeats = depWords.featuresForWord(dep)
        Arrays.crossProduct(headFeats, depFeats)((h, d) => HeadDepFeature(h, d): Feature)
      }
    }
  }
}
/**
 * Generic product of two bilexical featurizers: for an attachment it emits one
 * `CrossProductFeature` labelled "BilexCross" for every pair of features drawn from
 * `a` and `b`. Unlike [[BinomialFeaturizer]], the component features themselves are
 * not emitted.
 *
 * @param a the left factor
 * @param b the right factor
 */
case class ProductBilexicalFeaturizer[W](a: BilexicalFeaturizer[W], b: BilexicalFeaturizer[W]) extends BilexicalFeaturizer[W] {
  def anchor(w: IndexedSeq[W]): BilexicalFeatureAnchoring[W] = {
    val left = a.anchor(w)
    val right = b.anchor(w)
    new BilexicalFeatureAnchoring[W] {
      def featuresForAttachment(head: Int, dep: Int): Array[Feature] = {
        val leftFeats = left.featuresForAttachment(head, dep)
        val rightFeats = right.featuresForAttachment(head, dep)
        Arrays.crossProduct(leftFeats, rightFeats)((x, y) => CrossProductFeature(x, y, "BilexCross"))
      }
    }
  }
}
/**
 * Emits a single [[DistanceFeature]] holding the binned distance between head and
 * dependent, as computed by `db`.
 *
 * @param db the binner mapping a (head, dep) pair to a distance bin
 */
case class DistanceFeaturizer[W](db: DistanceBinner = DistanceBinner()) extends BilexicalFeaturizer[W] {

  // The features depend only on the two indices, never on the words themselves, so one
  // anchoring instance is shared by every sentence.
  private val theAnchoring = new BilexicalFeatureAnchoring[W] with Serializable {
    def featuresForAttachment(head: Int, dep: Int): Array[Feature] = {
      val bin = db.binnedDistance(head, dep)
      Array[Feature](DistanceFeature(bin))
    }
  }

  /** Returns the shared anchoring; `w` is not inspected. */
  def anchor(w: IndexedSeq[W]): BilexicalFeatureAnchoring[W] = theAnchoring
}
}
/**
 * A [[BilexicalFeaturizer]] bound to one particular sentence.
 *
 * @tparam W the word type of the sentence
 */
trait BilexicalFeatureAnchoring[W] {
/** Features for attaching the word at index `dep` to the word at index `head`. */
def featuresForAttachment(head: Int, dep: Int):Array[Feature]
}
/**
 * Indexed bilexical featurizer whose feature ids come from a cross-product index over
 * the ids produced by an indexed head word featurizer and an indexed dependent word
 * featurizer.
 *
 * @param headFeaturizer indexed word featurizer applied to the head position
 * @param depFeaturizer  indexed word featurizer applied to the dependent position
 * @param featureIndex   the cross-product index used to combine the two id arrays
 */
@SerialVersionUID(1L)
class ProductIndexedBilexicalFeaturizer[W](headFeaturizer: IndexedWordFeaturizer[W],
                                           depFeaturizer: IndexedWordFeaturizer[W],
                                           val featureIndex: CrossProductIndex[Feature, Feature]) extends IndexedBilexicalFeaturizer[W] with Serializable {
  def anchor(w: IndexedSeq[W]): IndexedBilexicalFeatureAnchoring[W] = new IndexedBilexicalFeatureAnchoring[W] {
    def words: IndexedSeq[W] = w

    val headAnchoring = headFeaturizer.anchor(w)
    val depAnchoring = depFeaturizer.anchor(w)

    // cache(head)(dep) memoizes the feature ids of an attachment; a slot stays null
    // until that attachment is requested for the first time.
    val cache = Array.ofDim[Array[Int]](w.length, w.length)

    def featuresForAttachment(head: Int, dep: Int): Array[Int] = {
      val row = cache(head)
      val cached = row(dep)
      if (cached ne null) cached
      else {
        val computed = featureIndex.crossProduct(
          headAnchoring.featuresForWord(head),
          depAnchoring.featuresForWord(dep),
          usePlainLabelFeatures = false)
        row(dep) = computed
        computed
      }
    }
  }
}
/**
 * A feature made of a head component and a dependent component, both untyped (`Any`).
 * It is not constructed anywhere in the visible part of this file.
 */
case class BilexicalFeature(head: Any, dep: Any) extends Feature
/**
 * Counterpart of [[BilexicalFeaturizer]] whose anchorings return `Int` feature ids
 * instead of [[epic.framework.Feature]] objects.
 *
 * @tparam W the word type of the sentences being featurized
 */
trait IndexedBilexicalFeaturizer[W] {
/** Binds this featurizer to the sentence `w`. */
def anchor(w: IndexedSeq[W]):IndexedBilexicalFeatureAnchoring[W]
/** The feature index exposed by this featurizer; the implementations in this file derive their ids from it. */
def featureIndex: Index[Feature]
}
/**
 * An [[IndexedBilexicalFeaturizer]] bound to one particular sentence.
 *
 * @tparam W the word type of the sentence
 */
trait IndexedBilexicalFeatureAnchoring[W] {
/** Integer feature ids for attaching the word at index `dep` to the word at index `head`. */
def featuresForAttachment(head: Int, dep: Int):Array[Int]
}
object IndexedBilexicalFeaturizer {
/**
 * Builds an indexed featurizer from `f` by indexing every feature that `f` produces on
 * the arcs of the given dependency trees, then extending that index with hash features.
 *
 * Arcs whose head index is not smaller than the sentence length are skipped
 * (presumably the root attachment; confirm against `DependencyTree.arcs`).
 *
 * @param f            the featurizer to index
 * @param depTrees     the trees whose arcs supply the observed features
 * @param hashFeatures scale passed on to the `HashExtendingIndex`
 */
def fromData[L, W](f: BilexicalFeaturizer[W],
                   depTrees: IndexedSeq[DependencyTree[L, W]],
                   hashFeatures: HashFeature.Scale = HashFeature.Relative(1.0)):IndexedBilexicalFeaturizer[W] = {
  val observed = Index[Feature]()
  depTrees.foreach { tree =>
    val anchoring = f.anchor(tree.words)
    for {
      (head, dep) <- tree.arcs
      if head < tree.words.length
      feature <- anchoring.featuresForAttachment(head, dep)
    } observed.index(feature)
  }
  new BasicIndexedBilexicalFeaturizer(
    f,
    new HashExtendingIndex(observed, HashFeature(_), hashFeatures))
}
/**
 * Indexed featurizer that delegates to a plain [[BilexicalFeaturizer]] and maps each
 * resulting feature through `featureIndex`.
 *
 * @param f            the underlying featurizer
 * @param featureIndex the index used to turn features into ids
 */
class BasicIndexedBilexicalFeaturizer[W](f: BilexicalFeaturizer[W], val featureIndex: Index[Feature]) extends IndexedBilexicalFeaturizer[W] with Serializable {
  def anchor(w: IndexedSeq[W]): IndexedBilexicalFeatureAnchoring[W] = {
    val anchored = f.anchor(w)
    new IndexedBilexicalFeatureAnchoring[W] {
      def featuresForAttachment(head: Int, dep: Int): Array[Int] =
        for (feature <- anchored.featuresForAttachment(head, dep)) yield featureIndex(feature)
    }
  }
}
/*
def fromData[L, W](headFeaturizer: IndexedWordFeaturizer[W],
depFeaturizer: IndexedWordFeaturizer[W],
depTrees: IndexedSeq[DependencyTree[L, W]],
hashFeatures: HashFeature.Scale = HashFeature.Relative(1.0)):IndexedBilexicalFeaturizer[W] = {
val builder = new CrossProductIndex.Builder(headFeaturizer.featureIndex, depFeaturizer.featureIndex, hashFeatures, "Bilexical")
for (tree <- depTrees) {
val hanch = headFeaturizer.anchor(tree.words)
val danch = depFeaturizer.anchor(tree.words)
for( (head, dep) <- tree.arcs if head < tree.words.length) {
builder.add(hanch.featuresForWord(head),
danch.featuresForWord(dep))
// builder.add(danch.featuresForWord(head),
// hanch.featuresForWord(dep))
}
}
val index = builder.result()
new ProductIndexedBilexicalFeaturizer[W](headFeaturizer, depFeaturizer, index)
}
*/
}
/** Common supertype of the lexical feature types declared below. */
trait LexFeature extends Feature
/** Wraps a feature `r`; in this file HeadFeaturizer wraps the head word's features in it. The type parameter `P` is not used by the declaration. */
case class HeadFeature[P](r: Feature) extends LexFeature
/** Wraps a feature `r`; in this file DepFeaturizer wraps the dependent word's features in it. The type parameter `P` is not used by the declaration. */
case class DepFeature[P](r: Feature) extends LexFeature
/** Pairs a head feature with a dependent feature; in this file HeadDepFeaturizer builds one per pair. The type parameter `P` is not used by the declaration. */
case class HeadDepFeature[P](head: Feature, dep: Feature) extends LexFeature
/** A distance together with an untyped payload `f`. Not constructed in the visible part of this file. */
case class DistFeature(distance: Int, f: Any) extends LexFeature
/** A distance with a label that defaults to "Span"; in this file DistanceFeaturizer builds it from a binned distance. */
case class DistanceFeature(distance: Int, label: String = "Span")extends LexFeature
/** Right-attachment feature carrying a distance. Not constructed in the visible part of this file. */
case class AttachRight(distance: Int) extends LexFeature
/** Same-named case object: a right-attachment feature without a distance. */
case object AttachRight extends LexFeature
/** Left-attachment feature carrying a distance. Not constructed in the visible part of this file. */
case class AttachLeft(distance: Int) extends LexFeature
/** Same-named case object: a left-attachment feature without a distance. */
case object AttachLeft extends LexFeature