This repository has been archived by the owner on Feb 19, 2020. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 82
/
TagDictionaryFeaturizer.scala
94 lines (85 loc) · 2.78 KB
/
TagDictionaryFeaturizer.scala
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
package epic.features
import epic.framework.Feature
import breeze.linalg._
import breeze.util.{Encoder, Index}
import epic.features.TagDictionaryFeaturizer._
import scala.collection.mutable
/**
 * Word featurizer driven by a tag dictionary of (label, word) counts.
 *
 * Two feature tables are precomputed for every word that occurs in `counts`:
 *
 *  - `argmaxes`: a [[TagDictionaryFeaturizer.MostCommonTagFeature]] for each label whose
 *    count equals the word's total count. Only produced for words whose total count is
 *    greater than 2 and below `commonWordThreshold`; all other words get no such feature.
 *  - `variants`: features describing known spelling variants of the word (see
 *    `variantFeatures`). Only produced for words whose total count is below
 *    `commonWordThreshold`.
 *
 * Words that are not in the dictionary get no argmax features, and their variant
 * features are computed on demand when a sentence is anchored.
 *
 * @param counts              counts keyed by (label, word)
 * @param commonWordThreshold words whose total count reaches this value are considered
 *                            common and receive no features from this featurizer
 * @tparam L the label (tag) type
 * @author dlwh
 */
@SerialVersionUID(1L)
class TagDictionaryFeaturizer[L](counts: Counter2[L, String, Double], commonWordThreshold: Int = 80) extends WordFeaturizer[String] with Serializable {
  // NOTE: private field names are kept stable because instances are Java-serialized.
  private val wordIndex = Index(counts.keysIterator.map(_._2))
  private val labelIndices = counts.keysIterator.map(_._1).map(l => l -> MostCommonTagFeature(l)).toMap
  private val emptyArray = Array.empty[Feature]

  // Per-word label features, indexed by position in wordIndex.
  private val argmaxes = Encoder.fromIndex(wordIndex).tabulateArray { w =>
    val totalCount = sum(counts(::, w))
    if (totalCount >= commonWordThreshold || totalCount <= 2) {
      // Too common or too rare to contribute a dictionary feature.
      emptyArray
    } else {
      // Keep only labels that account for the word's whole count.
      counts(::, w).iterator.filter(_._2 == totalCount).map(_._1).map(labelIndices).toArray[Feature]
    }
  }

  // Per-word variant features, indexed by position in wordIndex.
  private val variants = Encoder.fromIndex(wordIndex).tabulateArray { w =>
    val totalCount = sum(counts(::, w))
    if (totalCount < commonWordThreshold) variantFeatures(w) else emptyArray
  }

  /**
   * Builds features that relate `w` to other dictionary entries:
   *
   *  - if `w` starts with an upper-case character and its lower-cased form has a nonzero
   *    count, a [[TagDictionaryFeaturizer.HasKnownLowerCaseVariant]] carrying that form's
   *    argmax label;
   *  - if `w` contains a '-' and the text after the last '-' has a nonzero count, a
   *    [[TagDictionaryFeaturizer.HasKnownAfterDashSuffix]] carrying that text's argmax label.
   *
   * @param w the word to inspect; may be empty
   * @return the (possibly empty) array of variant features
   */
  private def variantFeatures(w: String): Array[Feature] = {
    val arr = mutable.ArrayBuilder.make[Feature]

    // Guard against empty tokens: w(0) would throw on "".
    if (w.nonEmpty && w(0).isUpper) {
      val lowerCounts = counts(::, w.toLowerCase)
      if (sum(lowerCounts) != 0.0) {
        arr += HasKnownLowerCaseVariant(lowerCounts.argmax)
      }
    }

    val dashIndex = w.lastIndexOf('-')
    if (dashIndex >= 0) {
      // Skip the dash itself so that "well-known" looks up "known" rather than "-known".
      val afterDash = w.substring(dashIndex + 1)
      val suffixCounts = counts(::, afterDash)
      if (sum(suffixCounts) != 0.0) {
        arr += HasKnownAfterDashSuffix(suffixCounts.argmax)
      }
    }

    arr.result()
  }

  /**
   * Anchors the featurizer to a sentence, resolving the features of every word up front.
   *
   * @param w the words of the sentence
   * @return an anchoring whose `featuresForWord` serves the precomputed features
   */
  def anchor(w: IndexedSeq[String]): WordFeatureAnchoring[String] = new WordFeatureAnchoring[String] {
    // Dictionary index of each word; negative for words not in the dictionary.
    val indices = w.map(wordIndex)

    val myArgmaxes = indices.map { i =>
      if (i < 0) emptyArray else argmaxes(i)
    }

    // Unknown words have no cached entry, so their variants are computed here.
    val variants: IndexedSeq[Array[Feature]] = indices.zipWithIndex.map { case (i, pos) =>
      if (i < 0) variantFeatures(w(pos)) else TagDictionaryFeaturizer.this.variants(i)
    }

    /**
     * @param pos position in the sentence; may lie outside of it
     * @return the features of the word at `pos`, or a single out-of-bounds indicator
     *         feature when `pos` is not a valid position
     */
    def featuresForWord(pos: Int): Array[Feature] = {
      if (pos < 0 || pos >= w.length) {
        // Same symbol as the literal 'OutOfBounds, spelled without the deprecated literal syntax.
        Array(IndicatorWSFeature(Symbol("OutOfBounds")))
      } else {
        val am = myArgmaxes(pos)
        val vs = variants(pos)
        // Avoid allocating a new array when there is nothing to append.
        if (vs.length != 0) am ++ vs else am
      }
    }

    def words: IndexedSeq[String] = w
  }
}
/**
 * Feature types emitted by the TagDictionaryFeaturizer class. Each feature wraps a
 * single label `l`.
 */
object TagDictionaryFeaturizer {
/** Label `l` accounts for the entire dictionary count of the word. */
case class MostCommonTagFeature[L](l: L) extends Feature
/** The word is capitalized and its lower-cased form is in the dictionary with argmax label `l`. */
case class HasKnownLowerCaseVariant[L](l: L) extends Feature
/** The dash-delimited trailing part of the word is in the dictionary with argmax label `l`. */
case class HasKnownAfterDashSuffix[L](l: L) extends Feature
}