This repository has been archived by the owner on Feb 19, 2020. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 82
/
WordClassFeaturizer.scala
57 lines (46 loc) · 1.65 KB
/
WordClassFeaturizer.scala
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
package epic.features
import breeze.linalg.Counter
import epic.framework.Feature
import breeze.util.{Encoder, Interner, Index}
import scala.collection.immutable
/**
 * Featurizer that describes every word with exactly one feature.
 *
 *  - A word present in `wordCounts` whose count is strictly greater than
 *    `functionWordThreshold` gets an `IndicatorFeature` of the word itself
 *    (presumably these frequent words are the "function words" the parameter
 *    name refers to).
 *  - Any other word, whether a low-count vocabulary entry or a word absent
 *    from `wordCounts`, gets a `WordFeature` wrapping the output of
 *    `EnglishWordClassGenerator`, tagged with the symbol `Class`.
 *  - Positions outside the sentence get a single `BoundaryFeature`.
 *
 * Features for the known vocabulary are built once at construction time and
 * the same arrays are handed out by every anchoring.
 *
 * @param wordCounts            counts per word; the key set is the known vocabulary
 * @param functionWordThreshold a known word needs a count strictly above this value
 *                              to receive its own indicator feature
 * @author dlwh
 **/
class WordClassFeaturizer(wordCounts: Counter[String, Double],
                          functionWordThreshold: Int = 100) extends WordFeaturizer[String] with Serializable {

  /**
   * Builds the per-sentence view. Features for every position are computed
   * eagerly when the anchoring is created.
   *
   * @param words the sentence to featurize
   * @return an anchoring that answers feature queries by position
   */
  def anchor(words: IndexedSeq[String]): WordFeatureAnchoring[String] = {
    // Alias the argument: inside the anonymous class `words` names the member below.
    val sentence = words

    new WordFeatureAnchoring[String] {
      // Vocabulary id of each token; negative for tokens missing from the vocabulary.
      val indices = sentence.map(wordIndex)

      def words = sentence

      /** Features at `pos`; any position outside the sentence yields the boundary features. */
      def featuresForWord(pos: Int): Array[Feature] = {
        val insideSentence = pos >= 0 && pos < words.length
        if (insideSentence) _minimalFeatures(pos) else boundaryFeatures
      }

      // One feature array per position: the cached array for known words,
      // a freshly built word-class feature for unknown ones.
      private val _minimalFeatures: immutable.IndexedSeq[Array[Feature]] = sentence.indices.map { position =>
        val vocabId = indices(position)
        if (vocabId < 0) {
          Array[Feature](WordClassFeaturizer.this.wordClassFeature(sentence(position)))
        } else {
          WordClassFeaturizer.this.minimalFeatures(vocabId)
        }
      }
    }
  }

  // more positional classes to add

  private val wordIndex = Index(wordCounts.keySet)
  private val interner = new Interner[Feature]

  private val boundaryFeatures = Array[Feature](BoundaryFeature)

  /** Interned word-class feature for `word`, as produced by `EnglishWordClassGenerator`. */
  private def wordClassFeature(word: String): Feature =
    interner(WordFeature(EnglishWordClassGenerator(word), Symbol("Class")))

  // Per vocabulary entry: the word itself when its count is above the threshold,
  // otherwise its word class.
  private val classes = Encoder.fromIndex(wordIndex).tabulateArray { word =>
    val aboveThreshold = wordCounts(word) > functionWordThreshold
    if (aboveThreshold) interner(IndicatorFeature(word)) else wordClassFeature(word)
  }

  // caches
  // Singleton feature arrays, indexed by vocabulary id, shared across anchorings.
  private val minimalFeatures = Array.tabulate(wordIndex.size) { vocabId =>
    Array(classes(vocabId))
  }

}