This repository has been archived by the owner on Feb 19, 2020. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 82
/
NGramSurfaceFeaturizer.scala
50 lines (40 loc) · 1.66 KB
/
NGramSurfaceFeaturizer.scala
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
package epic.features
import epic.framework.Feature
import scala.collection.mutable.ArrayBuffer
import breeze.util.CachedHashCode
import epic.util.TwoTwelveSupport
import scala.runtime.ScalaRunTime
/**
 * A feature made of an ordered sequence of component features, tagged with an offset.
 *
 * Equality first compares hash codes and only falls through to the full
 * structural comparison when they agree.
 *
 * @param offset   integer tag stored alongside the component features
 * @param features the component features, in order
 */
case class OrientedNGramFeature(offset: Int, features: IndexedSeq[Feature]) extends Feature with CachedHashCode {
  override def equals(other: Any): Boolean = other match {
    // Differing hash codes rule out equality without walking `features`.
    case that: OrientedNGramFeature if that.hashCode == hashCode =>
      TwoTwelveSupport._equals(this, that)
    case _ =>
      false
  }
}
/**
 * Wraps a base [[WordFeaturizer]] and adds word n-gram features on top of the
 * base features for each position.
 *
 * For every order from 2 up to `wordNgramOrder`, each window of `order`
 * consecutive positions that contains the queried position contributes one
 * [[OrientedNGramFeature]] per combination of base features (one base feature
 * picked from each position in the window). The feature's `offset` is the
 * window's start relative to the queried position, ranging over `-order + 1 to 0`.
 *
 * NOTE(review): windows near either end of the sentence ask `base` for
 * positions outside `[0, w.length)`. This assumes the base anchoring tolerates
 * out-of-range positions -- confirm against the base featurizers in use.
 *
 * @param base           featurizer supplying the per-position features that are combined
 * @param wordNgramOrder largest n-gram order to generate; values below 2 add no n-gram features
 * @tparam W the word type
 * @author dlwh
 */
class NGramWordFeaturizer[W](base: WordFeaturizer[W], wordNgramOrder: Int = 2) extends WordFeaturizer[W] {

  def anchor(w: IndexedSeq[W]): WordFeatureAnchoring[W] = {
    new WordFeatureAnchoring[W] {
      val baseAnch = base.anchor(w)

      def words: IndexedSeq[W] = w

      /** Base features for `pos`, followed by the n-gram features of every configured order. */
      def featuresForWord(pos: Int): Array[Feature] = {
        val result = ArrayBuffer[Feature]() ++= baseAnch.featuresForWord(pos)
        for (order <- 2 to wordNgramOrder)
          addNgramFeatures(result, pos, order)
        result.toArray
      }

      /**
       * Appends to `buffer` the n-gram features of the given `order` for every
       * window of `order` consecutive positions that contains `pos`.
       */
      def addNgramFeatures(buffer: ArrayBuffer[Feature], pos: Int, order: Int): Unit = {
        for (offset <- (-order + 1) to 0) {
          val start = pos + offset
          // Exactly `order` positions per window (`until`, not `to`), and each slot
          // holds the features of its own position (`pos2`), not of `pos`.
          val features = for (pos2 <- start until (start + order)) yield baseAnch.featuresForWord(pos2)
          buffer ++= allConfigurations(features).map(OrientedNGramFeature(offset, _))
        }
      }
    }
  }

  /**
   * Cartesian product of the given feature arrays: every sequence that picks
   * one feature from each array, in order. If any array is empty the result is
   * empty; for an empty input the result is a single empty sequence.
   */
  private def allConfigurations(seqOfSeqs: TraversableOnce[Array[Feature]]): IndexedSeq[IndexedSeq[Feature]] = {
    seqOfSeqs.foldLeft(IndexedSeq(IndexedSeq.empty[Feature])) { (acc, currentFeatures) =>
      // Extend every partial configuration with each feature at the current position.
      for (a <- acc; b <- currentFeatures) yield a :+ b
    }
  }
}