This repository has been archived by the owner on Feb 19, 2020. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 82
/
HackyHeadFinder.scala
99 lines (87 loc) · 3.71 KB
/
HackyHeadFinder.scala
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
package epic.features
import scala.collection.mutable.HashMap
/**
* HackyHeadFinders find "heads" in a span using only preterminal labels.
* It doesn't use the syntactic structure of the sentence.
*
* @author gdurrett
* @tparam L type of the span label handed to `findHead`
* @tparam T type of each preterminal handed to `findHead`
*/
trait HackyHeadFinder[L,T] extends Serializable {
/**
* Picks the head of a span from its label and its preterminals.
*
* @param label label of the span
* @param preterminals preterminals of the span
* @return the head position as an Int.
*         NOTE(review): presumably a 0-based index into `preterminals` (the
*         rule-based implementation in this file returns one) -- confirm for
*         other implementations.
*/
def findHead(label: L, preterminals: Seq[T]): Int
}
/**
 * Head finder driven by the per-label rules registered in
 * `RuleBasedHackyHeadFinder.headRules`.
 */
case class RuleBasedHackyHeadFinder() extends HackyHeadFinder[String,String] {

  /**
   * Looks up the rule registered for `label` and applies it to `preterminals`.
   *
   * @param label        label of the span whose head is wanted
   * @param preterminals preterminal tags of the span
   * @return 0 when no rule is registered for `label`, otherwise whatever the
   *         rule returns; a result of -1 is additionally reported on stdout
   */
  def findHead(label: String, preterminals: Seq[String]): Int =
    RuleBasedHackyHeadFinder.headRules.get(label) match {
      case None =>
        // No rule for this label: default to the first position.
        0
      case Some(rule) =>
        val headIdx = rule(preterminals)
        if (headIdx == -1) {
          println("-1 for " + label + ": " + preterminals)
        }
        headIdx
    }
}
/**
 * Head rules and search helpers used by the rule-based head finder.
 *
 * `headRules` maps a span label to a function that takes the span's
 * preterminal tags and returns the index of the head within that sequence.
 */
object RuleBasedHackyHeadFinder {

  /** Direction flag: scan from the first preterminal towards the last. */
  val L2R = true

  /** Direction flag: scan from the last preterminal towards the first. */
  val R2L = false

  // Verb-like tags; the S and VP rules use the same set.
  private val VerbTags = Set("TO", "VBD", "VBN", "MD", "VBZ", "VB", "VBG", "VBP")

  /** Per-label head rules: label -> (preterminals => head index). */
  val headRules = new HashMap[String,(Seq[String] => Int)]

  // Ss: lots of problems are due to fronted PPs, NPs with sentential complements, etc.
  // NPs: lots of stuff due to CD / $, weird NPs
  // SBAR: I can't figure out why 0 tends to work the best but it does
  headRules.put("ADJP", (preterminals) => searchFindFirst(preterminals, L2R, Set("NNS", "NN", "$", "JJ", "VBN", "VBG", "JJR", "JJS")))
  headRules.put("ADVP", (preterminals) => searchFindFirst(preterminals, R2L, Set("RB", "RBR", "RBS", "FW")))
  // Blockers stop the scan at appositives, complementizers, prepositions,
  // parentheticals and conjunctions.
  headRules.put("NP", (preterminals) => searchFindLastBefore(preterminals, L2R,
    Set("NN", "NNP", "NNPS", "NNS", "NX", "POS", "JJR", "$", "PRN"),
    Set(",", "WDT", "TO", "IN", "-LRB-", ":", "CC", "(")))
  headRules.put("QP", (preterminals) => searchFindFirst(preterminals, L2R, Set("$", "IN", "CD")))
  headRules.put("PP", (preterminals) => searchFindFirst(preterminals, L2R, Set("IN", "TO", "VBG", "VBN", "RP", "FW")))
  headRules.put("PRN", (preterminals) => if (preterminals.size > 1) 1 else 0)
  headRules.put("S", (preterminals) => searchFindFirst(preterminals, L2R, VerbTags))
  headRules.put("VP", (preterminals) => searchFindFirst(preterminals, L2R, VerbTags))
  // headRules.put("SBAR", (preterminals) => searchFindFirst(preterminals, L2R, Set("WP", "WP$", "WDT", "WRB", "IN", "PRP", "PRP$")));

  /**
   * Finds the first preterminal, in scan order, whose tag is in `goodOnes`.
   *
   * @param preterminals tags of the span
   * @param leftToRight  scan direction (`L2R` or `R2L`)
   * @param goodOnes     tags that qualify as a head
   * @return the index of the first match in scan order; when nothing matches,
   *         the index where the scan started (0 for left-to-right, the last
   *         index for right-to-left); 0 for an empty sequence in either
   *         direction
   */
  def searchFindFirst(preterminals: Seq[String], leftToRight: Boolean, goodOnes: Set[String]): Int = {
    if (preterminals.isEmpty) {
      // Never hand back a negative index; both directions agree on 0 here.
      0
    } else {
      val matchIdx =
        if (leftToRight) preterminals.indexWhere(tag => goodOnes.contains(tag))
        else preterminals.lastIndexWhere(tag => goodOnes.contains(tag))
      if (matchIdx >= 0) matchIdx
      else if (leftToRight) 0
      else preterminals.size - 1
    }
  }

  /**
   * Scans in the given direction up to and including the first tag found in
   * `blockers`, and returns the last position seen on the way whose tag is in
   * `goodOnes`. A tag that is both good and a blocker still counts as good.
   *
   * When no good tag is seen, the fallback is the position scanned just
   * before the blocker (or the blocker itself when it is the first position
   * scanned), or the far end of the scan when there is no blocker at all.
   *
   * @param preterminals tags of the span
   * @param leftToRight  scan direction (`L2R` or `R2L`)
   * @param goodOnes     tags that qualify as a head
   * @param blockers     tags that terminate the scan
   * @return an index into `preterminals`; 0 for an empty sequence
   */
  def searchFindLastBefore(preterminals: Seq[String], leftToRight: Boolean, goodOnes: Set[String], blockers: Set[String]): Int = {
    val size = preterminals.size
    val headIdx =
      if (leftToRight) {
        val blockerIdx = preterminals.indexWhere(tag => blockers.contains(tag))
        // Last position the scan visits (the blocker itself is visited).
        val scanStop = if (blockerIdx >= 0) blockerIdx else size - 1
        val lastGood = preterminals.lastIndexWhere(tag => goodOnes.contains(tag), scanStop)
        if (lastGood >= 0) lastGood
        else if (blockerIdx >= 0) math.max(0, blockerIdx - 1)
        // headIdx = if (leftToRight) preterminals.size - 1 else 0;
        else size - 1
      } else {
        val blockerIdx = preterminals.lastIndexWhere(tag => blockers.contains(tag))
        val scanStop = if (blockerIdx >= 0) blockerIdx else 0
        // Scanning right-to-left, the last good tag seen is the leftmost one
        // at or after the stopping point.
        val lastGood = preterminals.indexWhere(tag => goodOnes.contains(tag), scanStop)
        if (lastGood >= 0) lastGood
        // Clamp to the last valid index so a blocker in final position maps to
        // itself, mirroring the left-to-right case.
        else if (blockerIdx >= 0) math.min(blockerIdx + 1, size - 1)
        else 0
      }
    // Guards the empty-sequence case, where no index is valid.
    if (headIdx < 0 || headIdx >= size) 0 else headIdx
  }
}