-
Notifications
You must be signed in to change notification settings - Fork 1
/
pos_pruning.py
55 lines (44 loc) · 1.47 KB
/
pos_pruning.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
from nltk_utils import *
import nltk
from nltk.corpus import wordnet as wn
def memoizefirst(f):
    """Wrap a two-argument function so results are cached by its FIRST
    argument only.

    The second argument is forwarded on a cache miss but ignored for
    lookup purposes: callers are expected to pass a first argument that
    uniquely identifies the pair. The first argument must be hashable.
    """
    cache = {}
    def wrapper(key, payload):
        try:
            # Fast path: already computed for this key.
            return cache[key]
        except KeyError:
            result = cache[key] = f(key, payload)
            return result
    return wrapper
def get_noun_set(article, tokens):
    """Return the set of tokens in `tokens` that POS-tag as nouns.

    `article` is not read by the computation itself: it exists purely as
    the memoization key once this function is wrapped by `memoizefirst`
    below, so callers pass a string that uniquely identifies `tokens`
    (e.g. '|'.join(tokens)).
    """
    tagged = nltk.pos_tag(tokens)
    # Keep a token when its Penn Treebank tag maps to WordNet's noun tag.
    return {word for word, tag in tagged if penn_to_wn(tag) == wn.NOUN}
# Cache results keyed on `article` so repeated questions/statements are
# tagged only once.
get_noun_set = memoizefirst(get_noun_set)
def prune_statements(dataset, questions, debug=True):
    """Filter each question's candidate statements down to those sharing
    at least one noun with the question.

    Args:
        dataset: unused here; retained so the call signature stays
            compatible with existing callers.
        questions: list of question records, where questions[i][2] is the
            list of statement token-lists and questions[i][3] is the
            question's token list. Mutated in place (index 2 is replaced).
        debug: when True, print before/after details for the first three
            questions.

    Returns:
        The same `questions` list, with each questions[i][2] pruned.
    """
    total_old = 0
    total_new = 0
    for i, question in enumerate(questions):
        old_statements = question[2]
        q = question[3]
        # '|'.join(...) serves as the memoization key for get_noun_set.
        q_nouns = get_noun_set('|'.join(q), q)
        # Keep only statements with at least one noun in common with the question.
        new_statements = [s for s in old_statements
                          if get_noun_set('|'.join(s), s) & q_nouns]
        questions[i][2] = new_statements
        total_old += len(old_statements)
        total_new += len(new_statements)
        if debug and i < 3:
            # Single-argument print() behaves the same under Python 2 and 3;
            # the original `print a, b, ...` statement was a SyntaxError on
            # Python 3 and made the whole module unimportable there.
            print("Question: %s Statements:\n%s\n%s\nbefore %d after %d"
                  % (q, old_statements, new_statements,
                     len(old_statements), len(new_statements)))
    #print("Before %d After %d" % (total_old, total_new))
    return questions