forked from sbenthall/bluestocking
-
Notifications
You must be signed in to change notification settings - Fork 0
/
chunkerator.py
86 lines (76 loc) · 2.99 KB
/
chunkerator.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
import nltk
import types
from nltk import tag, chunk
class Chunkerator:
    '''
    Tags, chunks and negation-marks a tokenised sentence.

    A sentence is POS-tagged with the basic nltk tagger, parsed with an
    nltk.RegexpParser built from the supplied rules, and flattened back
    into a list of tokens in which every chunk is a single '_'-joined word.
    When nounChunking is true, each merged chunk is then replaced by the
    head noun recorded for it.
    '''

    # Tokens that open a negation scope.  They are dropped from the
    # rebuilt sentence; the words that follow them get a 'neg_' prefix.
    # Kept as a tuple (not a set) so membership tests also work for
    # unhashable nodes such as nested subtrees.
    NEG_WORDS = (
        'not', 'never', 'isn\'t', 'wasn\'t', 'hasn\'t', 'haven\'t',
        'didn\'t', 'don\'t', 'doesn\'t',
    )

    def __init__(self, rules, nounChunking):
        '''
        rules        -- grammar handed unchanged to nltk.RegexpParser.
        nounChunking -- truthy to track a head noun for every chunk and
                        substitute it for the chunk in chunk_sent().
        '''
        self.chunkParser = nltk.RegexpParser(rules)
        # Maps merged chunk text -> head noun; filled in by merge_chunk().
        self.chunksSeen = dict()
        self.nounChunking = nounChunking

    def initial_tag(self, sentence):
        '''
        Uses the basic nltk tagger to assign tags.

        The sentence is passed to the tagger as-is (no splitting is done
        here), so it is presumably already a list of tokens.
        '''
        return tag.pos_tag(sentence)

    def initial_chunk(self, tagged_sent):
        '''
        Parses a tagged sentence with the configured chunk parser and
        returns the resulting tree.
        '''
        return self.chunkParser.parse(tagged_sent)

    def merge_chunk(self, tup):
        '''
        Reads in a chunk from the tree and flattens it out as one atomic
        word: the words of its (word, tag) pairs joined with '_'.

        When nounChunking is on, the last word whose tag starts with 'N'
        (any kind of noun) or equals 'PRP' is recorded in chunksSeen as
        the head of the chunk.  If the chunk holds no such word the
        recorded head is the empty string.

        Returns the merged chunk text.
        '''
        # rstrip keeps the original behaviour of also dropping any
        # trailing underscores that belong to the last word.
        merged = '_'.join(str(phrase[0]) for phrase in tup).rstrip('_')
        if self.nounChunking:
            head_noun = str()
            for phrase in tup:
                pos = phrase[1]
                # Test the FIRST tag character: testing the second one
                # also matched 'IN' (prepositions) and raised IndexError
                # on one-character tags.
                if pos.startswith('N') or pos == 'PRP':
                    head_noun = phrase[0]
            self.chunksSeen[merged] = head_noun
        return merged

    def neg(self, tree):
        '''
        Marks the scope of every negation word by prefixing the words
        that follow it with 'neg_'.

        Chunks (nodes whose first element is a tuple) inside the scope
        are skipped and left untouched.  The scope ends at the first
        plain token whose tag has 'C' as second character (e.g. 'CC') or
        whose tag is shorter than two characters (e.g. punctuation).

        The tree is modified in place and also returned.
        '''
        # NOTE(review): the break is paired with the tag test (scope ends
        # at a conjunction), not with the chunk test -- confirm.
        for start in range(len(tree)):
            if tree[start][0] not in self.NEG_WORDS:
                continue
            for pos in range(start + 1, len(tree)):
                node = tree[pos]
                if isinstance(node[0], tuple):
                    # A chunk: not negated, but does not end the scope.
                    continue
                word, pos_tag = node[0], node[1]
                # len > 1 (not > 0) so that pos_tag[1] is always valid.
                if len(pos_tag) > 1 and pos_tag[1] != 'C':
                    tree[pos] = ('neg_%s' % str(word), pos_tag)
                else:
                    break
        return tree

    def remake_chunked_sent(self, tree):
        '''
        Reconstitutes the original sentence with chunks treated as atomic.

        Chunks are merged through merge_chunk(), negation words are
        dropped and every other token is kept.  Returns a list of tokens.
        '''
        pieces = []
        for node in tree:
            if isinstance(node[0], tuple):
                pieces.append(str(self.merge_chunk(node)))
            elif node[0] not in self.NEG_WORDS:
                pieces.append(str(node[0]))
        # Join-then-split mirrors the original string round trip: empty
        # tokens vanish and tokens containing whitespace are split.
        return ' '.join(pieces).split()

    def replace_chunk_with_head(self, sentence):
        '''
        Replaces every token that is a known merged chunk with the head
        noun recorded for it.  The list is modified in place and returned.
        '''
        for idx, token in enumerate(sentence):
            if token in self.chunksSeen:
                sentence[idx] = self.chunksSeen[token]
        return sentence

    def chunk_sent(self, sentence):
        '''
        Chunks a sentence (into nouns or clauses, based on init rules).
        If chunking by noun, noun chunks are replaced with their head
        nouns.  Returns the processed sentence as a list of tokens.
        '''
        tagged_sent = self.initial_tag(sentence)
        tree = self.initial_chunk(tagged_sent)
        tree = self.neg(tree)
        out_sent = self.remake_chunked_sent(tree)
        if self.nounChunking:
            out_sent = self.replace_chunk_with_head(out_sent)
        return out_sent