-
Notifications
You must be signed in to change notification settings - Fork 0
/
postagfix.py
executable file
·109 lines (86 loc) · 2.66 KB
/
postagfix.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
#!/usr/bin/env python
## $Id: postagfix.py,v 1.1.1.1 2003/07/01 23:28:27 euske Exp $
##
## postagfix.py - POS Tag Fixer
##
import sys, re
from regpat import PatternSet, PatCounter
from sentence import Sentence, Dictionary, TextTokenizer, SentenceSplitter, POSTagger
from abstfilter import AbstractFeeder, AbstractFilter, AbstractConsumer
from document import HTMLProcessor, TexProcessor, PlainTextProcessor
##
##
class POSTagFixPatternSet(PatternSet):
    """Pattern set that overrides the preferred POS tag of matched words.

    Each rule is a ``(pattern, tags)`` pair.  ``pattern`` is compiled with
    ``self.compile``; ``tags`` is aligned position-by-position with the words
    of a match, and a false entry (e.g. ``None``) leaves that word untouched.

    NOTE(review): the ``compile_item*`` methods look like hooks called by
    ``PatternSet.compile`` for the different item syntaxes of a pattern
    string -- confirm the exact mapping in ``regpat``.
    """

    def __init__(self, pats):
        """Compile every ``(pattern, tags)`` pair of the iterable *pats*."""
        PatternSet.__init__(self)
        # A list comprehension replaces ``map(lambda (p, tags): ...)``:
        # tuple parameters are Python 2-only syntax, and the rules must be a
        # real list because perform() iterates over them on every call.
        self.fixpatterns = [(self.compile(p), tags) for (p, tags) in pats]
        return

    def compile_item0(self, t):
        """Return a predicate: the word's preferred tag equals *t*."""
        return lambda w: w.pos_pref == t

    def compile_item1(self, t):
        """Return the combination of the comma-separated conditions in *t*.

        Each condition is one of:
          ``capita`` -- the first element of ``w.s`` is upper case,
          ``!TAG``   -- ``TAG`` is not among the word's candidates ``w.pos``,
          ``TAG``    -- ``TAG`` is among the word's candidates ``w.pos``.
        How the individual predicates are merged is decided by
        ``self.combine_preds``.
        """
        def pred1(p):
            if p == "capita":
                return lambda w: w.s[0].isupper()
            if p.startswith("!"):
                excluded = p[1:]
                return lambda w: excluded not in w.pos
            return lambda w: p in w.pos
        # Pass a concrete list so combine_preds can traverse it repeatedly.
        return self.combine_preds([pred1(p) for p in t.split(",")])

    def compile_item2(self, s):
        """Return a predicate: the word's text equals *s*, ignoring case.

        Words whose ``s`` attribute is itself a ``Sentence`` never match.
        """
        return lambda w: (not isinstance(w.s, Sentence)) and s.lower() == w.s.lower()

    def perform(self, seq):
        """Apply every rule to the word sequence *seq*, in place.

        For each match of each pattern, the matched words ``seq[start:end]``
        are paired with the rule's tags (``zip`` stops at the shorter of the
        two) and every true tag is stored as the word's ``pos_pref``.
        Rules are applied in definition order, so later rules see the
        preferences set by earlier ones.  Returns ``None``.
        """
        for (pat, tags) in self.fixpatterns:
            for m in pat.search(seq):
                for (w, t) in zip(seq[m.start:m.end], tags):
                    if t:
                        w.pos_pref = t
        return
##
##
class POSTagFixer(AbstractFilter):
    """Filter stage that settles the preferred POS tag of each word.

    ``process`` first fixes the preference of every word that has exactly
    one candidate tag, then runs the class-wide rule table ``patternset``
    over the sentence's words.
    """

    # Shared by all instances.  Each rule pairs a pattern string with the
    # tags to assign to the matched words; ``None`` keeps a word as it is.
    patternset = POSTagFixPatternSet([
        (
            '([DT]|[DT1]|[DTS]|[PRP$]|[PDT]|[PDT1]|[PDTS]) [NN,VBP]',
            [None, "NN"],
        ),
        (
            '([DT]|[DT1]|[DTS]|[PRP$]|[PDT]|[PDT1]|[PDTS]) [NNS,VBZ]',
            [None, "NNS"],
        ),
        (
            '[NN] [NNS,VBZ] ([DT]|[DT1]|[DTS]|[PDT]|[PDT1]|[PDTS]|[PRP]|[PRPS]|[NN]|[NNS])',
            [None, "VBZ", None],
        ),
        (
            '[NN,VB] ([VB]|[VBZ])',
            ["NN", None],
        ),
        (
            '"to" [VB]',
            [None, "VB"],
        ),
        (
            '[MD] [VB]',
            ["MD", "VB"],
        ),
        (
            '[MDZ] [VB]',
            ["MDZ", "VB"],
        ),
    ])

    def __init__(self, next_filter):
        """Set up the filter with *next_filter* as the following stage."""
        AbstractFilter.__init__(self, next_filter)

    def process(self, sent1):
        """Tag the words of *sent1* in place and return *sent1*."""
        words = sent1.words
        for word in words:
            candidates = word.pos
            # An unambiguous word simply takes its only candidate.
            if len(candidates) == 1:
                word.pos_pref = candidates[0]
        # Context rules run afterwards so they can rely on those preferences.
        self.patternset.perform(sent1.words)
        return sent1
##
#
if __name__ == "__main__":
    # Command-line driver: read a document from stdin, run it through the
    # tokenizer / sentence splitter / tagger / tag fixer chain, and write the
    # repr() of everything the final stage receives to stdout.

    class Consumer(AbstractConsumer):
        """Terminal pipeline stage that prints each item it is fed."""

        def feed(self, s):
            sys.stdout.write(repr(s) + "\n")
            return

    # Command-line flag -> document processor class.
    processors = {
        "-t": TexProcessor,
        "-l": HTMLProcessor,
        "-p": PlainTextProcessor,
    }
    # Validate explicitly: a missing argument used to raise IndexError, and
    # the former ``assert 0`` is stripped under ``python -O``.
    if len(sys.argv) < 2 or sys.argv[1] not in processors:
        sys.stderr.write("usage: %s {-t|-l|-p} < input\n" % sys.argv[0])
        sys.exit(2)
    docproc = processors[sys.argv[1]]

    # Named ``dictionary`` so the builtin ``dict`` is not shadowed.
    dictionary = Dictionary("LOCAL/dict.cdb")
    out = Consumer()
    pipeline = docproc(
        TextTokenizer(SentenceSplitter(POSTagger(dictionary, POSTagFixer(out)))))
    pipeline.read(sys.stdin)