Permalink
Find file
cc30d72 Jun 28, 2011
executable file 271 lines (233 sloc) 6.66 KB
#!/usr/bin/env python
"""
A feature extractor for named entity recognition (NER).
Copyright 2010,2011 Naoaki Okazaki.
"""
# Separator of field values (a single space); handed to crfutils.main as
# the `sep` argument at the bottom of this file.
separator = ' '
# Field names of the input data, space-delimited; handed to crfutils.main
# as the `fields` argument.
# NOTE(review): presumably y = label, w = word, pos = part-of-speech tag,
# chk = chunk tag -- confirm against the input data format.
fields = 'y w pos chk'
import crfutils
def get_shape(token):
    """Return the character-class "shape" of *token*.

    Every character is replaced by a class code: 'U' for uppercase,
    'L' for lowercase and 'D' for digits.  A fixed set of punctuation
    characters is folded onto one representative per group ('.', ';',
    '-', '(' and ')').  Any other character is copied unchanged.
    """
    # Representative symbol for each punctuation character that is folded.
    folded = {
        '.': '.', ',': '.',
        ';': ';', ':': ';', '?': ';', '!': ';',
        '+': '-', '-': '-', '*': '-', '/': '-', '=': '-', '|': '-', '_': '-',
        '(': '(', '{': '(', '[': '(', '<': '(',
        ')': ')', '}': ')', ']': ')', '>': ')',
    }
    codes = []
    for ch in token:
        if ch.isupper():
            codes.append('U')
        elif ch.islower():
            codes.append('L')
        elif ch.isdigit():
            codes.append('D')
        else:
            # Unknown characters fall through as themselves.
            codes.append(folded.get(ch, ch))
    return ''.join(codes)
def degenerate(src):
    """Collapse every run of identical consecutive characters in *src*.

    For example 'ULLLL' becomes 'UL'.  An empty string yields ''.
    """
    kept = []
    for ch in src:
        # Skip a character that merely repeats the one just kept.
        if kept and kept[-1] == ch:
            continue
        kept.append(ch)
    return ''.join(kept)
def get_type(token):
    """Classify *token* into a coarse type label.

    All labels start out as candidates; each character rules out the
    labels it is incompatible with.  The result is the first label, in
    priority order, that survives.  Returns 'EMPTY' for an empty token
    and 'NO' when no label survives.  Note that ',' and '.' are treated
    like digits.
    """
    if not token:
        return 'EMPTY'

    # Candidate labels in priority order.
    labels = (
        'AllUpper', 'AllDigit', 'AllSymbol',
        'AllUpperDigit', 'AllUpperSymbol', 'AllDigitSymbol',
        'AllUpperDigitSymbol',
        'InitUpper',
        'AllLetter',
        'AllAlnum',
    )
    # Labels eliminated by one occurrence of each character class.
    ruled_out_by_upper = ('AllDigit', 'AllSymbol', 'AllDigitSymbol')
    ruled_out_by_digit = (
        'AllUpper', 'AllSymbol', 'AllUpperSymbol', 'AllLetter',
    )
    ruled_out_by_lower = (
        'AllUpper', 'AllDigit', 'AllSymbol',
        'AllUpperDigit', 'AllUpperSymbol', 'AllDigitSymbol',
        'AllUpperDigitSymbol',
    )
    ruled_out_by_other = (
        'AllUpper', 'AllDigit', 'AllUpperDigit', 'AllLetter', 'AllAlnum',
    )

    alive = set(labels)
    for pos, ch in enumerate(token):
        if ch.isupper():
            ruled_out = ruled_out_by_upper
        elif ch.isdigit() or ch in (',', '.'):
            ruled_out = ruled_out_by_digit
        elif ch.islower():
            ruled_out = ruled_out_by_lower
        else:
            ruled_out = ruled_out_by_other
        alive.difference_update(ruled_out)

        # 'InitUpper' only depends on the very first character.
        if pos == 0 and not ch.isupper():
            alive.discard('InitUpper')

    for label in labels:
        if label in alive:
            return label
    return 'NO'
def get_2d(token):
    """Return True when *token* consists of exactly two digits."""
    if len(token) != 2:
        return False
    return token.isdigit()
def get_4d(token):
    """Return True when *token* consists of exactly four digits."""
    if len(token) != 4:
        return False
    return token.isdigit()
def get_da(token):
    """Return True when *token* mixes digits and letters and nothing else.

    The token must contain at least one digit and at least one
    alphabetic character; any other character makes the result False.
    """
    # Any character outside digits/letters disqualifies the token.
    if not all(ch.isdigit() or ch.isalpha() for ch in token):
        return False
    has_digit = any(ch.isdigit() for ch in token)
    has_alpha = any(ch.isalpha() and not ch.isdigit() for ch in token)
    return has_digit and has_alpha
def get_dand(token, p):
    """Return True when *token* is made only of digits and the character *p*.

    Both a digit and *p* must occur at least once; any other character
    makes the result False.
    """
    # Any character that is neither a digit nor *p* disqualifies the token.
    if not all(ch.isdigit() or ch == p for ch in token):
        return False
    has_digit = any(ch.isdigit() for ch in token)
    has_mark = any(ch == p and not ch.isdigit() for ch in token)
    return has_digit and has_mark
def get_all_other(token):
    """Return True when *token* has no alphanumeric character at all.

    An empty token also yields True.
    """
    return not any(ch.isalnum() for ch in token)
def get_capperiod(token):
    """Return True for an uppercase letter followed by '.', e.g. 'J.'."""
    if len(token) != 2:
        return False
    first, second = token
    return first.isupper() and second == '.'
def contains_upper(token):
    """Return True when *token* has at least one uppercase character."""
    return any(ch.isupper() for ch in token)
def contains_lower(token):
    """Return True when *token* has at least one lowercase character."""
    return any(ch.islower() for ch in token)
def contains_alpha(token):
    """Return True when *token* has at least one alphabetic character."""
    return any(ch.isalpha() for ch in token)
def contains_digit(token):
    """Return True when *token* has at least one digit character."""
    return any(ch.isdigit() for ch in token)
def contains_symbol(token):
    """Return True when *token* has at least one non-alphanumeric character.

    An empty token yields False.

    The previous implementation applied the bitwise operator ``~`` to the
    boolean returned by ``isalnum()``.  ``~True`` is -2 and ``~False`` is
    -1, both truthy, so every non-empty token was reported as containing
    a symbol.  Logical negation is what was intended.
    """
    return any(not ch.isalnum() for ch in token)
def b(v):
    """Render the truthiness of *v* as the feature value 'yes' or 'no'."""
    if v:
        return 'yes'
    return 'no'
def observation(v, defval=''):
    """Add token-level observation fields to the item *v* in place.

    *v* is a mapping that must already hold the surface token under the
    key 'w'.  The derived fields (lowercased form, shape, type, prefixes,
    suffixes and a series of 'yes'/'no' flags) are stored back into *v*.
    *defval* is stored for a prefix or suffix that is longer than the
    token.  Nothing is returned.
    """
    w = v['w']
    size = len(w)

    # Lowercased token.
    v['wl'] = w.lower()
    # Token shape, and the shape with repeated classes collapsed.
    v['shape'] = get_shape(w)
    v['shaped'] = degenerate(v['shape'])
    # Token type.
    v['type'] = get_type(w)

    # Prefixes (length one to four).
    for n in (1, 2, 3, 4):
        v['p%d' % n] = w[:n] if size >= n else defval
    # Suffixes (length one to four).
    for n in (1, 2, 3, 4):
        v['s%d' % n] = w[-n:] if size >= n else defval

    # Boolean observations, stored as 'yes'/'no', in a fixed order.
    checks = (
        # Two digits.
        ('2d', get_2d),
        # Four digits.
        ('4d', get_4d),
        # Alphanumeric token (digits and letters).
        ('d&a', get_da),
        # Digits and '-'.
        ('d&-', lambda s: get_dand(s, '-')),
        # Digits and '/'.
        ('d&/', lambda s: get_dand(s, '/')),
        # Digits and ','.
        ('d&,', lambda s: get_dand(s, ',')),
        # Digits and '.'.
        ('d&.', lambda s: get_dand(s, '.')),
        # An uppercase letter followed by '.'.
        ('up', get_capperiod),
        # An initial uppercase letter (an empty token counts as 'no').
        ('iu', lambda s: s and s[0].isupper()),
        # All uppercase letters.
        ('au', lambda s: s.isupper()),
        # All lowercase letters.
        ('al', lambda s: s.islower()),
        # All digits.
        ('ad', lambda s: s.isdigit()),
        # All other (non-alphanumeric) characters.
        ('ao', get_all_other),
        # Contains an uppercase letter.
        ('cu', contains_upper),
        # Contains a lowercase letter.
        ('cl', contains_lower),
        # Contains an alphabetic letter.
        ('ca', contains_alpha),
        # Contains a digit.
        ('cd', contains_digit),
        # Contains a symbol.
        ('cs', contains_symbol),
    )
    for key, check in checks:
        v[key] = b(check(w))
def disjunctive(X, t, field, begin, end):
    """Append disjunctive (bag-of-window) features to item *t* of *X*.

    For every position between ``t + begin`` and ``t + end`` (inclusive)
    that lies inside the sequence *X*, the string
    ``'<field>[<begin>..<end>]=<value>'`` is appended to ``X[t]['F']``,
    where ``<value>`` is ``X[p][field]`` at that position.  Positions
    outside the sequence are skipped.
    """
    name = '%s[%d..%d]' % (field, begin, end)
    # Clamp the window to the valid index range of the sequence.
    first = max(t + begin, 0)
    last = min(t + end, len(X) - 1)
    for p in range(first, last + 1):
        X[t]['F'].append('%s=%s' % (name, X[p][field]))
# Fields used for unigram templates (offsets -2 .. +2).
U = [
    # Token, lowercased token, input tags and shape/type observations.
    'w', 'wl', 'pos', 'chk', 'shape', 'shaped', 'type',
    # Prefixes.
    'p1', 'p2', 'p3', 'p4',
    # Suffixes.
    's1', 's2', 's3', 's4',
    # Digit-pattern and capital-period flags.
    '2d', '4d', 'd&a', 'd&-', 'd&/', 'd&,', 'd&.', 'up',
    # "Initial"/"all" flags.
    'iu', 'au', 'al', 'ad', 'ao',
    # "Contains" flags.
    'cu', 'cl', 'ca', 'cd', 'cs',
]
# Fields used for bigram templates (adjacent offsets within -2 .. +2).
B = ['w', 'pos', 'chk', 'shaped', 'type']

# Each template is a tuple of (field, offset) pairs.
templates = []
for name in U:
    for i in range(-2, 3):
        templates.append(((name, i),))
for name in B:
    for i in range(-2, 2):
        templates.append(((name, i), (name, i + 1)))
def feature_extractor(X):
    """Generate the features for every item of the sequence *X* in place.

    Steps: add the observation fields to each item, apply the module's
    feature templates through ``crfutils.apply_templates``, append the
    disjunctive word features for the four preceding and four following
    positions, and finally mark the first and last items with
    '__BOS__' and '__EOS__'.

    NOTE(review): each item is expected to carry a feature list under
    the key 'F' by the time the disjunctive features are appended --
    presumably set up by crfutils; confirm there.
    """
    # Append observations.
    for item in X:
        observation(item)

    # Apply the feature templates.
    crfutils.apply_templates(X, templates)

    # Append disjunctive features for the left and right word windows.
    for index, _ in enumerate(X):
        for lo, hi in ((-4, -1), (1, 4)):
            disjunctive(X, index, 'w', lo, hi)

    # Append BOS and EOS features (nothing to mark in an empty sequence).
    if not X:
        return
    X[0]['F'].append('__BOS__')
    X[-1]['F'].append('__EOS__')
# Script entry point: hand the feature extractor, the input field names
# and the field separator over to crfutils.main.
if __name__ == '__main__':
    crfutils.main(feature_extractor, fields=fields, sep=separator)