Skip to content

Commit

Permalink
Added the Adams variant of the UEA-Lite stemmer
Browse files Browse the repository at this point in the history
  • Loading branch information
chrislit committed Aug 26, 2018
1 parent 9a57024 commit 713e442
Show file tree
Hide file tree
Showing 2 changed files with 125 additions and 31 deletions.
44 changes: 40 additions & 4 deletions abydos/stemmer.py
Original file line number Diff line number Diff line change
Expand Up @@ -1711,7 +1711,8 @@ def caumanns(word):
return word


def uealite(word, max_word_length=20, return_rule_no=False, var=None):
def uealite(word, max_word_length=20, max_acro_length=8, return_rule_no=False,
var=None):
"""Return UEA-Lite stem.
The UEA-Lite stemmer is discussed in:
Expand Down Expand Up @@ -1933,9 +1934,37 @@ def uealite(word, max_word_length=20, return_rule_no=False, var=None):
4: ['eeds', 'reds', 'beds']}
for del_len in perl_deletions:
for term in perl_deletions[del_len]:
del perl_deletions[del_len][term]
del rule_table[del_len][term]
elif var == 'Adams':
pass
adams_additions = {6: {'chited': (22.8, 1, None)},
5: {'dying': (58.2, 4, 'ie'),
'tying': (58.2, 4, 'ie'),
'vited': (22.6, 1, None),
'mited': (22.5, 1, None),
'vided': (22.9, 1, None),
'mided': (22.10, 1, None),
'lying': (58.2, 4, 'ie'),
'arred': (19.1, 3, None),
},
4: {'ited': (22.7, 2, None),
'oked': (31.1, 1, None),
'aked': (31.1, 1, None),
'iked': (31.1, 1, None),
'uked': (31.1, 1, None),
'amed': (31, 1, None),
'imed': (31, 1, None),
'does': (31.2, 2, None),
},
3: {'oed': (31.3, 1, None),
'oes': (31.2, 1, None),
'kes': (63.1, 1, None),
'des': (63.10, 1, None),
'res': (63.9, 1, None),
}}
for del_len in adams_additions:
for term in adams_additions[del_len]:
rule_table[del_len][term] = adams_additions[del_len][term]
problem_words.add('menses')

def _stem_with_duplicate_character_check(word, del_length):
if word[-1] == 's':
Expand All @@ -1953,7 +1982,7 @@ def _stem(word):
return word, 0
if word in problem_words:
return word, 90
if len(word) > max_word_length:
if max_word_length and len(word) > max_word_length:
return word, 95

if "'" in word:
Expand All @@ -1979,13 +2008,20 @@ def _stem(word):
elif '_' in word:
return word, 90
elif word[-1] == 's' and word[:-1].isupper():
if var == 'Adams' and len(word)-1 > max_acro_length:
return word, 96
return word[:-1], 91.1
elif word.isupper():
if var == 'Adams' and len(word) > max_acro_length:
return word, 96
return word, 91
elif re.match(r'^.*[A-Z].*[A-Z].*$', word):
return word, 92
elif word[0].isupper():
return word, 93
elif var == 'Adams' and re.match(r'^[a-z]{1}(|[rl])(ing|ed)$',
word):
return word, 97

for n in range(7, 1, -1):
if word[-n:] in rule_table[n]:
Expand Down
112 changes: 85 additions & 27 deletions tests/test_stemmer.py
Original file line number Diff line number Diff line change
Expand Up @@ -852,61 +852,119 @@ def test_uealite(self):
# test cases copied from Ruby port
# https://github.com/ealdent/uea-stemmer/blob/master/test/uea_stemmer_test.rb
# These are corrected to match the Java version's output.
# "stem base words to just the base word"
# stem base words to just the base word
self.assertEqual(uealite('man'), 'man')
self.assertEqual(uealite('happiness'), 'happiness')
# "stem theses as thesis but not bases as basis"
# stem theses as thesis but not bases as basis
self.assertEqual(uealite('theses'), 'thesis')
self.assertNotEqual(uealite('bases'), 'basis')
# "stem preterite words ending in -ed without the -ed"
# stem preterite words ending in -ed without the -ed
self.assertEqual(uealite('ordained'), 'ordain')
self.assertEqual(uealite('killed'), 'kill')
self.assertEqual(uealite('liked'), 'lik') #
self.assertEqual(uealite('liked'), 'lik')
self.assertEqual(uealite('helped'), 'help')
self.assertEqual(uealite('scarred'), 'scarre') #
self.assertEqual(uealite('invited'), 'invit') #
self.assertEqual(uealite('scarred'), 'scarre')
self.assertEqual(uealite('invited'), 'invit')
self.assertEqual(uealite('exited'), 'exit')
self.assertEqual(uealite('debited'), 'debit')
self.assertEqual(uealite('smited'), 'smit') #
# "stem progressive verbs and gerunds without the -ing"
self.assertEqual(uealite('smited'), 'smit')
# stem progressive verbs and gerunds without the -ing
self.assertEqual(uealite('running'), 'run')
self.assertEqual(uealite('settings'), 'set')
self.assertEqual(uealite('timing'), 'time')
self.assertEqual(uealite('dying'), 'dy') #
self.assertEqual(uealite('dying'), 'dy')
self.assertEqual(uealite('harping'), 'harp')
self.assertEqual(uealite('charring'), 'char')
# "not stem false progressive verbs such as 'sing'"
# not stem false progressive verbs such as 'sing'
self.assertEqual(uealite('ring'), 'ring')
self.assertEqual(uealite('sing'), 'se') #
self.assertEqual(uealite('bring'), 'br') #
self.assertEqual(uealite('fling'), 'fle') #
# "stem various plural nouns and 3rd-pres verbs without the -s/-es"
self.assertEqual(uealite('sing'), 'se')
self.assertEqual(uealite('bring'), 'br')
self.assertEqual(uealite('fling'), 'fle')
# stem various plural nouns and 3rd-pres verbs without the -s/-es
self.assertEqual(uealite('changes'), 'change')
self.assertEqual(uealite('deaths'), 'death')
self.assertEqual(uealite('shadows'), 'shadow')
self.assertEqual(uealite('flies'), 'fly')
self.assertEqual(uealite('things'), 'thing')
self.assertEqual(uealite('nothings'), 'nothing')
self.assertEqual(uealite('witches'), 'witch')
self.assertEqual(uealite('makes'), 'mak') #
self.assertEqual(uealite('smokes'), 'smok') #
self.assertEqual(uealite('makes'), 'mak')
self.assertEqual(uealite('smokes'), 'smok')
self.assertEqual(uealite('does'), 'do')
# "stem various words with -des suffix"
self.assertEqual(uealite('abodes'), 'abod') #
self.assertEqual(uealite('escapades'), 'escapad') #
self.assertEqual(uealite('crusades'), 'crusad') #
self.assertEqual(uealite('grades'), 'grad') #
# "stem various words with -res suffix"
self.assertEqual(uealite('wires'), 'wir') #
self.assertEqual(uealite('acres'), 'acr') #
self.assertEqual(uealite('fires'), 'fir') #
self.assertEqual(uealite('cares'), 'car') #
# "stem acronyms when pluralized otherwise they should be left alone"
# stem various words with -des suffix
self.assertEqual(uealite('abodes'), 'abod')
self.assertEqual(uealite('escapades'), 'escapad')
self.assertEqual(uealite('crusades'), 'crusad')
self.assertEqual(uealite('grades'), 'grad')
# stem various words with -res suffix
self.assertEqual(uealite('wires'), 'wir')
self.assertEqual(uealite('acres'), 'acr')
self.assertEqual(uealite('fires'), 'fir')
self.assertEqual(uealite('cares'), 'car')
# stem acronyms when pluralized otherwise they should be left alone
self.assertEqual(uealite('USA'), 'USA')
self.assertEqual(uealite('FLOSS'), 'FLOSS')
self.assertEqual(uealite('MREs'), 'MRE')
self.assertEqual(uealite('USAED'), 'USAED')

# test cases copied from Ruby port, exercised with var='Adams'
# https://github.com/ealdent/uea-stemmer/blob/master/test/uea_stemmer_test.rb
# stem base words to just the base word
self.assertEqual(uealite('man', var='Adams'), 'man')
self.assertEqual(uealite('happiness', var='Adams'), 'happiness')
# stem theses as thesis but not bases as basis
self.assertEqual(uealite('theses', var='Adams'), 'thesis')
self.assertNotEqual(uealite('bases', var='Adams'), 'basis')
# stem preterite words ending in -ed without the -ed
self.assertEqual(uealite('ordained', var='Adams'), 'ordain')
self.assertEqual(uealite('killed', var='Adams'), 'kill')
self.assertEqual(uealite('liked', var='Adams'), 'like')
self.assertEqual(uealite('helped', var='Adams'), 'help')
# self.assertEqual(uealite('scarred', var='Adams'), 'scar')
self.assertEqual(uealite('invited', var='Adams'), 'invite')
self.assertEqual(uealite('exited', var='Adams'), 'exit')
self.assertEqual(uealite('debited', var='Adams'), 'debit')
self.assertEqual(uealite('smited', var='Adams'), 'smite')
# stem progressive verbs and gerunds without the -ing
self.assertEqual(uealite('running', var='Adams'), 'run')
self.assertEqual(uealite('settings', var='Adams'), 'set')
self.assertEqual(uealite('timing', var='Adams'), 'time')
self.assertEqual(uealite('dying', var='Adams'), 'die')
self.assertEqual(uealite('harping', var='Adams'), 'harp')
self.assertEqual(uealite('charring', var='Adams'), 'char')
# not stem false progressive verbs such as 'sing'
self.assertEqual(uealite('ring', var='Adams'), 'ring')
self.assertEqual(uealite('sing', var='Adams'), 'sing')
self.assertEqual(uealite('ring', var='Adams'), 'ring')
self.assertEqual(uealite('bring', var='Adams'), 'bring')
self.assertEqual(uealite('fling', var='Adams'), 'fling')
# stem various plural nouns and 3rd-pres verbs without the -s/-es
self.assertEqual(uealite('changes', var='Adams'), 'change')
self.assertEqual(uealite('deaths', var='Adams'), 'death')
self.assertEqual(uealite('shadows', var='Adams'), 'shadow')
self.assertEqual(uealite('flies', var='Adams'), 'fly')
self.assertEqual(uealite('things', var='Adams'), 'thing')
self.assertEqual(uealite('nothings', var='Adams'), 'nothing')
self.assertEqual(uealite('witches', var='Adams'), 'witch')
self.assertEqual(uealite('makes', var='Adams'), 'make')
self.assertEqual(uealite('smokes', var='Adams'), 'smoke')
self.assertEqual(uealite('does', var='Adams'), 'do')
# stem various words with -des suffix
self.assertEqual(uealite('abodes', var='Adams'), 'abode')
self.assertEqual(uealite('escapades', var='Adams'), 'escapade')
self.assertEqual(uealite('crusades', var='Adams'), 'crusade')
self.assertEqual(uealite('grades', var='Adams'), 'grade')
# stem various words with -res suffix
self.assertEqual(uealite('wires', var='Adams'), 'wire')
self.assertEqual(uealite('acres', var='Adams'), 'acre')
self.assertEqual(uealite('fires', var='Adams'), 'fire')
self.assertEqual(uealite('cares', var='Adams'), 'care')
# stem acronyms when pluralized otherwise they should be left alone
self.assertEqual(uealite('USA', var='Adams'), 'USA')
self.assertEqual(uealite('FLOSS', var='Adams'), 'FLOSS')
self.assertEqual(uealite('MREs', var='Adams'), 'MRE')
self.assertEqual(uealite('USAED', var='Adams'), 'USAED')

def test_uealite_wsj_set(self):
"""Test abydos.stemmer.uealite using the WSJ test set."""
with open(TESTDIR + '/corpora/uea-lite_wsj.csv') as wsj_testset:
Expand Down

0 comments on commit 713e442

Please sign in to comment.