From 713e442f83fb690768467e4e1ddbcd5cadebf589 Mon Sep 17 00:00:00 2001 From: Chris Little Date: Sat, 25 Aug 2018 20:52:09 -0700 Subject: [PATCH] added Adams variant of UEA-Lite --- abydos/stemmer.py | 44 +++++++++++++++-- tests/test_stemmer.py | 112 ++++++++++++++++++++++++++++++++---------- 2 files changed, 125 insertions(+), 31 deletions(-) diff --git a/abydos/stemmer.py b/abydos/stemmer.py index bf85a081e..6c29c0051 100644 --- a/abydos/stemmer.py +++ b/abydos/stemmer.py @@ -1711,7 +1711,8 @@ def caumanns(word): return word -def uealite(word, max_word_length=20, return_rule_no=False, var=None): +def uealite(word, max_word_length=20, max_acro_length=8, return_rule_no=False, + var=None): """Return UEA-Lite stem. The UEA-Lite stemmer is discussed in: @@ -1933,9 +1934,37 @@ def uealite(word, max_word_length=20, return_rule_no=False, var=None): 4: ['eeds', 'reds', 'beds']} for del_len in perl_deletions: for term in perl_deletions[del_len]: - del perl_deletions[del_len][term] + del rule_table[del_len][term] elif var == 'Adams': - pass + adams_additions = {6: {'chited': (22.8, 1, None)}, + 5: {'dying': (58.2, 4, 'ie'), + 'tying': (58.2, 4, 'ie'), + 'vited': (22.6, 1, None), + 'mited': (22.5, 1, None), + 'vided': (22.9, 1, None), + 'mided': (22.10, 1, None), + 'lying': (58.2, 4, 'ie'), + 'arred': (19.1, 3, None), + }, + 4: {'ited': (22.7, 2, None), + 'oked': (31.1, 1, None), + 'aked': (31.1, 1, None), + 'iked': (31.1, 1, None), + 'uked': (31.1, 1, None), + 'amed': (31, 1, None), + 'imed': (31, 1, None), + 'does': (31.2, 2, None), + }, + 3: {'oed': (31.3, 1, None), + 'oes': (31.2, 1, None), + 'kes': (63.1, 1, None), + 'des': (63.10, 1, None), + 'res': (63.9, 1, None), + }} + for del_len in adams_additions: + for term in adams_additions[del_len]: + rule_table[del_len][term] = adams_additions[del_len][term] + problem_words.add('menses') def _stem_with_duplicate_character_check(word, del_length): if word[-1] == 's': @@ -1953,7 +1982,7 @@ def _stem(word): return word, 0 if word in problem_words: return word, 90 - if len(word) > max_word_length: + if max_word_length and len(word) > max_word_length: return word, 95 if "'" in word: @@ -1979,13 +2008,20 @@ def _stem(word): elif '_' in word: return word, 90 elif word[-1] == 's' and word[:-1].isupper(): + if var == 'Adams' and len(word)-1 > max_acro_length: + return word, 96 return word[:-1], 91.1 elif word.isupper(): + if var == 'Adams' and len(word) > max_acro_length: + return word, 96 return word, 91 elif re.match(r'^.*[A-Z].*[A-Z].*$', word): return word, 92 elif word[0].isupper(): return word, 93 + elif var == 'Adams' and re.match(r'^[a-z]{1}(|[rl])(ing|ed)$', + word): + return word, 97 for n in range(7, 1, -1): if word[-n:] in rule_table[n]: diff --git a/tests/test_stemmer.py b/tests/test_stemmer.py index 23f12236c..e6b5ea9a0 100644 --- a/tests/test_stemmer.py +++ b/tests/test_stemmer.py @@ -852,35 +852,35 @@ def test_uealite(self): # test cases copied from Ruby port # https://github.com/ealdent/uea-stemmer/blob/master/test/uea_stemmer_test.rb # These are corrected to match the Java version's output. - # "stem base words to just the base word" + # stem base words to just the base word self.assertEqual(uealite('man'), 'man') self.assertEqual(uealite('happiness'), 'happiness') - # "stem theses as thesis but not bases as basis" + # stem theses as thesis but not bases as basis self.assertEqual(uealite('theses'), 'thesis') self.assertNotEqual(uealite('bases'), 'basis') - # "stem preterite words ending in -ed without the -ed" + # stem preterite words ending in -ed without the -ed self.assertEqual(uealite('ordained'), 'ordain') self.assertEqual(uealite('killed'), 'kill') - self.assertEqual(uealite('liked'), 'lik') # + self.assertEqual(uealite('liked'), 'lik') self.assertEqual(uealite('helped'), 'help') - self.assertEqual(uealite('scarred'), 'scarre') # - self.assertEqual(uealite('invited'), 'invit') # + self.assertEqual(uealite('scarred'), 'scarre') + self.assertEqual(uealite('invited'), 'invit') self.assertEqual(uealite('exited'), 'exit') self.assertEqual(uealite('debited'), 'debit') - self.assertEqual(uealite('smited'), 'smit') # - # "stem progressive verbs and gerunds without the -ing" + self.assertEqual(uealite('smited'), 'smit') + # stem progressive verbs and gerunds without the -ing self.assertEqual(uealite('running'), 'run') self.assertEqual(uealite('settings'), 'set') self.assertEqual(uealite('timing'), 'time') - self.assertEqual(uealite('dying'), 'dy') # + self.assertEqual(uealite('dying'), 'dy') self.assertEqual(uealite('harping'), 'harp') self.assertEqual(uealite('charring'), 'char') - # "not stem false progressive verbs such as 'sing'" + # not stem false progressive verbs such as 'sing' self.assertEqual(uealite('ring'), 'ring') - self.assertEqual(uealite('sing'), 'se') # - self.assertEqual(uealite('bring'), 'br') # - self.assertEqual(uealite('fling'), 'fle') # - # "stem various plural nouns and 3rd-pres verbs without the -s/-es" + self.assertEqual(uealite('sing'), 'se') + self.assertEqual(uealite('bring'), 'br') + self.assertEqual(uealite('fling'), 'fle') + # stem various plural nouns and 3rd-pres verbs without the -s/-es self.assertEqual(uealite('changes'), 'change') self.assertEqual(uealite('deaths'), 'death') self.assertEqual(uealite('shadows'), 'shadow') @@ -888,25 +888,83 @@ def test_uealite(self): self.assertEqual(uealite('things'), 'thing') self.assertEqual(uealite('nothings'), 'nothing') self.assertEqual(uealite('witches'), 'witch') - self.assertEqual(uealite('makes'), 'mak') # - self.assertEqual(uealite('smokes'), 'smok') # + self.assertEqual(uealite('makes'), 'mak') + self.assertEqual(uealite('smokes'), 'smok') self.assertEqual(uealite('does'), 'do') - # "stem various words with -des suffix" - self.assertEqual(uealite('abodes'), 'abod') # - self.assertEqual(uealite('escapades'), 'escapad') # - self.assertEqual(uealite('crusades'), 'crusad') # - self.assertEqual(uealite('grades'), 'grad') # - # "stem various words with -res suffix" - self.assertEqual(uealite('wires'), 'wir') # - self.assertEqual(uealite('acres'), 'acr') # - self.assertEqual(uealite('fires'), 'fir') # - self.assertEqual(uealite('cares'), 'car') # - # "stem acronyms when pluralized otherwise they should be left alone" + # stem various words with -des suffix + self.assertEqual(uealite('abodes'), 'abod') + self.assertEqual(uealite('escapades'), 'escapad') + self.assertEqual(uealite('crusades'), 'crusad') + self.assertEqual(uealite('grades'), 'grad') + # stem various words with -res suffix + self.assertEqual(uealite('wires'), 'wir') + self.assertEqual(uealite('acres'), 'acr') + self.assertEqual(uealite('fires'), 'fir') + self.assertEqual(uealite('cares'), 'car') + # stem acronyms when pluralized otherwise they should be left alone self.assertEqual(uealite('USA'), 'USA') self.assertEqual(uealite('FLOSS'), 'FLOSS') self.assertEqual(uealite('MREs'), 'MRE') self.assertEqual(uealite('USAED'), 'USAED') + # test cases copied from Ruby port + # https://github.com/ealdent/uea-stemmer/blob/master/test/uea_stemmer_test.rb + # stem base words to just the base word + self.assertEqual(uealite('man', var='Adams'), 'man') + self.assertEqual(uealite('happiness', var='Adams'), 'happiness') + # stem theses as thesis but not bases as basis + self.assertEqual(uealite('theses', var='Adams'), 'thesis') + self.assertNotEqual(uealite('bases', var='Adams'), 'basis') + # stem preterite words ending in -ed without the -ed + self.assertEqual(uealite('ordained', var='Adams'), 'ordain') + self.assertEqual(uealite('killed', var='Adams'), 'kill') + self.assertEqual(uealite('liked', var='Adams'), 'like') + self.assertEqual(uealite('helped', var='Adams'), 'help') + # self.assertEqual(uealite('scarred', var='Adams'), 'scar') + self.assertEqual(uealite('invited', var='Adams'), 'invite') + self.assertEqual(uealite('exited', var='Adams'), 'exit') + self.assertEqual(uealite('debited', var='Adams'), 'debit') + self.assertEqual(uealite('smited', var='Adams'), 'smite') + # stem progressive verbs and gerunds without the -ing + self.assertEqual(uealite('running', var='Adams'), 'run') + self.assertEqual(uealite('settings', var='Adams'), 'set') + self.assertEqual(uealite('timing', var='Adams'), 'time') + self.assertEqual(uealite('dying', var='Adams'), 'die') + self.assertEqual(uealite('harping', var='Adams'), 'harp') + self.assertEqual(uealite('charring', var='Adams'), 'char') + # not stem false progressive verbs such as 'sing' + self.assertEqual(uealite('ring', var='Adams'), 'ring') + self.assertEqual(uealite('sing', var='Adams'), 'sing') + self.assertEqual(uealite('ring', var='Adams'), 'ring') + self.assertEqual(uealite('bring', var='Adams'), 'bring') + self.assertEqual(uealite('fling', var='Adams'), 'fling') + # stem various plural nouns and 3rd-pres verbs without the -s/-es + self.assertEqual(uealite('changes', var='Adams'), 'change') + self.assertEqual(uealite('deaths', var='Adams'), 'death') + self.assertEqual(uealite('shadows', var='Adams'), 'shadow') + self.assertEqual(uealite('flies', var='Adams'), 'fly') + self.assertEqual(uealite('things', var='Adams'), 'thing') + self.assertEqual(uealite('nothings', var='Adams'), 'nothing') + self.assertEqual(uealite('witches', var='Adams'), 'witch') + self.assertEqual(uealite('makes', var='Adams'), 'make') + self.assertEqual(uealite('smokes', var='Adams'), 'smoke') + self.assertEqual(uealite('does', var='Adams'), 'do') + # stem various words with -des suffix + self.assertEqual(uealite('abodes', var='Adams'), 'abode') + self.assertEqual(uealite('escapades', var='Adams'), 'escapade') + self.assertEqual(uealite('crusades', var='Adams'), 'crusade') + self.assertEqual(uealite('grades', var='Adams'), 'grade') + # stem various words with -res suffix + self.assertEqual(uealite('wires', var='Adams'), 'wire') + self.assertEqual(uealite('acres', var='Adams'), 'acre') + self.assertEqual(uealite('fires', var='Adams'), 'fire') + self.assertEqual(uealite('cares', var='Adams'), 'care') + # stem acronyms when pluralized otherwise they should be left alone + self.assertEqual(uealite('USA', var='Adams'), 'USA') + self.assertEqual(uealite('FLOSS', var='Adams'), 'FLOSS') + self.assertEqual(uealite('MREs', var='Adams'), 'MRE') + self.assertEqual(uealite('USAED', var='Adams'), 'USAED') + def test_uealite_wsj_set(self): """Test abydos.stemmer.uealite using the WSJ test set.""" with open(TESTDIR + '/corpora/uea-lite_wsj.csv') as wsj_testset: