Skip to content

Commit

Permalink
Better handle low quality input
Browse files: browse the repository at this point in the history
  • Loading branch information
gunthercox committed Dec 9, 2018
1 parent 4cfa79a commit e2c0c73
Show file tree
Hide file tree
Showing 2 changed files with 37 additions and 12 deletions.
33 changes: 22 additions & 11 deletions chatterbot/stemming.py
Original file line number Diff line number Diff line change
Expand Up @@ -185,27 +185,38 @@ def get_bigram_pair_string(self, text):
words = text.split()

# Separate punctuation from last word in string
word_with_punctuation_removed = words[-1].strip(string.punctuation)
if words:
word_with_punctuation_removed = words[-1].strip(string.punctuation)

if word_with_punctuation_removed:
words[-1] = word_with_punctuation_removed
if word_with_punctuation_removed:
words[-1] = word_with_punctuation_removed

pos_tags = pos_tag(words)

hypernyms = self.get_hypernyms(pos_tags)

bigrams = []
high_quality_bigrams = []
all_bigrams = []

word_count = len(words)

if word_count <= 1:
bigrams = words
if bigrams:
bigrams[0] = bigrams[0].lower()
all_bigrams = words
if all_bigrams:
all_bigrams[0] = all_bigrams[0].lower()

for index in range(1, word_count):
if words[index].lower() not in self.get_stopwords():
bigram = pos_tags[index - 1][1] + ':' + hypernyms[index].lower()
bigrams.append(bigram)
word = words[index].lower()
previous_word_pos = pos_tags[index - 1][1]
if word not in self.get_stopwords() and len(word) > 1:
bigram = previous_word_pos + ':' + hypernyms[index].lower()
high_quality_bigrams.append(bigram)
all_bigrams.append(bigram)
else:
bigram = previous_word_pos + ':' + word
all_bigrams.append(bigram)

return ' '.join(bigrams)
if high_quality_bigrams:
all_bigrams = high_quality_bigrams

return ' '.join(all_bigrams)
16 changes: 15 additions & 1 deletion tests/test_stemming.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,13 @@ class SimpleStemmerTests(TestCase):
def setUp(self):
    """
    Assign a new SimpleStemmer instance to ``self.stemmer``.
    """
    self.stemmer = stemming.SimpleStemmer()

def test_empty_string(self):
    """
    An empty input string should yield an empty bigram pair string.
    """
    bigram_string = self.stemmer.get_bigram_pair_string('')

    self.assertEqual(bigram_string, '')

def test_stemming(self):
stemmed_text = self.stemmer.get_stemmed_words(
'Hello, how are you doing on this awesome day?'
Expand Down Expand Up @@ -112,6 +119,13 @@ class PosHypernymStemmerTests(TestCase):
def setUp(self):
    """
    Assign a new PosHypernymStemmer instance to ``self.stemmer``.
    """
    self.stemmer = stemming.PosHypernymStemmer()

def test_empty_string(self):
    """
    An empty input string should yield an empty bigram pair string.
    """
    bigram_string = self.stemmer.get_bigram_pair_string('')

    self.assertEqual(bigram_string, '')

def test_stemming(self):
stemmed_text = self.stemmer.get_bigram_pair_string(
'Hello, how are you doing on this awesome day?'
Expand Down Expand Up @@ -202,7 +216,7 @@ def test_get_bigram_pair_string_single_character_words(self):
'a e i o u'
)

self.assertEqual(bigram_string, 'DT:antioxidant VBP:nucleotide')
self.assertEqual(bigram_string, 'DT:e NN:i NN:o VBP:u')

def test_get_bigram_pair_string_two_character_words(self):
bigram_string = self.stemmer.get_bigram_pair_string(
Expand Down

0 comments on commit e2c0c73

Please sign in to comment.