Skip to content

Commit

Permalink
Fix for overlapping custom sentiment terms
Browse files Browse the repository at this point in the history
  • Loading branch information
clairempr committed May 30, 2022
1 parent efa82cd commit 8e4659f
Show file tree
Hide file tree
Showing 2 changed files with 43 additions and 1 deletion.
14 changes: 13 additions & 1 deletion letter_sentiment/custom_sentiment.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,9 @@ def highlight_for_custom_sentiment(text, custom_sentiment_id):
termvector = get_sentiment_termvector_for_text(text)
terms_to_place = {}

# Look for terms in text's termvector
# If an n-gram of the term is inside the token, remove that n-gram's token,
# because we're interested in the most complete occurrence of the term
for term in terms:
term_text = term.analyzed_text
if term_text in termvector and 'term_freq' in termvector[term_text]:
Expand All @@ -50,11 +53,20 @@ def highlight_for_custom_sentiment(text, custom_sentiment_id):
terms_to_place[position] = (start, end, term.weight)
termvector = update_tokens_in_termvector(termvector, term, token)

# offsets will be altered by insertions of highlighting markup,
# Offsets will be altered by insertions of highlighting markup,
# depending on position in text, so start inserting at the end
sorted_terms_to_place \
= [terms_to_place[pos] for pos in sorted(terms_to_place.keys(), reverse=True)]

# If there are overlapping terms in the text, pull the end position of the earlier term
# back so that it stops before the start of the term that follows it
prev_start_pos = 0
for idx, (start_pos, end_pos, weight) in enumerate(sorted_terms_to_place):
if prev_start_pos and end_pos > prev_start_pos:
new_end = prev_start_pos - 1
sorted_terms_to_place[idx] = (start_pos, new_end, weight)
prev_start_pos = start_pos

# Apply css classes for highlighting
for start_pos, end_pos, weight in sorted_terms_to_place:
highlight_class = highlight_normal_class if weight == 1 else highlight_extra_class
highlighted_text = str.format('{0}<span class="{1}">{2}</span>{3}',
Expand Down
30 changes: 30 additions & 0 deletions letter_sentiment/tests/test_custom_sentiment.py
Original file line number Diff line number Diff line change
Expand Up @@ -325,6 +325,36 @@ def test_highlight_for_custom_sentiment_termvector_no_tokens(self, mock_update_t
self.assertEqual(mock_update_tokens_in_termvector.call_count, 0,
"highlight_for_custom_sentiment() shouldn't call update_tokens_in_termvector() if no tokens in text")

@patch('letter_sentiment.custom_sentiment.get_custom_sentiment', autospec=True)
@patch('letter_sentiment.custom_sentiment.sort_terms_by_number_of_words', autospec=True)
@patch('letter_sentiment.custom_sentiment.get_sentiment_termvector_for_text', autospec=True)
@patch('letter_sentiment.custom_sentiment.update_tokens_in_termvector', autospec=True)
def test_highlight_for_custom_sentiment_overlapping_terms(self, mock_update_tokens_in_termvector,
                                                          mock_get_sentiment_termvector_for_text,
                                                          mock_sort_terms_by_number_of_words,
                                                          mock_get_custom_sentiment):
    """
    Test highlight_for_custom_sentiment() for situations where there are overlapping terms found

    The sorted term list handed to the function contains both 'tofu artisan' and
    'artisan pabst', which share the word 'artisan'. The assertions check that each
    word still appears in the output while neither two-word phrase survives as an
    unbroken substring, i.e. markup was inserted between the words.
    """

    # Text is "tofu artisan pabst"
    # NOTE(review): self.text, self.termvector and the self.* terms come from the
    # test case's setUp, which is not visible here -- confirm they match this text
    artisan_pabst = TermFactory(text='artisan pabst', analyzed_text='artisan pabst',
                                custom_sentiment=self.custom_sentiment)

    mock_get_sentiment_termvector_for_text.return_value = self.termvector
    mock_sort_terms_by_number_of_words.return_value = [self.tofu_artisan, artisan_pabst, self.pabst, self.locavore]
    # Hand back the same termvector so the mocked update leaves it unchanged between terms
    mock_update_tokens_in_termvector.return_value = self.termvector

    highlighted_text = highlight_for_custom_sentiment(self.text, custom_sentiment_id=self.custom_sentiment.id)

    # Highlighted text should be <highlight>tofu</highlight><highlight>artisan</highlight><highlight>pabst</highlight>
    # because highlights are inserted starting from the end
    # assertIn/assertNotIn report both operands on failure, unlike assertTrue/assertFalse
    self.assertIn('tofu', highlighted_text, 'Overlapping terms should be highlighted separately')
    self.assertNotIn('tofu artisan', highlighted_text, "Overlapping terms shouldn't be highlighted together")
    self.assertIn('artisan', highlighted_text, 'Overlapping terms should be highlighted separately')
    self.assertNotIn('artisan pabst', highlighted_text, "Overlapping terms shouldn't be highlighted together")
    self.assertIn('pabst', highlighted_text, 'Overlapping terms should be highlighted separately')


class SortTermsByNumberOfWordsTestCase(TestCase):
"""
Expand Down

0 comments on commit 8e4659f

Please sign in to comment.