Fix for overlapping custom sentiment terms

clairempr · May 30, 2022 · 8e4659f · 8e4659f
1 parent efa82cd
commit 8e4659f
Show file tree

Hide file tree

Showing 2 changed files with 43 additions and 1 deletion.
diff --git a/letter_sentiment/custom_sentiment.py b/letter_sentiment/custom_sentiment.py
@@ -40,6 +40,9 @@ def highlight_for_custom_sentiment(text, custom_sentiment_id):
     termvector = get_sentiment_termvector_for_text(text)
     terms_to_place = {}
 
+    # Look for terms in text's termvector
+    # If an n-gram of the term is inside the token, remove that n-gram's token,
+    # because we're interested in the most complete occurrence of the term
     for term in terms:
         term_text = term.analyzed_text
         if term_text in termvector and 'term_freq' in termvector[term_text]:
@@ -50,11 +53,20 @@ def highlight_for_custom_sentiment(text, custom_sentiment_id):
                     terms_to_place[position] = (start, end, term.weight)
                     termvector = update_tokens_in_termvector(termvector, term, token)
 
-    # offsets will be altered by insertions of highlighting markup,
+    # Offsets will be altered by insertions of highlighting markup,
     # depending on position in text, so start inserting at the end
     sorted_terms_to_place \
         = [terms_to_place[pos] for pos in sorted(terms_to_place.keys(), reverse=True)]
 
+    # If terms overlap in the text, move the end of the earlier term back to just before the start of the later one
+    prev_start_pos = 0
+    for idx, (start_pos, end_pos, weight) in enumerate(sorted_terms_to_place):
+        if prev_start_pos and end_pos > prev_start_pos:
+            new_end = prev_start_pos - 1
+            sorted_terms_to_place[idx] = (start_pos, new_end, weight)
+        prev_start_pos = start_pos
+
+    # Apply css classes for highlighting
     for start_pos, end_pos, weight in sorted_terms_to_place:
         highlight_class = highlight_normal_class if weight == 1 else highlight_extra_class
         highlighted_text = str.format('{0}<span class="{1}">{2}</span>{3}',

diff --git a/letter_sentiment/tests/test_custom_sentiment.py b/letter_sentiment/tests/test_custom_sentiment.py
@@ -325,6 +325,36 @@ def test_highlight_for_custom_sentiment_termvector_no_tokens(self, mock_update_t
         self.assertEqual(mock_update_tokens_in_termvector.call_count, 0,
             "highlight_for_custom_sentiment() shouldn't call update_tokens_in_termvector() if no tokens in text")
 
+    @patch('letter_sentiment.custom_sentiment.get_custom_sentiment', autospec=True)
+    @patch('letter_sentiment.custom_sentiment.sort_terms_by_number_of_words', autospec=True)
+    @patch('letter_sentiment.custom_sentiment.get_sentiment_termvector_for_text', autospec=True)
+    @patch('letter_sentiment.custom_sentiment.update_tokens_in_termvector', autospec=True)
+    def test_highlight_for_custom_sentiment_overlapping_terms(self, mock_update_tokens_in_termvector,
+                                            mock_get_sentiment_termvector_for_text,
+                                            mock_sort_terms_by_number_of_words,
+                                            mock_get_custom_sentiment):
+        """
+        Test highlight_for_custom_sentiment() when overlapping terms are found in the text
+        """
+
+        # Text is "tofu artisan pabst"
+        artisan_pabst = TermFactory(text='artisan pabst', analyzed_text='artisan pabst',
+                                    custom_sentiment=self.custom_sentiment)
+
+        mock_get_sentiment_termvector_for_text.return_value = self.termvector
+        mock_sort_terms_by_number_of_words.return_value = [self.tofu_artisan, artisan_pabst, self.pabst, self.locavore]
+        mock_update_tokens_in_termvector.return_value = self.termvector
+
+        highlighted_text = highlight_for_custom_sentiment(self.text, custom_sentiment_id=self.custom_sentiment.id)
+
+        # Highlighted text should be <highlight>tofu</highlight><highlight>artisan</highlight><highlight>pabst</highlight>
+        # because highlights are inserted starting from the end
+        self.assertTrue('tofu' in highlighted_text, 'Overlapping terms should be highlighted separately')
+        self.assertFalse('tofu artisan' in highlighted_text, "Overlapping terms shouldn't be highlighted together")
+        self.assertTrue('artisan' in highlighted_text, 'Overlapping terms should be highlighted separately')
+        self.assertFalse('artisan pabst' in highlighted_text, "Overlapping terms shouldn't be highlighted together")
+        self.assertTrue('pabst' in highlighted_text, 'Overlapping terms should be highlighted separately')
+
 
 class SortTermsByNumberOfWordsTestCase(TestCase):
     """