update spacy version and geoname classifier

ecohealthalliance · Apr 4, 2019 · 6942b2b · 6942b2b
1 parent 8fe5458
commit 6942b2b
Show file tree

Hide file tree

Showing 7 changed files with 58 additions and 63 deletions.
diff --git a/epitator/geoname_classifier.py b/epitator/geoname_classifier.py
@@ -27,45 +27,45 @@
     'classes_': array([False,  True]),
     'coef_': array([[
         # log_population
-        0.3429166761571069,
+        0.3288826652391892,
         # name_count
-        0.21709830611570793,
+        0.21180346761017554,
         # names_used
-        0.8269376580293233,
+        0.6921137888810455,
         # exact_name_match
-        0.7878854182406542,
+        0.8907122816181057,
         # multiple_spans
-        0.42317571947152266,
+        0.4385195357489847,
         # span_length
-        0.11635624170666362,
+        0.14885276617871698,
         # all_acronyms
-        -2.328731808328372,
+        -2.4168746146102844,
         # cannonical_name_used
-        2.385204467844477,
+        2.179668174506593,
         # loc_NE_portion
-        0.9329501759634057,
+        1.116805637212708,
         # other_NE_portion
-        -0.2987093084510809,
+        -0.19762918486107944,
         # noun_portion
         0.0,
         # num_tokens
-        0.6928142361575395,
+        0.45408740342595283,
         # med_token_prob
-        -0.3397337893668447,
+        -0.3287514923579695,
         # exact_alternatives
-        -0.9036023140885108,
+        -0.9026051613064152,
         # PPL_feature_code
-        -0.9397288188035898,
+        -0.8713730521972315,
         # ADM_feature_code
-        -1.2550824212955247,
+        -1.3357916436134925,
         # PCL_feature_code
-        2.5134423822820797,
+        2.2270435390046472,
         # other_feature_code
         0.0,
         # first_order
-        1.1084960371415022,
+        1.217498915455807,
         # combined_span
-        1.4932312460013852,
+        1.5478451457037072,
         # close_locations
         0.0,
         # very_close_locations
@@ -79,8 +79,8 @@
         # containing_locations
         0.0,
     ]]),
-    'intercept_': array([-13.1126576]),
-    'n_iter_': array([39], dtype=int32),
+    'intercept_': array([-12.90888507]),
+    'n_iter_': array([42], dtype=int32),
 }
 
 contextual_classifier =\
@@ -102,60 +102,60 @@
     'classes_': array([False,  True]),
     'coef_': array([[
         # log_population
-        0.31206770841202336,
+        0.28228248087215385,
         # name_count
-        0.2005454877333416,
+        0.19424959537231867,
         # names_used
-        0.648225709888574,
+        0.4115855267983377,
         # exact_name_match
-        0.26445683289728583,
+        0.339488954171464,
         # multiple_spans
-        0.3307522320457672,
+        0.3082735346279554,
         # span_length
-        0.12130650943509746,
+        0.15794944543924766,
         # all_acronyms
-        -2.163337787453372,
+        -2.0910830240634297,
         # cannonical_name_used
-        2.3090619705615563,
+        1.9360786322544647,
         # loc_NE_portion
-        1.4508913007712096,
+        1.5434804936396482,
         # other_NE_portion
         0.0,
         # noun_portion
         0.0,
         # num_tokens
-        0.7907962702562655,
+        0.4670721503329358,
         # med_token_prob
-        -0.2923973470351299,
+        -0.2642808804913221,
         # exact_alternatives
-        -0.8011614417553122,
+        -0.7689405320821576,
         # PPL_feature_code
-        -0.5475447417562185,
+        -0.450305349464007,
         # ADM_feature_code
-        -0.9261497922203902,
+        -0.9421785297062796,
         # PCL_feature_code
-        2.299107106153456,
+        1.885630901040498,
         # other_feature_code
         0.0,
         # first_order
-        1.033140751957464,
+        1.0904825221747996,
         # combined_span
-        0.36278895602219524,
+        0.31258233468268437,
         # close_locations
-        0.12774614812147872,
+        0.1364085867326155,
         # very_close_locations
-        -0.01864384833938384,
+        -0.00243964840385385,
         # base_score
-        -1.478626994083764,
+        -1.1232711320877737,
         # base_score_margin
-        2.5041655302244554,
+        2.6704843451856335,
         # contained_locations
-        0.1104326082786128,
+        0.0784905901350068,
         # containing_locations
-        0.40453329344238753,
+        0.47875034021904533,
     ]]),
-    'intercept_': array([-12.44492995]),
-    'n_iter_': array([36], dtype=int32),
+    'intercept_': array([-11.7165164]),
+    'n_iter_': array([33], dtype=int32),
 }
 
 # Logistic regression code from scipy

diff --git a/epitator/spacy_annotator.py b/epitator/spacy_annotator.py
@@ -3,7 +3,7 @@
 from __future__ import absolute_import
 from .annotator import Annotator, AnnoSpan, AnnoTier
 import re
-from .spacy_nlp import spacy_nlp, sent_nlp
+from .spacy_nlp import spacy_nlp, custom_sentencizer
 
 
 class TokenSpan(AnnoSpan):
@@ -68,7 +68,7 @@ def annotate(self, doc):
         # is not memory constrained.
         # https://github.com/explosion/spaCy/issues/1636
         sentences = AnnoTier([
-            SentSpan(sent, doc) for sent in sent_nlp(doc.text).sents])
+            SentSpan(sent, doc) for sent in custom_sentencizer(doc.text)])
         tiers['spacy.sentences'] = sentences
         group_size = 10
         for sent_group_idx in range(0, len(sentences), group_size):

diff --git a/epitator/spacy_nlp.py b/epitator/spacy_nlp.py
@@ -13,23 +13,23 @@
 line_break_re = re.compile(r"\n{4,}")
 
 
-def sentencizer_strategy(doc):
+def custom_sentencizer(doc_text):
     """
     A modified version of the default sentencizer_strategy that also breaks
     on tokens that begin with four or more consecutive newlines.
     """
+    doc = sent_nlp(doc_text)
     start = 0
     seen_sent_end = False
     for i, word in enumerate(doc):
+        word.is_sent_start = i == 0
         if seen_sent_end and not word.is_punct:
             yield doc[start:word.i]
             start = word.i
+            word.is_sent_start = True
             seen_sent_end = False
         elif word.text in ['.', '!', '?'] or line_break_re.match(word.text):
             seen_sent_end = True
     if start < len(doc):
+        doc[start].is_sent_start = True
         yield doc[start:len(doc)]
-
-
-sent_nlp.add_pipe(sent_nlp.create_pipe('sentencizer', config={
-    'strategy': sentencizer_strategy}))
diff --git a/epitator/version.py b/epitator/version.py
@@ -1 +1 @@
-__version__ = '1.3.0'
+__version__ = '1.3.1'
diff --git a/requirements.txt b/requirements.txt
@@ -6,7 +6,7 @@ python-dateutil
 dateparser==0.7.1
 rdflib
 six
-spacy==2.1.2
+spacy==2.1.3
 pyparsing==2.2.0
 regex==2018.01.10
 https://github.com/explosion/spacy-models/releases/download/en_core_web_md-2.1.0/en_core_web_md-2.1.0.tar.gz
diff --git a/setup.py b/setup.py
@@ -30,7 +30,7 @@
         'dateparser==0.7.1',
         'geopy>=1.11.0',
         'unicodecsv>=0.14.1',
-        'spacy==2.1.2',
+        'spacy==2.1.3',
         'pyparsing==2.2.0',
         'numpy>=1.16.1',
         'rdflib>=4.2.2',

diff --git a/tests/annotator/test_structured_incident_annotator.py b/tests/annotator/test_structured_incident_annotator.py
@@ -76,11 +76,6 @@ def test_count_table(self):
             'attributes': []
         }])
 
-    # TODO: Alagoas is resolved to incorrect location.
-    # Alagoas (AL) resolves to a location in Alagoas
-    # Marechal Deodoro, because one of its alternative names is Alagoas,
-    # so it is mistaken for being a compound name which scores higher than
-    # Alagoas due to its reduced ambiguity.
     # @with_log_level(logging.getLogger('epitator.structured_incident_annotator'), logging.INFO)
     def test_location_count_table(self):
         doc = AnnoDoc("""
@@ -119,10 +114,10 @@ def test_location_count_table(self):
             remove_empty_props(span.metadata)
             for span in doc.tiers['structured_incidents']
         ]
-        incident = metadatas[1]
-        self.assertEqual(incident['value'], 8)
+        incident = metadatas[0]
+        self.assertEqual(incident['value'], 1)
         self.assertEqual(incident['type'], 'caseCount')
-        self.assertEqual(incident['location']['geonameid'], '6319493')
+        self.assertEqual(incident['location']['geonameid'], '3665474')
         self.assertEqual(
             incident['dateRange'],
             [datetime.datetime(2017, 7, 1),