Skip to content

Commit

Permalink
update spacy version and geoname classifier
Browse files Browse the repository at this point in the history
  • Loading branch information
nathanathan committed Apr 4, 2019
1 parent 8fe5458 commit 6942b2b
Show file tree
Hide file tree
Showing 7 changed files with 58 additions and 63 deletions.
90 changes: 45 additions & 45 deletions epitator/geoname_classifier.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,45 +27,45 @@
'classes_': array([False, True]),
'coef_': array([[
# log_population
0.3429166761571069,
0.3288826652391892,
# name_count
0.21709830611570793,
0.21180346761017554,
# names_used
0.8269376580293233,
0.6921137888810455,
# exact_name_match
0.7878854182406542,
0.8907122816181057,
# multiple_spans
0.42317571947152266,
0.4385195357489847,
# span_length
0.11635624170666362,
0.14885276617871698,
# all_acronyms
-2.328731808328372,
-2.4168746146102844,
# cannonical_name_used
2.385204467844477,
2.179668174506593,
# loc_NE_portion
0.9329501759634057,
1.116805637212708,
# other_NE_portion
-0.2987093084510809,
-0.19762918486107944,
# noun_portion
0.0,
# num_tokens
0.6928142361575395,
0.45408740342595283,
# med_token_prob
-0.3397337893668447,
-0.3287514923579695,
# exact_alternatives
-0.9036023140885108,
-0.9026051613064152,
# PPL_feature_code
-0.9397288188035898,
-0.8713730521972315,
# ADM_feature_code
-1.2550824212955247,
-1.3357916436134925,
# PCL_feature_code
2.5134423822820797,
2.2270435390046472,
# other_feature_code
0.0,
# first_order
1.1084960371415022,
1.217498915455807,
# combined_span
1.4932312460013852,
1.5478451457037072,
# close_locations
0.0,
# very_close_locations
Expand All @@ -79,8 +79,8 @@
# containing_locations
0.0,
]]),
'intercept_': array([-13.1126576]),
'n_iter_': array([39], dtype=int32),
'intercept_': array([-12.90888507]),
'n_iter_': array([42], dtype=int32),
}

contextual_classifier =\
Expand All @@ -102,60 +102,60 @@
'classes_': array([False, True]),
'coef_': array([[
# log_population
0.31206770841202336,
0.28228248087215385,
# name_count
0.2005454877333416,
0.19424959537231867,
# names_used
0.648225709888574,
0.4115855267983377,
# exact_name_match
0.26445683289728583,
0.339488954171464,
# multiple_spans
0.3307522320457672,
0.3082735346279554,
# span_length
0.12130650943509746,
0.15794944543924766,
# all_acronyms
-2.163337787453372,
-2.0910830240634297,
# cannonical_name_used
2.3090619705615563,
1.9360786322544647,
# loc_NE_portion
1.4508913007712096,
1.5434804936396482,
# other_NE_portion
0.0,
# noun_portion
0.0,
# num_tokens
0.7907962702562655,
0.4670721503329358,
# med_token_prob
-0.2923973470351299,
-0.2642808804913221,
# exact_alternatives
-0.8011614417553122,
-0.7689405320821576,
# PPL_feature_code
-0.5475447417562185,
-0.450305349464007,
# ADM_feature_code
-0.9261497922203902,
-0.9421785297062796,
# PCL_feature_code
2.299107106153456,
1.885630901040498,
# other_feature_code
0.0,
# first_order
1.033140751957464,
1.0904825221747996,
# combined_span
0.36278895602219524,
0.31258233468268437,
# close_locations
0.12774614812147872,
0.1364085867326155,
# very_close_locations
-0.01864384833938384,
-0.00243964840385385,
# base_score
-1.478626994083764,
-1.1232711320877737,
# base_score_margin
2.5041655302244554,
2.6704843451856335,
# contained_locations
0.1104326082786128,
0.0784905901350068,
# containing_locations
0.40453329344238753,
0.47875034021904533,
]]),
'intercept_': array([-12.44492995]),
'n_iter_': array([36], dtype=int32),
'intercept_': array([-11.7165164]),
'n_iter_': array([33], dtype=int32),
}

# Logistic regression code from scipy
Expand Down
4 changes: 2 additions & 2 deletions epitator/spacy_annotator.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from __future__ import absolute_import
from .annotator import Annotator, AnnoSpan, AnnoTier
import re
from .spacy_nlp import spacy_nlp, sent_nlp
from .spacy_nlp import spacy_nlp, custom_sentencizer


class TokenSpan(AnnoSpan):
Expand Down Expand Up @@ -68,7 +68,7 @@ def annotate(self, doc):
# is not memory constrained.
# https://github.com/explosion/spaCy/issues/1636
sentences = AnnoTier([
SentSpan(sent, doc) for sent in sent_nlp(doc.text).sents])
SentSpan(sent, doc) for sent in custom_sentencizer(doc.text)])
tiers['spacy.sentences'] = sentences
group_size = 10
for sent_group_idx in range(0, len(sentences), group_size):
Expand Down
10 changes: 5 additions & 5 deletions epitator/spacy_nlp.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,23 +13,23 @@
line_break_re = re.compile(r"\n{4,}")


def sentencizer_strategy(doc):
def custom_sentencizer(doc_text):
"""
A modified version of the default sentencizer_strategy that also breaks
on sequences of 4 or more newlines.
"""
doc = sent_nlp(doc_text)
start = 0
seen_sent_end = False
for i, word in enumerate(doc):
word.is_sent_start = i == 0
if seen_sent_end and not word.is_punct:
yield doc[start:word.i]
start = word.i
word.is_sent_start = True
seen_sent_end = False
elif word.text in ['.', '!', '?'] or line_break_re.match(word.text):
seen_sent_end = True
if start < len(doc):
doc[start].is_sent_start = True
yield doc[start:len(doc)]


sent_nlp.add_pipe(sent_nlp.create_pipe('sentencizer', config={
'strategy': sentencizer_strategy}))
2 changes: 1 addition & 1 deletion epitator/version.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = '1.3.0'
__version__ = '1.3.1'
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ python-dateutil
dateparser==0.7.1
rdflib
six
spacy==2.1.2
spacy==2.1.3
pyparsing==2.2.0
regex==2018.01.10
https://github.com/explosion/spacy-models/releases/download/en_core_web_md-2.1.0/en_core_web_md-2.1.0.tar.gz
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@
'dateparser==0.7.1',
'geopy>=1.11.0',
'unicodecsv>=0.14.1',
'spacy==2.1.2',
'spacy==2.1.3',
'pyparsing==2.2.0',
'numpy>=1.16.1',
'rdflib>=4.2.2',
Expand Down
11 changes: 3 additions & 8 deletions tests/annotator/test_structured_incident_annotator.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,11 +76,6 @@ def test_count_table(self):
'attributes': []
}])

# TODO: Alagoas is resolved to incorrect location.
# Alagoas (AL) resolves to a location in Alagoas
# Marechal Deodoro, because one of its alternative names is Alagoas,
# so it is mistaken for being a compound name which scores higher than
# Alagoas due to its reduced ambiguity.
# @with_log_level(logging.getLogger('epitator.structured_incident_annotator'), logging.INFO)
def test_location_count_table(self):
doc = AnnoDoc("""
Expand Down Expand Up @@ -119,10 +114,10 @@ def test_location_count_table(self):
remove_empty_props(span.metadata)
for span in doc.tiers['structured_incidents']
]
incident = metadatas[1]
self.assertEqual(incident['value'], 8)
incident = metadatas[0]
self.assertEqual(incident['value'], 1)
self.assertEqual(incident['type'], 'caseCount')
self.assertEqual(incident['location']['geonameid'], '6319493')
self.assertEqual(incident['location']['geonameid'], '3665474')
self.assertEqual(
incident['dateRange'],
[datetime.datetime(2017, 7, 1),
Expand Down

0 comments on commit 6942b2b

Please sign in to comment.