Skip to content

Commit

Permalink
Add split_compound_geonames option
Browse files Browse the repository at this point in the history
  • Loading branch information
nathanathan committed Apr 17, 2019
1 parent 5e11753 commit aa2df7b
Show file tree
Hide file tree
Showing 3 changed files with 39 additions and 13 deletions.
40 changes: 28 additions & 12 deletions epitator/geoname_annotator.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,13 +91,14 @@ def location_contains(loc_outer, loc_inner):


class GeoSpan(AnnoSpan):
def __init__(self, start, end, doc, geoname):
def __init__(self, original_span, geoname):
super(GeoSpan, self).__init__(
start,
end,
doc,
original_span.start,
original_span.end,
original_span.doc,
metadata={
'geoname': geoname
'geoname': geoname,
'original_span': original_span
})
self.geoname = geoname
self.label = geoname.name
Expand Down Expand Up @@ -138,6 +139,7 @@ class GeonameRow(object):
'alternate_locations',
'overlapping_locations',
'spans',
'original_spans',
'parents',
'score',
'lat_long',
Expand All @@ -152,6 +154,7 @@ def __init__(self, sqlite3_row):
self.alternate_locations = set()
self.overlapping_locations = set()
self.spans = set()
self.original_spans = set()
self.parents = set()
self.score = None

Expand Down Expand Up @@ -450,6 +453,7 @@ def is_possible_geoname(text, tokens):
candidate_geonames = []
for geoname in geoname_results:
geoname.add_spans(span_text_to_spans)
geoname.original_spans = set(geoname.spans)
# In rare cases geonames may have no matching spans because
# sqlite unicode equivalency rules match geonames that use different
# characters than the document spans used to query them.
Expand All @@ -463,7 +467,7 @@ def is_possible_geoname(text, tokens):
for span in geoname.spans:
span_to_geonames[span].append(geoname)
geoname_spans = span_to_geonames.keys()
combined_spans = AnnoTier(geoname_spans).chains(at_least=2, at_most=4, max_dist=4)
combined_spans = AnnoTier(geoname_spans).chains(at_least=2, at_most=4, max_dist=4).label_spans('combined_span')
for combined_span in combined_spans:
leaf_spans = combined_span.iterate_leaf_base_spans()
first_spans = next(leaf_spans)
Expand Down Expand Up @@ -603,7 +607,7 @@ def feature_generator(filter_fun=lambda x: True):
for feature in features:
feature.set_contextual_features()

def annotate(self, doc, show_features_for_geonameids=None):
def annotate(self, doc, show_features_for_geonameids=None, split_compound_geonames=False):
logger.info('geoannotator started')
candidate_geonames = self.get_candidate_geonames(doc)
features = self.extract_features(candidate_geonames, doc)
Expand Down Expand Up @@ -669,12 +673,24 @@ def annotate(self, doc, show_features_for_geonameids=None):
setattr(geoname, attr, val)
prev_val = val
logger.info('admin names added')
geo_spans = []
geospans = []
for geoname in culled_geonames:
for span in geoname.spans:
geo_span = GeoSpan(
span.start, span.end, doc, geoname)
geo_spans.append(geo_span)
culled_geospans = AnnoTier(geo_spans).optimal_span_set(prefer=lambda x: (len(x), x.geoname.score,))
geospan = GeoSpan(span, geoname)
geospans.append(geospan)
culled_geospans = AnnoTier(geospans).optimal_span_set(prefer=lambda x: (len(x), x.geoname.score,))
if split_compound_geonames:
result = []
for geospan in culled_geospans:
original_span = geospan.metadata['original_span']
if original_span.label == 'combined_span':
possible_geonames = geospan.metadata['geoname'].parents | set([geospan.metadata['geoname']])
for original_leaf_span in original_span.iterate_leaf_base_spans():
for geoname in possible_geonames:
if original_leaf_span in geoname.original_spans:
result.append(GeoSpan(original_leaf_span, geoname))
else:
result.append(geospan)
culled_geospans = AnnoTier(result)
logger.info('overlapping geospans removed')
return {'geonames': culled_geospans}
2 changes: 1 addition & 1 deletion epitator/version.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = '1.3.1'
__version__ = '1.3.2'
10 changes: 10 additions & 0 deletions tests/annotator/test_geoname_annotator.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,16 @@ def test_multipart_names_4(self):
doc.add_tier(self.annotator)
self.assertEqual(len(doc.tiers['geonames'].spans), 2)

def test_multipart_names_5(self):
    """Check annotation with ``split_compound_geonames=True``.

    The sample sentence is expected to yield three geoname spans, the
    first two being "Seattle" and "WA", and the document text must be
    unchanged.
    """
    sentence = 'From Seattle, WA, Canada is not far away.'
    annotated = AnnoDoc(sentence)
    annotated.add_tier(self.annotator, split_compound_geonames=True)

    # The document text must come back exactly as it went in.
    self.assertEqual(annotated.text, sentence)
    geoname_spans = annotated.tiers['geonames'].spans
    self.assertEqual(len(geoname_spans), 3)
    self.assertEqual(geoname_spans[0].text, "Seattle")
    self.assertEqual(geoname_spans[1].text, "WA")

def test_bug_causing_sentence(self):
text = u"""
In late June 2012, an increase in cases of prolonged fever for ≥3 days
Expand Down

0 comments on commit aa2df7b

Please sign in to comment.