Skip to content

Commit

Permalink
fix number extraction edge cases for structured incident annotator
Browse files Browse the repository at this point in the history
  • Loading branch information
nathanathan committed Nov 29, 2018
1 parent 722b5a7 commit 26a990d
Show file tree
Hide file tree
Showing 6 changed files with 74 additions and 36 deletions.
27 changes: 17 additions & 10 deletions epitator/raw_number_annotator.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,10 +29,11 @@ def annotate(self, doc):
spacy_tokens = doc.tiers['spacy.tokens']
spacy_nes = doc.tiers['spacy.nes']
numbers = []
spacy_numbers = []
for ne_span in spacy_nes:
if ne_span.label in ['QUANTITY', 'CARDINAL']:
if is_valid_number(ne_span.text):
numbers.append(ne_span)
spacy_numbers.append(ne_span)
else:
joiner_offsets = [m.span()
for m in re.finditer(r'\s(?:to|and|or)\s',
Expand All @@ -41,22 +42,28 @@ def annotate(self, doc):
range_start = AnnoSpan(ne_span.start, ne_span.start + joiner_offsets[0][0], doc)
range_end = AnnoSpan(ne_span.start + joiner_offsets[0][1], ne_span.end, doc)
if is_valid_number(range_start.text):
numbers.append(range_start)
spacy_numbers.append(range_start)
if is_valid_number(range_end.text):
numbers.append(range_end)
spacy_numbers.append(range_end)

# Add purely numeric numbers that were not picked up by the NER.
# NB: The dates in SpaCy NEs may be longer than those in dates. The
# SpaCy date NEs are removed to prevent excessively long spans of text
# from being used to remove valid counts.
numbers += spacy_tokens.search_spans(r'[1-9]\d{0,6}')\
.without_overlaps(spacy_nes.without_overlaps(dates)).spans
numbers += spacy_numbers
# Add purely numeric tokens that were not picked up by the NER.
# NB: The dates in SpaCy NEs may be longer than the date annotations
# in the dates tier. The SpaCy date NEs are removed to prevent
# excessively long spans of text from being used to remove valid counts.
numbers += spacy_tokens.search_spans(r'[1-9]\d{0,6}').without_overlaps(
AnnoTier(spacy_numbers, presorted=True).without_overlaps(dates)
).spans
# Add delimited numbers
numbers += doc.create_regex_tier(
r'[1-9]\d{0,2}(( \d{3})+|(,\d{3})+)').spans
# Remove counts that overlap a date
filtered_numbers = []
for number_span, date_spans in AnnoTier(numbers).group_spans_by_containing_span(dates, allow_partial_containment=True):
date_annotations_and_spacy_ner_dates = dates + spacy_nes.with_label('DATE').without_overlaps(dates)
dates_by_number_span = AnnoTier(numbers).group_spans_by_containing_span(
date_annotations_and_spacy_ner_dates,
allow_partial_containment=True)
for number_span, date_spans in dates_by_number_span:
if len(date_spans) > 1:
continue
# If the number span completely contains the date span keep it
Expand Down
39 changes: 15 additions & 24 deletions epitator/structured_incident_annotator.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,24 +81,12 @@ class StructuredIncidentAnnotator(Annotator):
"""

def annotate(self, doc):
if 'structured_data' not in doc.tiers:
doc.add_tiers(StructuredDataAnnotator())
if 'geonames' not in doc.tiers:
doc.add_tiers(GeonameAnnotator())
if 'dates' not in doc.tiers:
doc.add_tiers(DateAnnotator())
if 'resolved_keywords' not in doc.tiers:
doc.add_tiers(ResolvedKeywordAnnotator())
if 'spacy.tokens' not in doc.tiers:
doc.add_tiers(SpacyAnnotator())
if 'raw_numbers' not in doc.tiers:
doc.add_tiers(RawNumberAnnotator())

geonames = doc.tiers['geonames']
dates = doc.tiers['dates']
resolved_keywords = doc.tiers['resolved_keywords']
spacy_tokens = doc.tiers['spacy.tokens']
numbers = doc.tiers['raw_numbers']
structured_data = doc.require_tiers('structured_data', via=StructuredDataAnnotator)
geonames = doc.require_tiers('geonames', via=GeonameAnnotator)
dates = doc.require_tiers('dates', via=DateAnnotator)
resolved_keywords = doc.require_tiers('resolved_keywords', via=ResolvedKeywordAnnotator)
spacy_tokens = doc.require_tiers('spacy.tokens', via=SpacyAnnotator)
numbers = doc.require_tiers('raw_numbers', via=RawNumberAnnotator)
species_list = []
disease_list = []
for k in resolved_keywords:
Expand Down Expand Up @@ -134,7 +122,7 @@ def annotate(self, doc):
tables = []
possible_titles = doc.create_regex_tier("[^\n]+\n")\
.chains(at_most=4, max_dist=0)\
.without_overlaps(doc.tiers['structured_data'])\
.without_overlaps(structured_data)\
.optimal_span_set(prefer='text_length_min_spans')
for span in doc.tiers['structured_data'].spans:
if span.metadata['type'] != 'table':
Expand Down Expand Up @@ -194,22 +182,22 @@ def annotate(self, doc):
column_values = AnnoTier(column_values)
# Choose column type based on greatest percent match,
# if under 30 percent, choose text.
max_matches = 0
max_score = 0
matching_column_entities = None
column_type = "text"
for value_type, value_spans in entities_by_type.items():
filtered_value_spans = value_spans
if value_type == "integer":
filtered_value_spans = value_spans.without_overlaps(dates)
column_entities = [
SpanGroup(contained_spans, metadata=combine_metadata(contained_spans)) if len(contained_spans) > 0 else None
for group_span, contained_spans in column_values.group_spans_by_containing_span(filtered_value_spans)]
num_matches = sum(
contained_spans is not None
for contained_spans in column_entities)
# Prefer other types like dates over integers if all else is equal.
match_score = num_matches + (0 if value_type == "integer" else 1)
if num_non_null_rows > 0 and float(num_matches) / num_non_null_rows > 0.3:
if num_matches > max_matches:
max_matches = num_matches
if match_score > max_score:
max_score = match_score
matching_column_entities = column_entities
column_type = value_type
if matching_column_entities is None:
Expand All @@ -228,6 +216,9 @@ def annotate(self, doc):
column_definitions = [
{'type': column_type}
for column_type in column_types]
# This is designed to detect rows that cover weekly or monthly intervals
# with some potentially missing entries.
# Irregular intervals are not detected.
date_period = None
for column_def, entities in zip(column_definitions, parsed_column_entities):
if column_def['type'] == 'date':
Expand Down
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ python-dateutil
dateparser==0.7.0
rdflib
six
spacy==2.0.16
spacy==2.0.17
pyparsing==2.2.0
regex==2018.01.10
https://github.com/explosion/spacy-models/releases/download/en_core_web_md-2.0.0/en_core_web_md-2.0.0.tar.gz
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@
'dateparser==0.7.0',
'geopy>=1.11.0',
'unicodecsv>=0.14.1',
'spacy==2.0.16',
'spacy==2.0.17',
'pyparsing==2.2.0',
'numpy>=1.15.0',
'rdflib>=4.2.2',
Expand Down
20 changes: 20 additions & 0 deletions tests/annotator/test_incident_annotator.py
Original file line number Diff line number Diff line change
Expand Up @@ -203,3 +203,23 @@ def test_location_grouping(self):
self.assertEqual(
len(find(doc.tiers['incidents'], lambda span: span.metadata['value'] == 3).metadata['locations']),
3)

def test_abiguous_date_count(self):
# Builds a document containing a slash-delimited table whose first column
# holds dates and whose remaining columns hold case counts, runs the
# annotator under test on it, and checks that the value 1907 (the "Cases"
# entry of the last row) appears among the resulting incident values.
# NOTE(review): presumably guards against counts that sit next to dates
# being discarded as part of the date -- confirm against the annotator.
doc = AnnoDoc("""
Dengue in West Virginia, United States:
Date / Cases / New cases per week
23 May 2018 / 106
29 Jun 2018 / 404 / 59
13 Jul 2018 / 540 / 68
9 Nov 2018 / 1774 / 49
16 Nov 2018 / 1859 / 85
23 Nov 2018 / 1907 / 48
""")
doc.add_tier(self.annotator)
# Only membership is asserted; other incident values are not checked.
self.assertTrue(1907 in [span.metadata['value'] for span in doc.tiers['incidents']])
20 changes: 20 additions & 0 deletions tests/annotator/test_structured_incident_annotator.py
Original file line number Diff line number Diff line change
Expand Up @@ -446,3 +446,23 @@ def test_multiline_title(self):
# name when determining column types. Simply giving integer interpretations
# priority in all cases doesn't work on docs like the one in test_unusual_format.
self.assertEqual(doc.tiers['structured_incidents'][0].metadata['location']['name'], 'Arizona')

# @with_log_level(logging.getLogger('epitator.structured_incident_annotator'), logging.INFO)
def test_missing_count_bug(self):
# Builds a two-column table (state name / number of cases) with six data
# rows, runs the annotator under test on it, and expects exactly one
# structured incident per data row.
doc = AnnoDoc("""
State / Number of Cases
Alabama / 25
Arizona / 6
Arkansas / 9
California / 54
Colorado / 18
Connecticut / 9
""")
doc.add_tier(self.annotator)
# Six data rows in the table above -> six entries in the tier.
self.assertEqual(len(doc.tiers['structured_incidents']), 6)

0 comments on commit 26a990d

Please sign in to comment.