In [1]:
!pip install -U spacy==2.3.5



In [2]:
# Print the spaCy version that the kernel actually loaded, so it can be
# compared against the version pinned in the install cell.
import spacy
print(spacy.__version__)

2.3.5


In [3]:
# Had to run this to make something below work.
# This is a donwgrade of numpy to a version without a bug.
# This is a temp solution until a new version of numpy has solved the bug.
!pip install numpy==1.19.3 --user



In [4]:
def is_float(n):
    """Return True if ``n`` can be parsed as a float, otherwise False.

    Strings may use either '.' or the Norwegian ',' as decimal separator
    (every ',' is replaced by '.' before parsing). Non-string values are
    passed to ``float()`` unchanged, so numbers are accepted and values
    such as ``None`` yield False instead of raising.

    Note: anything ``float()`` accepts counts, including 'nan' and 'inf'.
    """
    if isinstance(n, str):
        # Support floats written in Norwegian format, e.g. '4,5'.
        n = n.replace(',', '.')
    try:
        float(n)
    except (TypeError, ValueError):
        # TypeError: unsupported type (e.g. None); ValueError: unparsable text.
        return False
    return True
    
# Smoke test: print the verdict for a non-number, an integer string,
# a dot-decimal and a comma-decimal.
for sample in ('test', '4', '4.5', '4,5'):
    print(is_float(sample))

False
True
True
True


In [5]:
def is_int(n):
    """Return True if ``n`` parses as a number with no fractional part.

    ``n`` is first converted with ``float()`` and then truncated with
    ``int()``; the result is True only when both values are equal
    (so '4' and '4.0' are integers, '4.5' is not).

    Returns False, rather than raising, for input that cannot be parsed
    (ValueError), has an unsupported type such as None (TypeError), or is
    infinite (OverflowError from ``int(float('inf'))``).
    """
    try:
        float_n = float(n)
        int_n = int(float_n)
    except (TypeError, ValueError, OverflowError):
        return False
    return float_n == int_n

# Smoke test: print the verdict for a non-number, an integer string
# and a decimal string.
for sample in ('test', '4', '4.5'):
    print(is_int(sample))

False
True
False


In [6]:
# Sample sentences used as input for the matching cells below.
# Only the uncommented assignment is active; swap the leading '#' to try
# one of the other examples.
#text = "(2) Vessels constructed before 2 January 1988 shall be constructed and equipped in accordance with the rules applicable at the time of construction of the vessel, unless otherwise provided by the individual provisions of this chapter. (1) Vessels constructed after 1 January 1992 shall satisfy the requirements of Nordic Boat Standard (1990)"
#text = "(3) Vessels of 6 to 15 metres in overall length constructed between 1 January 1992 and 2002 with electrical installations with voltages of more than 50 V shall satisfy the requirements laid down in the now repealed Regulations of 1 March 1990 No. 125 for electrical installations - Maritime installations."
text = "(4) Vessels of less than 10.67 metres in overall length constructed after 1 January 1992 may as an alternative be equipped with a fire-extinguishing system in accordance with guidelines laid down in chapter C14 of Nordic Boat Standard (1990)."

In [7]:
import spacy
from spacy import displacy
from spacy.lang.en import English
from spacy.matcher import Matcher

# Lower-cased words that drive the DATE post-processing below.
# NOTE: "may" is both a month and a modal verb; a 4-digit number directly
# after the verb "may" would therefore be expanded as well.
MONTH_NAMES = (
    "january", "february", "march", "april", "may", "june",
    "july", "august", "september", "october", "november", "december",
)
DATE_SEPARATOR_WORDS = ("and", "to")
DATE_PREFIX_WORDS = ("between", "before", "after")


def _token_entity(token, label):
    """Return a displaCy entity dict (char offsets) covering a single token."""
    return {'start': token.idx, 'end': token.idx + len(token.text), 'label': label}


# Blank model.
# Does not have a POS tagger.
nlp = English()

doc = nlp(text)

matcher = Matcher(nlp.vocab)

#
# START - spaCy patterns
# (spaCy v2 signature: matcher.add(key, on_match_callback, *patterns))
#

# WATER_VESSEL
water_vessel_pattern = [{"LOWER": {"IN": ["vessels"]}}]
matcher.add("WATER_VESSEL", None, water_vessel_pattern)

# DATE - at this stage any 4-digit token, i.e. a bare year such as 2021.
matcher.add("DATE", None, [{'IS_DIGIT': True, 'LENGTH': 4}])

# CONSTRUCT
matcher.add("CONSTRUCT", None, [{"LOWER": {"IN": ["constructed"]}}])

#
# END - spaCy patterns
#

result = []

for match_id, token_start, token_end in matcher(doc):

    match_id_as_string = nlp.vocab.strings[match_id]
    final_token_start = token_start
    final_token_end = token_end

    if match_id_as_string == "DATE":

        # At this point, DATE is just a year string. Example: 2021

        # Expand DATE to the left: "<month> <year>" and then "<day> <month> <year>".
        # Every look-behind is bounds-checked: a negative index would silently
        # wrap around to the END of the doc instead of failing.
        if token_start >= 1 and doc[token_start - 1].text.lower() in MONTH_NAMES:
            final_token_start = token_start - 1 # expanding
            if token_start >= 2 and is_int(doc[token_start - 2].text):
                final_token_start = token_start - 2 # expanding

        # Look at the word directly in front of the (possibly expanded) DATE.
        if final_token_start >= 1:
            prev_word_on_date_token = doc[final_token_start - 1]
            # Compared lower-cased, like the month check and the matcher
            # patterns, so a sentence-initial "After"/"Between" is found too.
            prev_word = prev_word_on_date_token.text.lower()

            # Does the DATE have a DATE_SEPARATOR?
            if prev_word in DATE_SEPARATOR_WORDS:
                result.append(_token_entity(prev_word_on_date_token, "DATE_SEPARATOR"))

            # Does the DATE have a DATE_PREFIX?
            elif prev_word in DATE_PREFIX_WORDS:
                result.append(_token_entity(prev_word_on_date_token, "DATE_PREFIX"))

    #
    # convert token_span to char_span.
    # char_span is needed to display correctly with displacy.render().
    #
    span = doc[final_token_start:final_token_end]

    # return result
    identified_entity = {'start': span.start_char, 'end': span.end_char, 'label': match_id_as_string}
    result.append(identified_entity)

# display result with spacy styling
# (manual=True: displaCy renders the given dict instead of a Doc; entities
# are sorted by start offset before rendering)
display_this_result = {'text': text, 'ents': sorted(result, key=lambda x: x['start']), 'title': 0}
options = {"colors": {"DATE": "orange", "WATER_VESSEL": "aquamarine", "CONSTRUCT": "darksalmon", "DATE_SEPARATOR": "cornflowerblue", "DATE_PREFIX": "lightgoldenrodyellow"}}
displacy.render(display_this_result, style='ent', manual=True, options=options)

# Functions run from .py file

In [8]:
#
# Something that must be done 
# to make the notebook reload a function
# from a costum .py file.
#

%reload_ext autoreload
%autoreload 2

import importlib

import spacy_matching_rule_identify_build_date_en

from spacy_matching_rule_identify_build_date_en import identify_build_date_in_text

importlib.reload(spacy_matching_rule_identify_build_date_en)

################
################

identify_build_date_in_text(text)

[{'start': 4, 'end': 11, 'label': 'WATER_VESSEL'},
 {'start': 56, 'end': 67, 'label': 'CONSTRUCT'},
 {'start': 68, 'end': 73, 'label': 'DATE_PREFIX'},
 {'start': 74, 'end': 88, 'label': 'DATE'},
 {'start': 236, 'end': 240, 'label': 'DATE'}]