In [1]:
from tika import parser
import re
import os
import spacy
from spacy.pipeline import EntityRuler
from collections import OrderedDict
from spacy.lang.en.stop_words import STOP_WORDS as STOP_WORDS_EN
from spacy.lang.es.stop_words import STOP_WORDS as STOP_WORDS_ES
from spacy import displacy

STOP_WORDS = STOP_WORDS_EN | STOP_WORDS_ES

In [2]:
def read_text(f, max_chars=1000000):
    """Read a .txt or .pdf file and return its text cleaned up for spaCy.

    Parameters
    ----------
    f : str
        Path to the file. The extension (compared case-insensitively) selects
        the reader: '.txt' is read as UTF-8, '.pdf' is parsed with tika.
    max_chars : int, optional
        Maximum number of characters returned. Defaults to 1000000, the
        character limit noted for the spaCy nlp object in this notebook.

    Returns
    -------
    str
        The cleaned text with bullets, byte-order marks and dot leaders
        removed, ellipses replaced by '---', and all whitespace runs collapsed
        to single spaces. Empty if a PDF yields no text.

    Raises
    ------
    ValueError
        If the file extension is neither '.txt' nor '.pdf'.
    """
    _, file_extension = os.path.splitext(f)
    file_extension = file_extension.lower()
    if file_extension == '.txt':
        with open(f, encoding="utf8") as reader:
            text = reader.read()
    elif file_extension == '.pdf':
        raw = parser.from_file(f)
        text = raw['content']
        if not text:
            # tika found nothing; continue with an empty string instead of
            # failing on None in the clean-up steps below
            print('No text found')
            text = ''
    else:
        # previously this only printed and then failed on the undefined `text`
        raise ValueError("Incorrect file extension: %r" % file_extension)

    # drop bullets (requires utf8 encoding)
    text = text.replace('\u2022', '')
    text = text.replace('\u2219', '')
    # drop byte-order marks; the first call keeps the historical behaviour of
    # also removing a '?' that directly follows the mark
    text = text.replace('\ufeff?', '')
    text = text.replace('\ufeff', '')
    # drop periods when used in TOC, etc (i.e., multiple periods except ellipsis)
    text = re.sub(r'\.\.\.\.+', '', text)  # four or more dots
    # replace ellipses so they are not confused by spacy's sentence parser
    text = re.sub(r'\.\.\.', '---', text)  # three dots
    text = re.sub(r'\.\s\.\s\.\s', '---', text)  # three dots with spaces
    text = text.replace('\u2026', '---')  # ellipsis character
    # drop the remaining runs of two dots
    text = re.sub(r'\.\.', '', text)

    # remove excess whitespace (this also removes all new lines)
    text = ' '.join(text.split())

    return text[:max_chars]

In [3]:
# Load one report and display the cleaned text returned by read_text.
# NOTE(review): report_folder is an absolute, machine-specific path; the cell
# only runs where that folder and file exist.
report_folder = r'D:\data\essc-knowledge-base\reports_eng'
report = '04-14-Minnick-723_paper.pdf'
# `text` is reused by later cells as the input to nlp(...)
text = read_text(os.path.join(report_folder, report))
text

'Formalization of Artisanal and Small-Scale Gold Mining in Colombia FORMALIZATION OF ARTISANAL AND SMALL-SCALE GOLD MINING IN COLOMBIA: A PROPOSAL FOR IMPROVING ENVIRONMENTAL, SOCIAL, AND ECONOMIC PERFORMANCE IN A POST-CONFLICT SCENARIO GREGORY MINNICK, PETER DOYLE, THOMAS HENTSCHEL Chemonics Inc., Projekt-Consult GmbH gminnick@chemonics.com Paper prepared for presentation at the “2020 WORLD BANK CONFERENCE ON LAND AND POVERTY” The World Bank - Washington DC, March 16-20, 2020 Copyright 2020 by author(s). All rights reserved. Readers may make verbatim copies of this document for non-commercial purposes by any means, provided that this copyright notice appears on all such copies. Abstract Artisanal and small-scale gold mining (ASGM) is an important economic activity in many rural areas, with a long tradition in Colombia. High rates of informality undermine governance, security, and legal economies, and cause significant environmental and social impacts. As the government of Colombia con

In [4]:
# edited from scispacy Abbreviations module (add STOP WORDS to find_abbreviation)

from typing import Tuple, List, Optional, Set, Dict
from collections import defaultdict
from spacy.tokens import Span, Doc
from spacy.matcher import Matcher
from spacy.language import Language


def find_abbreviation(
    long_form_candidate: Span, short_form_candidate: Span,
    STOP_WORDS: Optional[Set[str]] = None
) -> Tuple[Span, Optional[Span]]:
    """
    Implements the abbreviation detection algorithm in "A simple algorithm
    for identifying abbreviation definitions in biomedical text.", (Schwartz & Hearst, 2003),
    extended so that characters belonging to stop words are never matched.

    The algorithm works by enumerating the characters in the short form of the abbreviation
    from right to left, checking that they can be matched against characters in a candidate
    text for the long form in order, as well as requiring that the first letter of the
    abbreviated form matches the _beginning_ letter of a word.

    Parameters
    ----------
    long_form_candidate: Span, required.
        The spaCy span for the long form candidate of the definition.
    short_form_candidate: Span, required.
        The spaCy span for the abbreviation candidate.
    STOP_WORDS: collection of str, optional.
        Token texts to skip while matching. The comparison is an exact,
        case-sensitive membership test on `token.text`. When None or empty,
        no token is skipped.

    Returns
    -------
    A Tuple[Span, Optional[Span]], representing the short form abbreviation and the
    span corresponding to the long form expansion, or None if a match is not found.
    """
    long_form = "".join([x.text_with_ws for x in long_form_candidate])
    short_form = "".join([x.text_with_ws for x in short_form_candidate])

    # One flag per character of long_form: True when the character belongs to a
    # stop-word token (trailing whitespace included). Without stop words nothing
    # is skipped; the previous fallback marked *every* character as skipped.
    stop_words = STOP_WORDS if STOP_WORDS else ()
    long_form_stop = [
        token.text in stop_words
        for token in long_form_candidate
        for _ in token.text_with_ws
    ]

    long_index = len(long_form) - 1
    short_index = len(short_form) - 1

    while short_index >= 0:
        current_char = short_form[short_index].lower()
        # We don't check non alpha-numeric characters.
        if not current_char.isalnum():
            short_index -= 1
            continue

        # Walk backwards through the long form while we are still inside it and ...
        while long_index >= 0 and (
            # ... the character does not match at this position ...
            long_form[long_index].lower() != current_char
            # ... or we are checking the first character of the abbreviation, which
            # must be the _starting_ character of a word ...
            or (
                short_index == 0
                and long_index > 0
                and long_form[long_index - 1].isalnum()
            )
            # ... or the character belongs to a stop word. The bounds guard above
            # keeps this lookup from wrapping around with a negative index.
            or long_form_stop[long_index]
        ):
            long_index -= 1

        if long_index < 0:
            return short_form_candidate, None

        long_index -= 1
        short_index -= 1

    # The last subtraction will either take us on to a whitespace character, or
    # off the front of the string (i.e. long_index == -1). Either way, we want to add
    # one to get back to the start character of the long form
    long_index += 1

    # Now we know the character index of the start of the character span,
    # here we just translate that to the first token beginning after that
    # value, so we can return a spaCy span instead.
    word_lengths = 0
    starting_index = None
    for i, word in enumerate(long_form_candidate):
        # text_with_ws already includes the trailing whitespace
        word_lengths += len(word.text_with_ws)
        if word_lengths > long_index:
            starting_index = i
            break

    return short_form_candidate, long_form_candidate[starting_index:]


def filter_matches(
    matcher_output: List[Tuple[int, int, int]], doc: Doc
) -> List[Tuple[Span, Span]]:
    """
    Turn parenthesis matches into (long form candidate, short form candidate) pairs.

    Parameters
    ----------
    matcher_output: list of (match_id, start, end) token offsets describing the
        content *inside* a pair of parentheses.
    doc: the document the offsets refer to.

    Returns
    -------
    A list of (long_form_candidate, short_form_candidate) spans whose short form
    passes `short_form_filter`.
    """
    # Filter into two cases:
    # 1. <Short Form> ( <Long Form> )
    # 2. <Long Form> (<Short Form>) [this case is most common].
    candidates = []
    for match in matcher_output:
        start = match[1]
        end = match[2]
        # Ignore spans with more than 8 words in them, and spans with no token
        # before the opening parenthesis. `start <= 1` also covers start == 0
        # (possible when called through `find`), which would otherwise produce
        # negative slice bounds below.
        if end - start > 8 or start <= 1:
            continue
        if end - start > 3:
            # Long form is inside the parens.
            # Take one word before.
            short_form_candidate = doc[start - 2 : start - 1]
            long_form_candidate = doc[start:end]
        else:
            # Normal case.
            # Short form is inside the parens.
            short_form_candidate = doc[start:end]

            # Sum character lengths of contents of parens.
            abbreviation_length = sum([len(x) for x in short_form_candidate])
            max_words = min(abbreviation_length + 5, abbreviation_length * 2)
            # Look up to max_words backwards
            long_form_candidate = doc[max(start - max_words - 1, 0) : start - 1]
        # add candidate to candidates if candidates pass filters
        if short_form_filter(short_form_candidate):
            candidates.append((long_form_candidate, short_form_candidate))

    return candidates


def short_form_filter(span: Span) -> bool:
    """
    Decide whether a span looks like a plausible abbreviation (short form).

    Returns True only if the span is non-empty, every token has 2 to 9
    characters, at least half of the characters of the span text are
    alphabetic, and the first character is alphabetic.
    """
    text = span.text
    # An empty span can never be an abbreviation (and would divide by zero below).
    if not text:
        return False

    # All words have a length of at least 2 and less than 10
    if not all(2 <= len(x) < 10 for x in span):
        return False

    # At least 50% of the short form should be alpha
    if (sum(c.isalpha() for c in text) / len(text)) < 0.5:
        return False

    # The first character of the short form should be alpha
    return text[0].isalpha()

# # add function to add entity from match
# def add_event_ent(matcher, doc, i, matches):
#     # Get the current match and create tuple of entity label, start and end.
#     # Append entity to the doc's entity. (Don't overwrite doc.ents!)
#     match_id, start, end = matches[i]
#     entity = Span(doc, start, end, label="EVENT")
#     doc.ents += (entity,)
#     print(entity.text)


@Language.factory('AbbreviationDetector')
class AbbreviationDetector:
    """
    Pipeline component that detects abbreviations with the algorithm from
    "A simple algorithm for identifying abbreviation definitions in biomedical
    text.", (Schwartz & Hearst, 2003).

    Calling the component on a Doc fills `doc._.abbreviations` with a
    `List[Span]`; each of those spans carries its definition in
    `Span._.long_form`.

    The spans themselves are neither replaced nor merged.
    """

    def __init__(self, nlp, name) -> None:
        # Register the custom attributes this component writes to.
        Doc.set_extension("abbreviations", default=[], force=True)
        Span.set_extension("long_form", default=None, force=True)

        # "(" followed by tokens matched with OP "+" and a closing ")".
        parenthesis_pattern = [{"ORTH": "("}, {"OP": "+"}, {"ORTH": ")"}]
        self.matcher = Matcher(nlp.vocab)
        self.matcher.add("parenthesis", [parenthesis_pattern])
        # Receives one temporary rule per detected short form (see find_matches_for).
        self.global_matcher = Matcher(nlp.vocab)
        # Instance name passed in on construction.
        self.name = name

    def find(self, span: Span, doc: Doc) -> Tuple[Span, Set[Span]]:
        """
        Look up the definition of a single, already known abbreviation span.

        Returns the first (long form, short forms) pair found, or
        `(span, set())` when nothing is found.
        """
        candidates = filter_matches([(-1, int(span.start), int(span.end))], doc)
        found = self.find_matches_for(candidates, doc)
        if found:
            return found[0]
        return span, set()

    def __call__(self, doc: Doc) -> Doc:
        # Shrink every match by one token on each side to drop the brackets.
        inner_spans = [
            (match_id, start + 1, end - 1)
            for match_id, start, end in self.matcher(doc)
        ]
        candidates = filter_matches(inner_spans, doc)

        for long_form, short_forms in self.find_matches_for(candidates, doc):
            for short in short_forms:
                short._.long_form = long_form
                doc._.abbreviations.append(short)
        return doc

    def find_matches_for(
        self, filtered: List[Tuple[Span, Span]], doc: Doc
    ) -> List[Tuple[Span, Set[Span]]]:
        """
        Resolve candidate pairs into (long form, {short form spans}) entries and
        collect every other occurrence of each short form in `doc`.
        """
        long_by_text = {}
        all_occurences: Dict[Span, Set[Span]] = defaultdict(set)
        seen_long_texts: Set[str] = set()
        seen_short_texts: Set[str] = set()

        for long_candidate, short_candidate in filtered:
            short, long = find_abbreviation(long_candidate, short_candidate, STOP_WORDS)
            # Long and short form definitions must be unique because they are
            # stored for later lookup. A repeat would mean the abbreviation was
            # defined twice in the document; the discarded definition is still
            # picked up by the global matcher below.
            if long is None or not long:
                continue
            if long.text in seen_long_texts or short.text in seen_short_texts:
                continue

            seen_long_texts.add(long.text)
            seen_short_texts.add(short.text)
            all_occurences[long].add(short)
            long_by_text[long.text] = long
            # Rule keyed by the long form text that finds exactly the short form tokens.
            short_pattern = [{"ORTH": token.text} for token in short]
            self.global_matcher.add(long.text, [short_pattern])

        matched_keys = set()
        for match_id, start, end in self.global_matcher(doc):
            rule_key = self.global_matcher.vocab.strings[match_id]
            matched_keys.add(rule_key)
            all_occurences[long_by_text[rule_key]].add(doc[start:end])

        # Clean up the global matcher.
        for rule_key in matched_keys:
            self.global_matcher.remove(rule_key)

        return list(all_occurences.items())

In [4]:
from typing import Tuple, List, Optional, Set, Dict
from collections import defaultdict
from spacy.tokens import Span, Doc
from spacy.matcher import Matcher
from spacy.language import Language


def find_abbreviation(
    long_form_candidate: Span, short_form_candidate: Span,
    STOP_WORDS: Optional[Set[str]] = None
) -> Tuple[Span, Optional[Span]]:
    """
    Implements the abbreviation detection algorithm in "A simple algorithm
    for identifying abbreviation definitions in biomedical text.", (Schwartz & Hearst, 2003),
    with one addition: characters of stop-word tokens are never matched.

    The algorithm works by enumerating the characters in the short form of the abbreviation
    from right to left, checking that they can be matched against characters in a candidate
    text for the long form in order, as well as requiring that the first letter of the
    abbreviated form matches the _beginning_ letter of a word.

    Parameters
    ----------
    long_form_candidate: Span, required.
        The spaCy span for the long form candidate of the definition.
    short_form_candidate: Span, required.
        The spaCy span for the abbreviation candidate.
    STOP_WORDS: collection of str, optional.
        Token texts that must not contribute characters to the match. Tested
        with an exact (case-sensitive) `token.text in STOP_WORDS`. None or an
        empty collection disables the skipping.

    Returns
    -------
    A Tuple[Span, Optional[Span]], representing the short form abbreviation and the
    span corresponding to the long form expansion, or None if a match is not found.
    """
    long_form = "".join([x.text_with_ws for x in long_form_candidate])
    short_form = "".join([x.text_with_ws for x in short_form_candidate])

    # Helper table, aligned with long_form character by character: True marks a
    # character that belongs to a stop word. When no stop words are supplied all
    # entries are False (the former all-True fallback skipped every character).
    skip_words = STOP_WORDS or ()
    long_form_stop = []
    for token in long_form_candidate:
        long_form_stop.extend([token.text in skip_words] * len(token.text_with_ws))

    long_index = len(long_form) - 1
    short_index = len(short_form) - 1

    while short_index >= 0:
        current_char = short_form[short_index].lower()
        # We don't check non alpha-numeric characters.
        if not current_char.isalnum():
            short_index -= 1
            continue

        # Keep moving left while still inside the long form and any of:
        while long_index >= 0 and (
            # the character does not match at this position,
            long_form[long_index].lower() != current_char
            # the first character of the abbreviation is not at the start of a word,
            or (
                short_index == 0
                and long_index > 0
                and long_form[long_index - 1].isalnum()
            )
            # the character is part of a stop word (index is known to be valid here,
            # so the table is never read with a negative index).
            or long_form_stop[long_index]
        ):
            long_index -= 1

        if long_index < 0:
            return short_form_candidate, None

        long_index -= 1
        short_index -= 1

    # The last subtraction will either take us on to a whitespace character, or
    # off the front of the string (i.e. long_index == -1). Either way, we want to add
    # one to get back to the start character of the long form
    long_index += 1

    # Now we know the character index of the start of the character span,
    # here we just translate that to the first token beginning after that
    # value, so we can return a spaCy span instead.
    word_lengths = 0
    starting_index = None
    for i, word in enumerate(long_form_candidate):
        # text_with_ws already accounts for the space characters
        word_lengths += len(word.text_with_ws)
        if word_lengths > long_index:
            starting_index = i
            break

    return short_form_candidate, long_form_candidate[starting_index:]


def filter_matches(
    matcher_output: List[Tuple[int, int, int]], doc: Doc
) -> List[Tuple[Span, Span]]:
    """
    Build (long form candidate, short form candidate) pairs from matches.

    Parameters
    ----------
    matcher_output: (match_id, start, end) triples; start/end are token
        offsets of the text between a pair of parentheses.
    doc: the document being processed.

    Returns
    -------
    The pairs whose short form candidate is accepted by `short_form_filter`,
    each as (long_form_candidate, short_form_candidate).
    """
    # Filter into two cases:
    # 1. <Short Form> ( <Long Form> )
    # 2. <Long Form> (<Short Form>) [this case is most common].
    candidates = []
    for match in matcher_output:
        start = match[1]
        end = match[2]
        # Ignore spans with more than 8 words in them, and spans that have no
        # token in front of the opening parenthesis. start == 0 can occur when
        # `find` passes a span at the very start of the doc; without this guard
        # the slices below would use negative bounds.
        if end - start > 8 or start <= 1:
            continue
        if end - start > 3:
            # Long form is inside the parens.
            # Take one word before.
            short_form_candidate = doc[start - 2 : start - 1]
            long_form_candidate = doc[start:end]
        else:
            # Normal case.
            # Short form is inside the parens.
            short_form_candidate = doc[start:end]

            # Sum character lengths of contents of parens.
            abbreviation_length = sum([len(x) for x in short_form_candidate])
            max_words = min(abbreviation_length + 5, abbreviation_length * 2)
            # Look up to max_words backwards
            long_form_candidate = doc[max(start - max_words - 1, 0) : start - 1]

        # add candidate to candidates if candidates pass filters
        if short_form_filter(short_form_candidate):
            candidates.append((long_form_candidate, short_form_candidate))

    return candidates


def short_form_filter(span: Span) -> bool:
    """
    Check whether a span is acceptable as an abbreviation (short form).

    The span must be non-empty, each of its tokens must be 2 to 9 characters
    long, at least 50% of the characters in its text must be alphabetic, and
    its text must start with an alphabetic character.
    """
    span_text = span.text
    # Reject empty spans up front; the ratio below would divide by zero.
    if not span_text:
        return False

    # All words have a length of at least 2 and less than 10
    if not all(2 <= len(x) < 10 for x in span):
        return False

    # At least 50% of the short form should be alpha
    alpha_count = sum(c.isalpha() for c in span_text)
    if alpha_count / len(span_text) < 0.5:
        return False

    # The first character of the short form should be alpha
    return span_text[0].isalpha()


@Language.factory("abbreviation_detector")
class AbbreviationDetector:
    """
    Abbreviation detection component based on "A simple algorithm for identifying
    abbreviation definitions in biomedical text.", (Schwartz & Hearst, 2003).
    After the component has run, `doc._.abbreviations` holds a `List[Span]`, and
    every span in it exposes its definition through `Span._.long_form`.
    Spans are left untouched: nothing is replaced or merged.
    Parameters
    ----------
    nlp: `Language`, a required argument for spacy to use this as a factory
    name: `str`, a required argument for spacy to use this as a factory
    """

    def __init__(self, nlp: Language, name: str = "abbreviation_detector") -> None:
        # Custom attributes written by __call__.
        Doc.set_extension("abbreviations", default=[], force=True)
        Span.set_extension("long_form", default=None, force=True)

        bracketed = [{"ORTH": "("}, {"OP": "+"}, {"ORTH": ")"}]
        self.matcher = Matcher(nlp.vocab)
        self.matcher.add("parenthesis", [bracketed])
        # Holds temporary per-document rules, one per detected short form.
        self.global_matcher = Matcher(nlp.vocab)

    def find(self, span: Span, doc: Doc) -> Tuple[Span, Set[Span]]:
        """
        Functional entry point for a single span: use it when an abbreviation
        is already known and only its definition is wanted. Falls back to
        `(span, set())` when no definition is found.
        """
        single_match = (-1, int(span.start), int(span.end))
        results = self.find_matches_for(filter_matches([single_match], doc), doc)
        return results[0] if results else (span, set())

    def __call__(self, doc: Doc) -> Doc:
        # Drop the bracket tokens from both ends of every parenthesis match.
        without_brackets = []
        for match_id, start, end in self.matcher(doc):
            without_brackets.append((match_id, start + 1, end - 1))

        pairs = filter_matches(without_brackets, doc)
        for long_form, short_forms in self.find_matches_for(pairs, doc):
            for short in short_forms:
                short._.long_form = long_form
                doc._.abbreviations.append(short)
        return doc

    def find_matches_for(
        self, filtered: List[Tuple[Span, Span]], doc: Doc
    ) -> List[Tuple[Span, Set[Span]]]:
        """
        Run `find_abbreviation` on each candidate pair, then gather all further
        occurrences of every accepted short form in `doc`.
        """
        definitions = {}
        all_occurences: Dict[Span, Set[Span]] = defaultdict(set)
        known_long: Set[str] = set()
        known_short: Set[str] = set()

        for long_candidate, short_candidate in filtered:
            short, long = find_abbreviation(long_candidate, short_candidate, STOP_WORDS)
            # Definitions have to be unique in both directions, since they are
            # stored and looked up by text afterwards. A duplicate means the
            # abbreviation was defined twice; the dropped one is still found by
            # the global matcher further down.
            is_new_long = bool(long) and long.text not in known_long
            is_new_short = short.text not in known_short
            if long is None or not (is_new_long and is_new_short):
                continue

            known_long.add(long.text)
            known_short.add(short.text)
            all_occurences[long].add(short)
            definitions[long.text] = long
            # Add a rule to a matcher to find exactly this substring.
            self.global_matcher.add(long.text, [[{"ORTH": tok.text} for tok in short]])

        used_keys = set()
        for match_id, start, end in self.global_matcher(doc):
            key = self.global_matcher.vocab.strings[match_id]
            used_keys.add(key)
            all_occurences[definitions[key]].add(doc[start:end])

        # Clean up the global matcher.
        for key in used_keys:
            self.global_matcher.remove(key)

        return [(long_span, short_spans) for long_span, short_spans in all_occurences.items()]

In [5]:
# abbreviation_pipe = AbbreviationDetector(nlp)
# Load the "en_core_web_sm" pipeline and add the component by its registered
# factory name, placed before 'ner'. The name must already have been registered
# with @Language.factory("abbreviation_detector") in an earlier cell.
nlp = spacy.load("en_core_web_sm")

nlp.add_pipe("abbreviation_detector", before='ner')
# Display the component order to confirm where the detector was inserted.
nlp.pipe_names

['tok2vec',
 'tagger',
 'parser',
 'abbreviation_detector',
 'ner',
 'attribute_ruler',
 'lemmatizer']

In [6]:
doc = nlp(text)

In [7]:
# Unique (short form text, long form span) pairs detected in the document.
{(str(short_span), short_span._.long_form) for short_span in doc._.abbreviations}

{('ANC', Agencia Nacional Minera),
 ('ASGM', Artisanal and small-scale gold mining),
 ('ASM', Artisanal and Small Miners),
 ('EIAs', Environmental Impact Studies),
 ('GOC', government of Colombia),
 ('IDEAM', Instituto de Hidrología, Meteorología y Estudios Ambientales),
 ('JAA', Jaime Arteaga & Asociados),
 ('MME', Ministry of Mines and Energy),
 ('MPUs', mining production units),
 ('NGOs', non- governmental organizations),
 ('NUFP', National Unified Formalization Plan),
 ('OECD', Organization for Economic Cooperation and Development),
 ('SBGA', Swiss Better Gold Association),
 ('SECO', Swiss Economic Cooperation and Development),
 ('UNEP', United Nations Environmental Programme),
 ('UNODC', United Nations Office for Drugs and Crime),
 ('USAID', United States Agency for International Development)}

In [9]:
doc

Formalization of Artisanal and Small-Scale Gold Mining in Colombia FORMALIZATION OF ARTISANAL AND SMALL-SCALE GOLD MINING IN COLOMBIA: A PROPOSAL FOR IMPROVING ENVIRONMENTAL, SOCIAL, AND ECONOMIC PERFORMANCE IN A POST-CONFLICT SCENARIO GREGORY MINNICK, PETER DOYLE, THOMAS HENTSCHEL Chemonics Inc., Projekt-Consult GmbH gminnick@chemonics.com Paper prepared for presentation at the “2020 WORLD BANK CONFERENCE ON LAND AND POVERTY” The World Bank - Washington DC, March 16-20, 2020 Copyright 2020 by author(s). All rights reserved. Readers may make verbatim copies of this document for non-commercial purposes by any means, provided that this copyright notice appears on all such copies. Abstract Artisanal and small-scale gold mining (ASGM) is an important economic activity in many rural areas, with a long tradition in Colombia. High rates of informality undermine governance, security, and legal economies, and cause significant environmental and social impacts. As the government of Colombia cons

In [8]:
# Unique (entity text, entity label) pairs found in the document.
{(entity.text, entity.label_) for entity in doc.ents}

{('1', 'CARDINAL'),
 ('1,300', 'CARDINAL'),
 ('1.5', 'CARDINAL'),
 ('104 tons', 'QUANTITY'),
 ('11', 'CARDINAL'),
 ('12', 'CARDINAL'),
 ('12 percent', 'PERCENT'),
 ('13', 'CARDINAL'),
 ('135', 'CARDINAL'),
 ('14', 'CARDINAL'),
 ('14,357', 'CARDINAL'),
 ('15', 'CARDINAL'),
 ('150 tons', 'QUANTITY'),
 ('16', 'CARDINAL'),
 ('1666 of 2016', 'DATE'),
 ('17', 'CARDINAL'),
 ('18 years', 'DATE'),
 ('1990s-era', 'DATE'),
 ('1998', 'DATE'),
 ('2', 'CARDINAL'),
 ('2002', 'DATE'),
 ('2009', 'DATE'),
 ('2010', 'DATE'),
 ('2011', 'DATE'),
 ('2011/2012,3', 'CARDINAL'),
 ('2012', 'DATE'),
 ('2013', 'DATE'),
 ('2014', 'DATE'),
 ('2015', 'DATE'),
 ('2016', 'DATE'),
 ('2017', 'DATE'),
 ('2018', 'DATE'),
 ('2019', 'DATE'),
 ('2020', 'DATE'),
 ('2022', 'DATE'),
 ('2022 5', 'DATE'),
 ('2108', 'DATE'),
 ('250,000', 'MONEY'),
 ('29', 'CARDINAL'),
 ('3', 'CARDINAL'),
 ('3,000 kg', 'QUANTITY'),
 ('3,584', 'CARDINAL'),
 ('300', 'CARDINAL'),
 ('32', 'CARDINAL'),
 ('37th', 'ORDINAL'),
 ('4', 'CARDINAL'),
 ('4,133'

In [8]:
doc

Formalization of Artisanal and Small-Scale Gold Mining in Colombia FORMALIZATION OF ARTISANAL AND SMALL-SCALE GOLD MINING IN COLOMBIA: A PROPOSAL FOR IMPROVING ENVIRONMENTAL, SOCIAL, AND ECONOMIC PERFORMANCE IN A POST-CONFLICT SCENARIO GREGORY MINNICK, PETER DOYLE, THOMAS HENTSCHEL Chemonics Inc., Projekt-Consult GmbH gminnick@chemonics.com Paper prepared for presentation at the “2020 WORLD BANK CONFERENCE ON LAND AND POVERTY” The World Bank - Washington DC, March 16-20, 2020 Copyright 2020 by author(s). All rights reserved. Readers may make verbatim copies of this document for non-commercial purposes by any means, provided that this copyright notice appears on all such copies. Abstract Artisanal and small-scale gold mining (ASGM) is an important economic activity in many rural areas, with a long tradition in Colombia. High rates of informality undermine governance, security, and legal economies, and cause significant environmental and social impacts. As the government of Colombia cons

In [148]:
nlp = spacy.load("en_core_web_sm")

# spaCy v3 no longer accepts a component instance in add_pipe: pass the name the
# factory was registered under with @Language.factory and keep the component
# that add_pipe returns.
abbreviation_pipe = nlp.add_pipe("abbreviation_detector", before='ner')

In [163]:
# text = "the United States Agency (USA) wrote this text"

In [164]:
doc = nlp(text)

In [1]:
# Unique (short form text, long form span) pairs; requires `doc` from the cell above.
{(str(short_span), short_span._.long_form) for short_span in doc._.abbreviations}

NameError: name 'doc' is not defined

In [7]:
# edited from scispacy Abbreviations module (add STOP WORDS to find_abbreviation)

from typing import Tuple, List, Optional, Set, Dict
from collections import defaultdict
from spacy.tokens import Span, Doc
from spacy.matcher import Matcher
from spacy.language import Language


def find_abbreviation(
    long_form_candidate: Span, short_form_candidate: Span,
    STOP_WORDS: Optional[Set[str]] = None
) -> Tuple[Span, Optional[Span]]:
    """
    Implements the abbreviation detection algorithm in "A simple algorithm
    for identifying abbreviation definitions in biomedical text.", (Schwartz & Hearst, 2003),
    skipping characters that belong to stop-word tokens.

    The algorithm works by enumerating the characters in the short form of the abbreviation
    from right to left, checking that they can be matched against characters in a candidate
    text for the long form in order, as well as requiring that the first letter of the
    abbreviated form matches the _beginning_ letter of a word.

    Parameters
    ----------
    long_form_candidate: Span, required.
        The spaCy span for the long form candidate of the definition.
    short_form_candidate: Span, required.
        The spaCy span for the abbreviation candidate.
    STOP_WORDS: collection of str, optional.
        Extra token texts to skip in addition to tokens whose `is_stop` flag is
        set. Accepted so that callers in this notebook that pass a stop-word
        set as third argument keep working.

    Returns
    -------
    A Tuple[Span, Optional[Span]], representing the short form abbreviation and the
    span corresponding to the long form expansion, or None if a match is not found.
    """
    # e.g. "Abstract Artisanal and small-scale gold mining " / "ASGM"
    long_form = "".join([x.text_with_ws for x in long_form_candidate])
    short_form = "".join([x.text_with_ws for x in short_form_candidate])

    # Helper list for skipping stop words: one flag per character of long_form
    # (trailing whitespace of each token included), True when the character
    # belongs to a stop-word token.
    extra_stop_words = STOP_WORDS if STOP_WORDS else ()
    long_form_stop = [
        bool(token.is_stop) or token.text in extra_stop_words
        for token in long_form_candidate
        for _ in token.text_with_ws
    ]

    long_index = len(long_form) - 1
    short_index = len(short_form) - 1

    while short_index >= 0:
        current_char = short_form[short_index].lower()
        # Walk backwards through long_form to find current_char
        # We don't check non alpha-numeric characters.
        if not current_char.isalnum():
            short_index -= 1
            continue

        # Keep stepping left while still inside the long form and ...
        while long_index >= 0 and (
            # ... the character does not match at this position ...
            long_form[long_index].lower() != current_char
            # ... or we are checking the first character of the abbreviation, which
            # must be the _starting_ character of a word ...
            or (
                short_index == 0
                and long_index > 0
                and long_form[long_index - 1].isalnum()
            )
            # ... or the character belongs to a stop word (never indexed out of range
            # thanks to the bounds check above).
            or long_form_stop[long_index]
        ):
            long_index -= 1  # decrement after each miss until we run off the front

        if long_index < 0:
            return short_form_candidate, None

        long_index -= 1
        short_index -= 1

    # The last subtraction will either take us on to a whitespace character, or
    # off the front of the string (i.e. long_index == -1). Either way, we want to add
    # one to get back to the start character of the long form
    long_index += 1

    # Now we know the character index of the start of the character span,
    # here we just translate that to the first token beginning after that
    # value, so we can return a spaCy span instead.
    word_lengths = 0
    starting_index = None
    for i, word in enumerate(long_form_candidate):
        # text_with_ws already includes the space characters
        word_lengths += len(word.text_with_ws)
        if word_lengths > long_index:
            starting_index = i
            break

    return short_form_candidate, long_form_candidate[starting_index:]


def filter_matches(
    matcher_output: List[Tuple[int, int, int]], doc: Doc
) -> List[Tuple[Span, Span]]:
    """
    Turn raw parenthesis matches into (long form candidate, short form candidate) pairs.

    Two layouts are handled:
    1. <Short Form> ( <Long Form> )
    2. <Long Form> ( <Short Form> )   [this case is most common]

    Parameters
    ----------
    matcher_output: List[Tuple[int, int, int]], required.
        (match_id, start, end) triples where start/end are token indices of the
        text *inside* the parentheses (brackets already stripped by the caller).
    doc: Doc, required.
        The document the token indices refer to.

    Returns
    -------
    A list of (long_form_candidate, short_form_candidate) tuples for every match
    whose short form passes `short_form_filter`.
    """
    candidates = []
    for match in matcher_output:
        start = match[1]
        end = match[2]
        # Ignore spans with more than 8 words in them, and spans at the start of the doc.
        # `start <= 1` (rather than `start == 1`) also rejects start == 0: in that case
        # `start - 1` is negative and the slices below would wrap around to the end
        # of the document instead of selecting the tokens before the span.
        if end - start > 8 or start <= 1:
            continue
        if end - start > 3:
            # Long form is inside the parens.
            # Take one word before (start - 1 is the opening bracket).
            short_form_candidate = doc[start - 2 : start - 1]
            long_form_candidate = doc[start:end]
        else:
            # Normal case.
            # Short form is inside the parens.
            short_form_candidate = doc[start:end]

            # Sum character lengths of contents of parens.
            abbreviation_length = sum(len(x) for x in short_form_candidate)
            max_words = min(abbreviation_length + 5, abbreviation_length * 2)
            # Look up to max_words backwards, stopping before the opening bracket.
            long_form_candidate = doc[max(start - max_words - 1, 0) : start - 1]
        # add candidate to candidates if candidates pass filters
        if short_form_filter(short_form_candidate):
            candidates.append((long_form_candidate, short_form_candidate))

    return candidates


def short_form_filter(span: Span) -> bool:
    """
    Decide whether a span looks like a plausible abbreviation (short form).

    Returns True only when the span is non-empty, every token has between 2 and 9
    characters, at least half of the characters of the span text are alphabetic,
    and the first character of the span text is alphabetic.
    """
    text = span.text
    # An empty span cannot be an abbreviation; bail out before dividing by len(text).
    if not text:
        return False

    # All words have a length of at least 2 and strictly less than 10
    if not all(2 <= len(token) < 10 for token in span):
        return False

    # At least 50% of the short form should be alpha
    if sum(c.isalpha() for c in text) / len(text) < 0.5:
        return False

    # The first character of the short form should be alpha
    return text[0].isalpha()

@Language.factory('AbbreviationDetector')
class AbbreviationDetector:
    """
    Detects abbreviations using the algorithm in "A simple algorithm for identifying
    abbreviation definitions in biomedical text.", (Schwartz & Hearst, 2003).

    This class sets the `._.abbreviations` attribute on spaCy Doc.

    The abbreviations attribute is a `List[Span]` where each Span has the `Span._.long_form`
    attribute set to the long form definition of the abbreviation.

    Note that this class does not replace the spans, or merge them.
    """

    def __init__(self, nlp, name) -> None:  # add name on init as instance name
        """
        Register the custom extensions and build the matchers.

        Parameters
        ----------
        nlp: the pipeline object; only `nlp.vocab` is used here.
        name: the component instance name, stored on `self.name`.
        """
        # force=True lets the extensions be re-registered when the cell is re-run.
        Doc.set_extension("abbreviations", default=[], force=True)
        Span.set_extension("long_form", default=None, force=True)

        # Matches "(" + one or more tokens + ")".
        self.matcher = Matcher(nlp.vocab)
        self.matcher.add(
            "parenthesis", [[{"ORTH": "("}, {"OP": "+"}, {"ORTH": ")"}]]
        )  # remove 'None' for on_match (now optional), pass pattern as list of list
        # Holds per-call rules used to find further occurrences of each short form.
        self.global_matcher = Matcher(nlp.vocab)
        self.name = name

    def find(self, span: Span, doc: Doc) -> Tuple[Span, Set[Span]]:
        """
        Functional version of calling the matcher for a single span.
        This method is helpful if you already have an abbreviation which
        you want to find a definition for.

        Returns the first (long form, set of short form spans) pair found, or
        `(span, set())` when no definition is found.
        """
        dummy_matches = [(-1, int(span.start), int(span.end))]
        filtered = filter_matches(dummy_matches, doc)
        abbreviations = self.find_matches_for(filtered, doc)

        if not abbreviations:
            return span, set()
        else:
            return abbreviations[0]

    def __call__(self, doc: Doc) -> Doc:
        """Annotate `doc._.abbreviations` and each short form's `_.long_form`; return doc."""
        matches = self.matcher(doc)
        # Drop the opening and closing bracket tokens from every match.
        matches_no_brackets = [(x[0], x[1] + 1, x[2] - 1) for x in matches]
        filtered = filter_matches(matches_no_brackets, doc)
        occurences = self.find_matches_for(filtered, doc)

        for (long_form, short_forms) in occurences:
            for short in short_forms:
                short._.long_form = long_form
                doc._.abbreviations.append(short)
        return doc

    def find_matches_for(
        self, filtered: List[Tuple[Span, Span]], doc: Doc
    ) -> List[Tuple[Span, Set[Span]]]:
        """
        Resolve candidate pairs into definitions and collect every occurrence of
        each defined short form in `doc`.

        Parameters
        ----------
        filtered: (long_form_candidate, short_form_candidate) pairs from `filter_matches`.
        doc: the document to search for further occurrences of each short form.

        Returns
        -------
        A list of (long form span, set of short form spans) tuples.
        """
        rules = {}
        all_occurences: Dict[Span, Set[Span]] = defaultdict(set)
        already_seen_long: Set[str] = set()
        already_seen_short: Set[str] = set()
        # Keys added to the shared matcher during this call, so they can always be removed.
        added_keys: List[str] = []
        try:
            for (long_candidate, short_candidate) in filtered:
                short, long = find_abbreviation(long_candidate, short_candidate)
                # We need the long and short form definitions to be unique, because we need
                # to store them so we can look them up later. This is a bit of a
                # pathological case also, as it would mean an abbreviation had been
                # defined twice in a document. There's not much we can do about this,
                # but at least the case which is discarded will be picked up below by
                # the global matcher. So it's likely that things will work out ok most of the time.
                if not long:
                    continue
                # spaCy 3: Span.string no longer exists, use Span.text.
                if long.text in already_seen_long or short.text in already_seen_short:
                    continue
                already_seen_long.add(long.text)
                already_seen_short.add(short.text)
                all_occurences[long].add(short)
                rules[long.text] = long
                # Add a rule to a matcher to find exactly this substring.
                # spaCy 3 signature: add(key, patterns) with patterns as a list of lists.
                self.global_matcher.add(
                    long.text, [[{"ORTH": x.text} for x in short]]
                )
                added_keys.append(long.text)

            for match, start, end in self.global_matcher(doc):
                string_key = self.global_matcher.vocab.strings[match]
                all_occurences[rules[string_key]].add(doc[start:end])
        finally:
            # Clean up the global matcher even if something above raised, so rules from
            # this call cannot leak into the next one (where `rules` would not know them).
            for key in added_keys:
                self.global_matcher.remove(key)

        return list(all_occurences.items())

TypeError: <class '__main__.AbbreviationDetector'> is a built-in class

In [5]:
# abbreviation_pipe = AbbreviationDetector(nlp)
# Load the small English pipeline that the abbreviation detector is added to.
nlp = spacy.load("en_core_web_sm")

# Add the component by its registered factory name, placing it before 'ner'.
nlp.add_pipe("AbbreviationDetector", before='ner')
# Show the resulting component order.
nlp.pipe_names

['tok2vec',
 'tagger',
 'parser',
 'AbbreviationDetector',
 'ner',
 'attribute_ruler',
 'lemmatizer']

In [6]:
# Run the full pipeline (including the abbreviation detector) on `text`.
# NOTE(review): `text` is defined in a cell outside this view — presumably the
# output of read_text(); confirm it exists on a fresh Restart & Run All.
doc = nlp(text)

ValueError: [E109] Component 'AbbreviationDetector' could not be run. Did you forget to call `initialize()`?

In [8]:
# my original attempt to convert to spacy 3.0 (now available as updated on github)

In [None]:
# edited from scispacy Abbreviations module (add STOP WORDS to find_abbreviation)

from typing import Tuple, List, Optional, Set, Dict
from collections import defaultdict
from spacy.tokens import Span, Doc
from spacy.matcher import Matcher
from spacy.language import Language


def find_abbreviation(
    long_form_candidate: Span, short_form_candidate: Span,
    STOP_WORDS: Optional[Set[str]] = None
) -> Tuple[Span, Optional[Span]]:
    """
    Implements the abbreviation detection algorithm in "A simple algorithm
    for identifying abbreviation definitions in biomedical text.", (Schwartz & Hearst, 2003).

    The algorithm works by enumerating the characters in the short form of the abbreviation,
    checking that they can be matched against characters in a candidate text for the long form
    in order, as well as requiring that the first letter of the abbreviated form matches the
    _beginning_ letter of a word.

    Parameters
    ----------
    long_form_candidate: Span, required.
        The spaCy span for the long form candidate of the definition.
    short_form_candidate: Span, required.
        The spaCy span for the abbreviation candidate.
    STOP_WORDS: optional collection of strings, default None.
        Tokens of the long form whose `text` is in this collection (case-sensitive
        membership test) are never used to match a character of the short form.
        When None or empty, no token is skipped.

    Returns
    -------
    A Tuple[Span, Optional[Span]], representing the short form abbreviation and the
    span corresponding to the long form expansion, or None if a match is not found.
    """
    long_form = "".join([x.text_with_ws for x in long_form_candidate])
    short_form = "".join([x.text_with_ws for x in short_form_candidate])

    # Per-character mask over long_form: True where the character belongs to a stop word
    # (including that token's trailing whitespace).
    if STOP_WORDS:
        long_form_stop = []
        for token in long_form_candidate:
            long_form_stop.extend([token.text in STOP_WORDS] * len(token.text_with_ws))
    else:
        # No stop words supplied: nothing is skipped. (An all-True mask here would make
        # every character unmatchable.)
        long_form_stop = [False] * len(long_form)

    long_index = len(long_form) - 1
    short_index = len(short_form) - 1

    while short_index >= 0:
        current_char = short_form[short_index].lower()
        # We don't check non alpha-numeric characters.
        if not current_char.isalnum():
            short_index -= 1
            continue

        # Walk backwards through long_form while we are still inside the string and ...
        while long_index >= 0 and (
            # ... the character does not match at this position ...
            long_form[long_index].lower() != current_char
            # ... or we are checking the first character of the abbreviation, which we
            # enforce to be the _starting_ character of a word ...
            or (
                short_index == 0
                and long_index > 0
                and long_form[long_index - 1].isalnum()
            )
            # ... or the character belongs to one of STOP_WORDS.
            or long_form_stop[long_index]
        ):
            long_index -= 1

        # Ran off the front of the long form without finding the character.
        if long_index < 0:
            return short_form_candidate, None

        long_index -= 1
        short_index -= 1

    # The last subtraction will either take us on to a whitespace character, or
    # off the front of the string (i.e. long_index == -1). Either way, we want to add
    # one to get back to the start character of the long form
    long_index += 1

    # Now we know the character index of the start of the character span,
    # here we just translate that to the first token beginning after that
    # value, so we can return a spaCy span instead.
    word_lengths = 0
    # If no token is found below this stays None and the whole candidate is returned.
    starting_index = None
    for i, word in enumerate(long_form_candidate):
        # text_with_ws already includes the trailing whitespace of each token
        word_lengths += len(word.text_with_ws)
        if word_lengths > long_index:
            starting_index = i
            break

    return short_form_candidate, long_form_candidate[starting_index:]


def filter_matches(
    matcher_output: List[Tuple[int, int, int]], doc: Doc
) -> List[Tuple[Span, Span]]:
    """
    Turn raw parenthesis matches into (long form candidate, short form candidate) pairs.

    Two layouts are handled:
    1. <Short Form> ( <Long Form> )
    2. <Long Form> ( <Short Form> )   [this case is most common]

    Parameters
    ----------
    matcher_output: List[Tuple[int, int, int]], required.
        (match_id, start, end) triples where start/end are token indices of the
        text *inside* the parentheses (brackets already stripped by the caller).
    doc: Doc, required.
        The document the token indices refer to.

    Returns
    -------
    A list of (long_form_candidate, short_form_candidate) tuples for every match
    whose short form passes `short_form_filter`.
    """
    candidates = []
    for match in matcher_output:
        start = match[1]
        end = match[2]
        # Ignore spans with more than 8 words in them, and spans at the start of the doc.
        # `start <= 1` (rather than `start == 1`) also rejects start == 0: in that case
        # `start - 1` is negative and the slices below would wrap around to the end
        # of the document instead of selecting the tokens before the span.
        if end - start > 8 or start <= 1:
            continue
        if end - start > 3:
            # Long form is inside the parens.
            # Take one word before (start - 1 is the opening bracket).
            short_form_candidate = doc[start - 2 : start - 1]
            long_form_candidate = doc[start:end]
        else:
            # Normal case.
            # Short form is inside the parens.
            short_form_candidate = doc[start:end]

            # Sum character lengths of contents of parens.
            abbreviation_length = sum(len(x) for x in short_form_candidate)
            max_words = min(abbreviation_length + 5, abbreviation_length * 2)
            # Look up to max_words backwards, stopping before the opening bracket.
            long_form_candidate = doc[max(start - max_words - 1, 0) : start - 1]
        # add candidate to candidates if candidates pass filters
        if short_form_filter(short_form_candidate):
            candidates.append((long_form_candidate, short_form_candidate))

    return candidates


def short_form_filter(span: Span) -> bool:
    """
    Decide whether a span looks like a plausible abbreviation (short form).

    Returns True only when the span is non-empty, every token has between 2 and 9
    characters, at least half of the characters of the span text are alphabetic,
    and the first character of the span text is alphabetic.
    """
    text = span.text
    # An empty span cannot be an abbreviation; bail out before dividing by len(text).
    if not text:
        return False

    # All words have a length of at least 2 and strictly less than 10
    if not all(2 <= len(token) < 10 for token in span):
        return False

    # At least 50% of the short form should be alpha
    if sum(c.isalpha() for c in text) / len(text) < 0.5:
        return False

    # The first character of the short form should be alpha
    return text[0].isalpha()

@Language.factory('AbbreviationDetector')
class AbbreviationDetector:
    """
    Detects abbreviations using the algorithm in "A simple algorithm for identifying
    abbreviation definitions in biomedical text.", (Schwartz & Hearst, 2003).

    This class sets the `._.abbreviations` attribute on spaCy Doc.

    The abbreviations attribute is a `List[Span]` where each Span has the `Span._.long_form`
    attribute set to the long form definition of the abbreviation.

    Note that this class does not replace the spans, or merge them.
    """

    def __init__(self, nlp, name) -> None:  # add name on init as instance name
        """
        Register the custom extensions and build the matchers.

        Parameters
        ----------
        nlp: the pipeline object; only `nlp.vocab` is used here.
        name: the component instance name, stored on `self.name`.
        """
        # force=True lets the extensions be re-registered when the cell is re-run.
        Doc.set_extension("abbreviations", default=[], force=True)
        Span.set_extension("long_form", default=None, force=True)

        # Matches "(" + one or more tokens + ")".
        self.matcher = Matcher(nlp.vocab)
        self.matcher.add(
            "parenthesis", [[{"ORTH": "("}, {"OP": "+"}, {"ORTH": ")"}]]
        )  # remove 'None' for on_match (now optional), pass pattern as list of list
        # Holds per-call rules used to find further occurrences of each short form.
        self.global_matcher = Matcher(nlp.vocab)
        self.name = name

    def find(self, span: Span, doc: Doc) -> Tuple[Span, Set[Span]]:
        """
        Functional version of calling the matcher for a single span.
        This method is helpful if you already have an abbreviation which
        you want to find a definition for.

        Returns the first (long form, set of short form spans) pair found, or
        `(span, set())` when no definition is found.
        """
        dummy_matches = [(-1, int(span.start), int(span.end))]
        filtered = filter_matches(dummy_matches, doc)
        abbreviations = self.find_matches_for(filtered, doc)

        if not abbreviations:
            return span, set()
        else:
            return abbreviations[0]

    def __call__(self, doc: Doc) -> Doc:
        """Annotate `doc._.abbreviations` and each short form's `_.long_form`; return doc."""
        matches = self.matcher(doc)
        # Drop the opening and closing bracket tokens from every match.
        matches_no_brackets = [(x[0], x[1] + 1, x[2] - 1) for x in matches]
        filtered = filter_matches(matches_no_brackets, doc)
        occurences = self.find_matches_for(filtered, doc)

        for (long_form, short_forms) in occurences:
            for short in short_forms:
                short._.long_form = long_form
                doc._.abbreviations.append(short)
        return doc

    def find_matches_for(
        self, filtered: List[Tuple[Span, Span]], doc: Doc
    ) -> List[Tuple[Span, Set[Span]]]:
        """
        Resolve candidate pairs into definitions and collect every occurrence of
        each defined short form in `doc`.

        Parameters
        ----------
        filtered: (long_form_candidate, short_form_candidate) pairs from `filter_matches`.
        doc: the document to search for further occurrences of each short form.

        Returns
        -------
        A list of (long form span, set of short form spans) tuples.
        """
        rules = {}
        all_occurences: Dict[Span, Set[Span]] = defaultdict(set)
        already_seen_long: Set[str] = set()
        already_seen_short: Set[str] = set()
        # Keys added to the shared matcher during this call, so they can always be removed.
        added_keys: List[str] = []
        try:
            for (long_candidate, short_candidate) in filtered:
                # The module-level STOP_WORDS collection is passed so stop words in the
                # long form are not used to match abbreviation characters.
                short, long = find_abbreviation(long_candidate, short_candidate, STOP_WORDS)
                # We need the long and short form definitions to be unique, because we need
                # to store them so we can look them up later. This is a bit of a
                # pathological case also, as it would mean an abbreviation had been
                # defined twice in a document. There's not much we can do about this,
                # but at least the case which is discarded will be picked up below by
                # the global matcher. So it's likely that things will work out ok most of the time.
                if not long:
                    continue
                # update for spacy 3.0: all references to .string are now .text
                if long.text in already_seen_long or short.text in already_seen_short:
                    continue
                already_seen_long.add(long.text)
                already_seen_short.add(short.text)
                all_occurences[long].add(short)
                rules[long.text] = long
                # Add a rule to a matcher to find exactly this substring.
                self.global_matcher.add(
                    long.text, [[{"ORTH": x.text} for x in short]]
                )  # remove 'None' for on_match (now optional), pass pattern as list of list
                added_keys.append(long.text)

            for match, start, end in self.global_matcher(doc):
                string_key = self.global_matcher.vocab.strings[match]
                all_occurences[rules[string_key]].add(doc[start:end])
        finally:
            # Clean up the global matcher even if something above raised, so rules from
            # this call cannot leak into the next one (where `rules` would not know them).
            for key in added_keys:
                self.global_matcher.remove(key)

        return list(all_occurences.items())