In [120]:
# requires 3.8 for compatibility with scispacy
!python --version

Python 3.8.6


In [121]:
from tika import parser
import re
import os
import spacy
from spacy.pipeline import EntityRuler
from collections import OrderedDict
from spacy.lang.en.stop_words import STOP_WORDS as STOP_WORDS_EN
from spacy.lang.es.stop_words import STOP_WORDS as STOP_WORDS_ES
from spacy import displacy

STOP_WORDS = STOP_WORDS_EN | STOP_WORDS_ES

In [122]:
def read_text(f):
    """converts pdf or txt file to string"""
    _, file_extension = os.path.splitext(f)
    if file_extension == '.txt':
        with open(f, encoding="utf8") as reader:
            text = reader.read()
    elif file_extension == '.pdf':
        raw = parser.from_file(f)
        text = raw['content']
        if not text:
            print('No text found')
    else:
        print("Incorrect file extension")
    
#     # remove new lines  # NOT NEEDED WITH SPLIT/JOIN BELOW
#     text = re.sub('\n\n+', '\n\n', text)
#     text = re.sub('\n+\s\n+', '\n\n', text)  
    # drop punctuation except periods, apostrophes, and hyphens
    # (note some files use different characters than ASCII apostrophes)
#     text = re.sub(r'[/:"]', " ", text)  # r'[()/:"]'
    # drop bullets (requires utf8 encoding)
    text = text.replace('\u2022', '')
    text = text.replace('\u2219', '')
    text = text.replace('\ufeff?', '')
    # drop periods when used in TOC, etc (i.e., multiple periods except elipsis)
    text = re.sub(r'\.\.\.\.+', '', text)  # four or more dots
    # replace elipses so it's not confused by spacy's sentence parser
    text = re.sub(r'\.\.\.', '---', text)  # three dots
    text = re.sub(r'\.\s\.\s\.\s', '---', text)  # three dots with spaces
    text = text.replace('\u2026', '---')  # elipse character
    # replace two dots
    text = re.sub(r'\.\.', '', text)  # two dots
    
    # remove excess spaces
    text = ' '.join(text.split())
    
    return text[:1000000]  # spacy nlp character limit is 1000000

In [123]:
report_folder = r'D:\data\essc-knowledge-base\reports_eng'
report = '04-14-Minnick-723_paper.pdf'
text = read_text(os.path.join(report_folder, report))
text

'Formalization of Artisanal and Small-Scale Gold Mining in Colombia FORMALIZATION OF ARTISANAL AND SMALL-SCALE GOLD MINING IN COLOMBIA: A PROPOSAL FOR IMPROVING ENVIRONMENTAL, SOCIAL, AND ECONOMIC PERFORMANCE IN A POST-CONFLICT SCENARIO GREGORY MINNICK, PETER DOYLE, THOMAS HENTSCHEL Chemonics Inc., Projekt-Consult GmbH gminnick@chemonics.com Paper prepared for presentation at the “2020 WORLD BANK CONFERENCE ON LAND AND POVERTY” The World Bank - Washington DC, March 16-20, 2020 Copyright 2020 by author(s). All rights reserved. Readers may make verbatim copies of this document for non-commercial purposes by any means, provided that this copyright notice appears on all such copies. Abstract Artisanal and small-scale gold mining (ASGM) is an important economic activity in many rural areas, with a long tradition in Colombia. High rates of informality undermine governance, security, and legal economies, and cause significant environmental and social impacts. As the government of Colombia con

In [124]:
# edited from scispacy Abbreviations module (add STOP WORDS to find_abbreviation)

from typing import Tuple, List, Optional, Set, Dict
from collections import defaultdict
from spacy.tokens import Span, Doc
from spacy.matcher import Matcher


def find_abbreviation(
    long_form_candidate: Span, short_form_candidate: Span, STOP_WORDS: List
) -> Tuple[Span, Optional[Span]]:
    """
    Implements the abbreviation detection algorithm in "A simple algorithm
    for identifying abbreviation definitions in biomedical text.", (Schwartz & Hearst, 2003).

    The algorithm works by enumerating the characters in the short form of the abbreviation,
    checking that they can be matched against characters in a candidate text for the long form
    in order, as well as requiring that the first letter of the abbreviated form matches the
    _beginning_ letter of a word.

    Parameters
    ----------
    long_form_candidate: Span, required.
        The spaCy span for the long form candidate of the definition.
    short_form_candidate: Span, required.
        The spaCy span for the abbreviation candidate.

    Returns
    -------
    A Tuple[Span, Optional[Span]], representing the short form abbreviation and the
    span corresponding to the long form expansion, or None if a match is not found.
    """
    long_form = "".join([x.text_with_ws for x in long_form_candidate])  # Abstract Artisanal and small - scale gold mining
    short_form = "".join([x.text_with_ws for x in short_form_candidate])  # ASGM
    
    # create helper list for skipping stop words
    if STOP_WORDS:
        long_form_stop = [[True]*len(token.text_with_ws) 
                          if 
                          token.text in STOP_WORDS
                          else 
                          [False]*len(token.text_with_ws)
                         for token in long_form_candidate]
        # flatten nested list
        long_form_stop = [item for sublist in long_form_stop for item in sublist]
    else:
        long_form_stop = [True]*len(long_form)
    
    long_index = len(long_form) - 1
    short_index = len(short_form) - 1

    while short_index >= 0:
        current_char = short_form[short_index].lower()  # m
        # Walk backwards through long_form to find current_char
        # We don't check non alpha-numeric characters.
        if not current_char.isalnum():
            short_index -= 1
            continue

            # Does the character match at this position? ...
        while (
            (long_index >= 0 and long_form[long_index].lower() != current_char)
            or
            # .... or if we are checking the first character of the abbreviation, we enforce
            # to be the _starting_ character of a span.
            (
                short_index == 0
                and long_index > 0
                and long_form[long_index - 1].isalnum()
            )
            or
            # ... or word is one of STOP_WORDS
            (
                long_form_stop[long_index]
            )
        ):
            long_index -= 1  # decrement after miss until long_index = 0

        if long_index < 0:
            return short_form_candidate, None

        long_index -= 1
        short_index -= 1

    # The last subtraction will either take us on to a whitespace character, or
    # off the front of the string (i.e. long_index == -1). Either way, we want to add
    # one to get back to the start character of the long form
    long_index += 1

    # Now we know the character index of the start of the character span,
    # here we just translate that to the first token beginning after that
    # value, so we can return a spaCy span instead.
    word_lengths = 0
    starting_index = None
    for i, word in enumerate(long_form_candidate):
        # need to add 1 for the space characters
        word_lengths += len(word.text_with_ws)
        if word_lengths > long_index:
            starting_index = i
            break

    return short_form_candidate, long_form_candidate[starting_index:]


def filter_matches(
    matcher_output: List[Tuple[int, int, int]], doc: Doc
) -> List[Tuple[Span, Span]]:
    # Filter into two cases:
    # 1. <Short Form> ( <Long Form> )
    # 2. <Long Form> (<Short Form>) [this case is most common].
    candidates = []
    for match in matcher_output:
        start = match[1]
        end = match[2]
        # Ignore spans with more than 8 words in them, and spans at the start of the doc
        if end - start > 8 or start == 1:
            continue
        if end - start > 3:
            # Long form is inside the parens.
            # Take one word before.
            short_form_candidate = doc[start - 2 : start - 1]
            long_form_candidate = doc[start:end]
        else:
            # Normal case.
            # Short form is inside the parens.
            short_form_candidate = doc[start:end]

            # Sum character lengths of contents of parens.
            abbreviation_length = sum([len(x) for x in short_form_candidate])
            max_words = min(abbreviation_length + 5, abbreviation_length * 2)
            # Look up to max_words backwards
            long_form_candidate = doc[max(start - max_words - 1, 0) : start - 1]
        # add candidate to candidates if candidates pass filters
        if short_form_filter(short_form_candidate):
            candidates.append((long_form_candidate, short_form_candidate))

    return candidates


def short_form_filter(span: Span) -> bool:
    # All words are between length 2 and 10
    if not all([2 <= len(x) < 10 for x in span]):
        return False

    # At least 50% of the short form should be alpha
    if (sum([c.isalpha() for c in span.text]) / len(span.text)) < 0.5:
        return False

    # The first character of the short form should be alpha
    if not span.text[0].isalpha():
        return False
    return True


class AbbreviationDetector:
    """
    Detects abbreviations using the algorithm in "A simple algorithm for identifying
    abbreviation definitions in biomedical text.", (Schwartz & Hearst, 2003).

    This class sets the `._.abbreviations` attribute on spaCy Doc.

    The abbreviations attribute is a `List[Span]` where each Span has the `Span._.long_form`
    attribute set to the long form definition of the abbreviation.

    Note that this class does not replace the spans, or merge them.
    """

    def __init__(self, nlp) -> None:
        Doc.set_extension("abbreviations", default=[], force=True)
        Span.set_extension("long_form", default=None, force=True)

        self.matcher = Matcher(nlp.vocab)
        self.matcher.add(
            "parenthesis", None, [{"ORTH": "("}, {"OP": "+"}, {"ORTH": ")"}]
        )
        self.global_matcher = Matcher(nlp.vocab)

    def find(self, span: Span, doc: Doc) -> Tuple[Span, Set[Span]]:
        """
        Functional version of calling the matcher for a single span.
        This method is helpful if you already have an abbreviation which
        you want to find a definition for.
        """
        dummy_matches = [(-1, int(span.start), int(span.end))]
        filtered = filter_matches(dummy_matches, doc)
        abbreviations = self.find_matches_for(filtered, doc)

        if not abbreviations:
            return span, set()
        else:
            return abbreviations[0]

    def __call__(self, doc: Doc) -> Doc:
        matches = self.matcher(doc)
        matches_no_brackets = [(x[0], x[1] + 1, x[2] - 1) for x in matches]
        filtered = filter_matches(matches_no_brackets, doc)
        occurences = self.find_matches_for(filtered, doc)

        for (long_form, short_forms) in occurences:
            for short in short_forms:
                short._.long_form = long_form
                doc._.abbreviations.append(short)
        return doc

    def find_matches_for(
        self, filtered: List[Tuple[Span, Span]], doc: Doc
    ) -> List[Tuple[Span, Set[Span]]]:
        rules = {}
        all_occurences: Dict[Span, Set[Span]] = defaultdict(set)
        already_seen_long: Set[str] = set()
        already_seen_short: Set[str] = set()
        for (long_candidate, short_candidate) in filtered:
            short, long = find_abbreviation(long_candidate, short_candidate, STOP_WORDS)
            # We need the long and short form definitions to be unique, because we need
            # to store them so we can look them up later. This is a bit of a
            # pathalogical case also, as it would mean an abbreviation had been
            # defined twice in a document. There's not much we can do about this,
            # but at least the case which is discarded will be picked up below by
            # the global matcher. So it's likely that things will work out ok most of the time.
            new_long = long.string not in already_seen_long if long else False
            new_short = short.string not in already_seen_short
            if long is not None and new_long and new_short:
                already_seen_long.add(long.string)
                already_seen_short.add(short.string)
                all_occurences[long].add(short)
                rules[long.string] = long
                # Add a rule to a matcher to find exactly this substring.
                self.global_matcher.add(
                    long.string, None, [{"ORTH": x.text} for x in short]
                )
        to_remove = set()
        global_matches = self.global_matcher(doc)
        for match, start, end in global_matches:
            string_key = self.global_matcher.vocab.strings[match]
            to_remove.add(string_key)
            all_occurences[rules[string_key]].add(doc[start:end])
        for key in to_remove:
            # Clean up the global matcher.
            self.global_matcher.remove(key)

        return list((k, v) for k, v in all_occurences.items())


In [125]:
nlp = spacy.load("en_core_web_sm")

abbreviation_pipe = AbbreviationDetector(nlp)
nlp.add_pipe(abbreviation_pipe, before='ner')
doc = nlp(text)

In [126]:
print("Abbreviation", "\t", "Definition")
set([(str(abrv), abrv._.long_form) for abrv in doc._.abbreviations])

Abbreviation 	 Definition


{('ANC', Agencia Nacional Minera),
 ('ASGM', Artisanal and small-scale gold mining),
 ('ASM', Artisanal and Small Miners),
 ('EIAs', Environmental Impact Studies),
 ('GOC', government of Colombia),
 ('IDEAM', Instituto de Hidrología, Meteorología y Estudios Ambientales),
 ('JAA', Jaime Arteaga & Asociados),
 ('MME', Ministry of Mines and Energy),
 ('MPUs', mining production units),
 ('NGOs', non- governmental organizations),
 ('NUFP', National Unified Formalization Plan),
 ('OECD', Organization for Economic Cooperation and Development),
 ('SBGA', Swiss Better Gold Association),
 ('SECO', Swiss Economic Cooperation and Development),
 ('UNEP', United Nations Environmental Programme),
 ('UNODC', United Nations Office for Drugs and Crime),
 ('USAID', United States Agency for International Development)}

In [127]:
print('\n\n'.join([sent.text for sent in doc.sents]))

Formalization of Artisanal and Small-Scale Gold Mining in Colombia FORMALIZATION OF ARTISANAL AND SMALL-SCALE GOLD MINING IN COLOMBIA:

A PROPOSAL FOR IMPROVING ENVIRONMENTAL, SOCIAL, AND ECONOMIC PERFORMANCE IN A POST-CONFLICT

SCENARIO GREGORY MINNICK, PETER DOYLE, THOMAS HENTSCHEL Chemonics Inc., Projekt-Consult GmbH

gminnick@chemonics.com

Paper prepared for presentation at the “2020 WORLD BANK CONFERENCE ON LAND AND POVERTY”

The World Bank - Washington DC, March 16-20, 2020

Copyright 2020 by author(s).

All rights reserved.

Readers may make verbatim copies of this document for non-commercial purposes by any means, provided that this copyright notice appears on all such copies.

Abstract Artisanal and small-scale gold mining (ASGM) is an important economic activity in many rural areas, with a long tradition in Colombia.

High rates of informality undermine governance, security, and legal economies, and cause significant environmental and social impacts.

As the government of Co

In [128]:
displacy.render([sent for sent in doc.sents][9])

In [129]:
spacy.explain('AUX')

'auxiliary'

In [130]:
# reduce document to true sentences (subject, verb)
sents = []
verb_tags = set(['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'])
noun_tags = set(['NN', 'NNP', 'NNPS', 'NS'])
for sent in doc.sents:
    POS = set([token.tag_ for token in sent])
    if (POS & verb_tags) and (POS & noun_tags):
        sents.append(sent)
print('\n\n'.join([sent.text for sent in sents]))

A PROPOSAL FOR IMPROVING ENVIRONMENTAL, SOCIAL, AND ECONOMIC PERFORMANCE IN A POST-CONFLICT

SCENARIO GREGORY MINNICK, PETER DOYLE, THOMAS HENTSCHEL Chemonics Inc., Projekt-Consult GmbH

Paper prepared for presentation at the “2020 WORLD BANK CONFERENCE ON LAND AND POVERTY”

Readers may make verbatim copies of this document for non-commercial purposes by any means, provided that this copyright notice appears on all such copies.

Abstract Artisanal and small-scale gold mining (ASGM) is an important economic activity in many rural areas, with a long tradition in Colombia.

High rates of informality undermine governance, security, and legal economies, and cause significant environmental and social impacts.

As the government of Colombia consolidates post-conflict conditions, the United States and Switzerland are providing assistance through the “Oro Legal” Activity of the United States Agency for International Development and “Better Gold Initiative for Artisanal and Small-Scale Mining,” 

In [114]:
# nlp = spacy.load("en_core_web_sm")
# ruler = EntityRuler(nlp, overwrite_ents=True).from_disk('data/inputs/entity-patterns.jsonl')
# nlp.add_pipe("entity_ruler")
# ruler = EntityRuler(nlp)
# nlp.add_pipe(ruler, before='ner')

In [131]:
test_sent = next(x for i, x in enumerate(doc.sents) if i==9)
test_sent

Abstract Artisanal and small-scale gold mining (ASGM) is an important economic activity in many rural areas, with a long tradition in Colombia.

In [34]:
# import spacy
# from scispacy.abbreviation import AbbreviationDetector

# nlp = spacy.load("en_core_web_sm")

# abbreviation_pipe = AbbreviationDetector(nlp)
# nlp.add_pipe(abbreviation_pipe, before='ner')

# def replace_acronyms(text):
#     doc = nlp(text)
#     altered_tok = [tok.text for tok in doc]
#     for abrv in doc._.abbreviations:
#         altered_tok[abrv.start] = str(abrv._.long_form)

#     return(" ".join(altered_tok))

# replace_acronyms(text)

'Formalization of Artisanal and Small - Scale Gold Mining in Colombia FORMALIZATION OF ARTISANAL AND SMALL - SCALE GOLD MINING IN COLOMBIA : A PROPOSAL FOR IMPROVING ENVIRONMENTAL , SOCIAL , AND ECONOMIC PERFORMANCE IN A POST - CONFLICT SCENARIO GREGORY MINNICK , PETER DOYLE , THOMAS HENTSCHEL Chemonics Inc. , Projekt - Consult GmbH gminnick@chemonics.com Paper prepared for presentation at the “ 2020 WORLD BANK CONFERENCE ON LAND AND POVERTY ” The World Bank - Washington DC , March 16 - 20 , 2020 Copyright 2020 by author(s ) . All rights reserved . Readers may make verbatim copies of this document for non - commercial purposes by any means , provided that this copyright notice appears on all such copies . Abstract Artisanal and small - scale gold mining ( and small-scale gold mining ) is an important economic activity in many rural areas , with a long tradition in Colombia . High rates of informality undermine governance , security , and legal economies , and cause significant environm

In [45]:
# remove stop words for acronyms
doc = nlp(text)
stop_text = ' '.join([token.text for token in doc if token.text not in STOP_WORDS])

In [46]:
stop_text

'Formalization Artisanal Small - Scale Gold Mining Colombia FORMALIZATION OF ARTISANAL AND SMALL - SCALE GOLD MINING IN COLOMBIA : A PROPOSAL FOR IMPROVING ENVIRONMENTAL , SOCIAL , AND ECONOMIC PERFORMANCE IN A POST - CONFLICT SCENARIO GREGORY MINNICK , PETER DOYLE , THOMAS HENTSCHEL Chemonics Inc. , Projekt - Consult GmbH gminnick@chemonics.com Paper prepared presentation “ 2020 WORLD BANK CONFERENCE ON LAND AND POVERTY ” The World Bank - Washington DC , March 16 - 20 , 2020 Copyright 2020 author(s ) . All rights reserved . Readers verbatim copies document non - commercial purposes means , provided copyright notice appears copies . Abstract Artisanal small - scale gold mining ( ASGM ) important economic activity rural areas , long tradition Colombia . High rates informality undermine governance , security , legal economies , cause significant environmental social impacts . As government Colombia consolidates post - conflict conditions , United States Switzerland providing assistance “

In [52]:
# Add the abbreviation pipe to the spacy pipeline.
# abbreviation_pipe = AbbreviationDetector(nlp)
# nlp.add_pipe(abbreviation_pipe)
# nlp = spacy.load("en_core_web_sm", disable=['tagger', 'parser', 'ner'])
# abbreviation_pipe = AbbreviationDetector(nlp)
# nlp.add_pipe(abbreviation_pipe)
# with nlp.select_pipes(disable=['tagger', 'parser', 'ner']):
import spacy
from scispacy.abbreviation import AbbreviationDetector
doc = nlp(stop_text)

In [50]:
print("Abbreviation", "\t", "Definition")
for abrv in doc._.abbreviations:
    print(f"{abrv} \t ({abrv.start}, {abrv.end}) {abrv._.long_form}")

Abbreviation 	 Definition
ASGM 	 (1059, 1060) Artisanal small - scale gold mining
ASGM 	 (1465, 1466) Artisanal small - scale gold mining
ASGM 	 (1844, 1845) Artisanal small - scale gold mining
ASGM 	 (5591, 5592) Artisanal small - scale gold mining
ASGM 	 (1427, 1428) Artisanal small - scale gold mining
ASGM 	 (4340, 4341) Artisanal small - scale gold mining
ASGM 	 (4294, 4295) Artisanal small - scale gold mining
ASGM 	 (4778, 4779) Artisanal small - scale gold mining
ASGM 	 (1945, 1946) Artisanal small - scale gold mining
ASGM 	 (359, 360) Artisanal small - scale gold mining
ASGM 	 (2306, 2307) Artisanal small - scale gold mining
ASGM 	 (1364, 1365) Artisanal small - scale gold mining
ASGM 	 (2131, 2132) Artisanal small - scale gold mining
ASGM 	 (3679, 3680) Artisanal small - scale gold mining
ASGM 	 (3111, 3112) Artisanal small - scale gold mining
ASGM 	 (1581, 1582) Artisanal small - scale gold mining
ASGM 	 (320, 321) Artisanal small - scale gold mining
ASGM 	 (3859, 3860) Artisa

In [132]:
#increase sentence length limit to 200, reduce sentences returned to 7, and limit total text length to 1000 chars
# pass the joined sentence list instead of the full text

from heapq import nlargest
from string import punctuation

nlp.max_length = 1000000

def summarize(raw_docx):    
    raw_text = raw_docx
    docx = nlp(raw_text)
    stopwords = list(STOP_WORDS)
    # Build Word Frequency
    # word.text is tokenization in spacy
    word_frequencies = {}  
    for word in docx:  
        if word.text not in stopwords:
            if word.text not in word_frequencies.keys():
                word_frequencies[word.text] = 1
            else:
                word_frequencies[word.text] += 1

    maximum_frequncy = max(word_frequencies.values())

    for word in word_frequencies.keys():  
        word_frequencies[word] = (word_frequencies[word]/maximum_frequncy)
    
    # Sentence Tokens
    sentence_list = [ sentence for sentence in docx.sents ]

    # Calculate Sentence Score and Ranking
    sentence_scores = {}  
    for sent in sentence_list:  
        for word in sent:
            if word.text.lower() in word_frequencies.keys():
                if len(sent.text.split(' ')) < 200:  # exclude very large sentences
                    if sent not in sentence_scores.keys():
                        sentence_scores[sent] = word_frequencies[word.text.lower()]
                    else:
                        sentence_scores[sent] += word_frequencies[word.text.lower()]

    # Find N Largest
    summary_sentences = nlargest(7, sentence_scores, key=sentence_scores.get)
    final_sentences = [ w.text for w in summary_sentences ]
    summary = ''
    for sent in final_sentences:
        if len(summary) < 1000:
            summary += sent
            summary += ' '
    
    return summary

In [133]:
summarize(' '.join([sent.text for sent in sents]))

'A series of focus groups was conducted between April and May 2018, including 13 consultations with small-scale gold miners and other sector stakeholders involved in the formalization process in five departments (Antioquia, Chocó, Sur de Bolivar, Santander, and Cauca), and five others with representatives from academia, non- governmental organizations (NGOs), large private mining companies, and government authorities linked to the sector. A single, one-stop shop or window for formalization of small-scale mining operators would be created based on: (i) aligning the requirements and review of mine operational and environmental instruments to the scale of production and on-the-ground footprint; (ii) reasonable and firm guidelines and response times for authorities to review and approve environmental and operational instruments; (iii) decentralization of several key 14 functions of the ANM to the department level; and (iv) issuance of required Mineral Commercialization Permits according to

## Alternative summarizer
https://medium.com/luisfredgs/automatic-text-summarization-made-simple-with-python-f9c3c645e34a

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
corpus = [sent.text.lower() for sent in sents]
cv = CountVectorizer(stop_words=list(STOP_WORDS))   
cv_fit=cv.fit_transform(corpus)    
word_list = cv.get_feature_names();    
count_list = cv_fit.toarray().sum(axis=0)
word_frequency = dict(zip(word_list,count_list))
val=sorted(word_frequency.values())
higher_word_frequencies = [word for word,freq in word_frequency.items() if freq in val[-3:]]
print("\nWords with higher frequencies: ", higher_word_frequencies)
# gets relative frequency of words
higher_frequency = val[-1]
for word in word_frequency.keys():  
    word_frequency[word] = (word_frequency[word]/higher_frequency)
sentence_rank={}
for sent in doc.sents:
    for word in sent :       
        if word.text.lower() in word_frequency.keys():            
            if sent in sentence_rank.keys():
                sentence_rank[sent]+=word_frequency[word.text.lower()]
            else:
                sentence_rank[sent]=word_frequency[word.text.lower()]
top_sentences=(sorted(sentence_rank.values())[::-1])
top_sent=top_sentences[:5]
summary=[]
for sent,strength in sentence_rank.items():  
    if strength in top_sent:
        summary.append(sent)
    else:
        continue
for i in summary:
    print(i,end=" ")

In [35]:
import numpy as np
class TextRank4Keyword():
    """Extract keywords from text"""
    
    def __init__(self):
        self.d = 0.65 # damping coefficient, usually is .85
        self.min_diff = 1e-5 # convergence threshold
        self.steps = 10 # iteration steps
        self.node_weight = None # save keywords and its weight

    
    def set_stopwords(self, stopwords):  
        """Set stop words"""
        for word in STOP_WORDS.union(set(stopwords)):
            lexeme = nlp.vocab[word]
            lexeme.is_stop = True
    
    def sentence_segment(self, doc, not_entity_types, lower=False):
        """Store those words only in cadidate_pos"""
        sentences = []
        for sent in doc.sents:
            selected_words = []
            for entity in sent.ents:  # entities instead of words
                # Store words only with cadidate POS tag
                if entity.label_ not in not_entity_types \
                    and entity.text not in STOP_WORDS:
                    # use entity id if present
                    if entity.ent_id_:  # isinstance(entity.ent_id, str):
                        tag = entity.ent_id_
                    else:
                        tag = entity.text
                    if lower is True:
                        selected_words.append(tag.lower())
                    else:
                        selected_words.append(tag)
                    if entity.label_=="ALT":
                        pass
            sentences.append(selected_words)
        return sentences
        
    def get_vocab(self, sentences):
        """Get all tokens"""
        vocab = OrderedDict()
        i = 0
        for sentence in sentences:
            for word in sentence:
                if word not in vocab:
                    vocab[word] = i
                    i += 1
        return vocab
    
    def get_token_pairs(self, window_size, sentences):
        """Build token_pairs from windows in sentences"""
        token_pairs = list()
        for sentence in sentences:
            for i, word in enumerate(sentence):
                for j in range(i+1, i+window_size):
                    if j >= len(sentence):
                        break
                    pair = (word, sentence[j])
                    if pair not in token_pairs:
                        token_pairs.append(pair)
        return token_pairs
        
    def symmetrize(self, a):
        return a + a.T - np.diag(a.diagonal())
    
    def get_matrix(self, vocab, token_pairs):
        """Get normalized matrix"""
        # Build matrix
        vocab_size = len(vocab)
        g = np.zeros((vocab_size, vocab_size), dtype='float')
        for word1, word2 in token_pairs:
            i, j = vocab[word1], vocab[word2]
            g[i][j] = 1
            
        # Get Symmetric matrix
        g = self.symmetrize(g)
        
        # Normalize matrix by column
        norm = np.sum(g, axis=0)
        g_norm = np.divide(g, norm, where=norm!=0) # this is ignore the 0 element in norm
        
        return g_norm

    
    def get_keywords(self, number=10):
        """Print top number keywords"""
        keywords = {}
        node_weight = OrderedDict(sorted(self.node_weight.items(), 
                                         key=lambda t: t[1], reverse=True))
        keywords = {k: node_weight[k] for k in list(node_weight)[:number]}
        return keywords
        # for i, (key, value) in enumerate(node_weight.items()):
        #     # keywords.append(f'{i+1}) {key} - {str(value)}')
        
        
    def analyze(self, text, 
                not_entity_types=['DATE', 'TIME', 'PERCENT',
                                   'MONEY', 'QUANTITY', 'ORDINAL',
                                   'CARDINAL', 'PERSON', 'GPE'], 
                window_size=4, lower=False, stopwords=set(STOP_WORDS)):
        """Main function to analyze text"""
        
        # Set stop words
        self.set_stopwords(stopwords)
        
        # Pare text by spaCy
        doc = nlp(text)
        
        # Filter sentences
        sentences = self.sentence_segment(doc, not_entity_types, lower) # list of list of words
        
        # Build vocabulary
        vocab = self.get_vocab(sentences)
        
        # Get token_pairs from windows
        token_pairs = self.get_token_pairs(window_size, sentences)
        
        # Get normalized matrix
        g = self.get_matrix(vocab, token_pairs)
        
        # Initialization for weight(pagerank value)
        pr = np.array([1] * len(vocab))
        
        # Iteration
        previous_pr = 0
        for epoch in range(self.steps):
            pr = (1-self.d) + self.d * np.dot(g, pr)
            if abs(previous_pr - sum(pr))  < self.min_diff:
                break
            else:
                previous_pr = sum(pr)

        # Get weight for each node
        node_weight = dict()
        for word, index in vocab.items():
            node_weight[word] = pr[index]
        
        self.node_weight = node_weight

In [37]:
tr4w = TextRank4Keyword()
tr4w.analyze(' '.join([sent.text for sent in sents]), window_size=8, lower=False)
keywords = tr4w.get_keywords(30)
keywords

{'ASGM': 4.758989874539294,
 'ASM': 2.456032903750662,
 'Antioquia': 2.3853144752435633,
 'BGI': 2.092193904230131,
 'MME': 2.024198880898157,
 'Swiss': 1.761098517890129,
 'ANM': 1.7441977932496,
 'International Development': 1.7300787263016413,
 'Mercury': 1.4469697226938312,
 'NUFP': 1.4386273464392496,
 'the Swiss Better Gold Association': 1.3674046419311905,
 'OECD': 1.1935845241463876,
 'Colombian': 1.1783905096269116,
 'Better Gold': 1.173477086071709,
 'ELN': 1.1202896037570513,
 'the National Development Plan 2018 – 2022': 1.0454658539142543,
 'USAID': 1.043687453219417,
 'the Minamata Convention': 1.0018052490096208,
 'Chemonics Inc.': 1.0,
 'Projekt-Consult GmbH': 1.0,
 'the “2020 WORLD BANK CONFERENCE ON LAND AND POVERTY”': 1.0,
 'The World Bank - Washington DC': 1.0,
 'the USAID Oro Legal Activity': 1.0,
 'SECO Better Gold Initiative': 1.0,
 'MPU': 1.0,
 'PTO': 1.0,
 'EIA': 1.0,
 'Gold Mining': 1.0,
 'World Political Review': 1.0,
 'Oro Legal': 0.9709413261128962}

In [39]:
tags = list(keywords.keys())
tags

['ASGM',
 'ASM',
 'Antioquia',
 'BGI',
 'MME',
 'Swiss',
 'ANM',
 'International Development',
 'Mercury',
 'NUFP',
 'the Swiss Better Gold Association',
 'OECD',
 'Colombian',
 'Better Gold',
 'ELN',
 'the National Development Plan 2018 – 2022',
 'USAID',
 'the Minamata Convention',
 'Chemonics Inc.',
 'Projekt-Consult GmbH',
 'the “2020 WORLD BANK CONFERENCE ON LAND AND POVERTY”',
 'The World Bank - Washington DC',
 'the USAID Oro Legal Activity',
 'SECO Better Gold Initiative',
 'MPU',
 'PTO',
 'EIA',
 'Gold Mining',
 'World Political Review',
 'Oro Legal']

In [53]:
import wikipedia
def search_wiki(tags):
    results = {}
    for tag in tags:
        try:
            result = wikipedia.page(tag, auto_suggest=False).title
        except wikipedia.exceptions.DisambiguationError as e:
            result = e.options
        except wikipedia.exceptions.PageError as e:
            try:
                result = wikipedia.page(tag, auto_suggest=True).title
            except wikipedia.exceptions.PageError as e:
                result = 'None'
            except wikipedia.exceptions.DisambiguationError as e:
                result = e.options
        except:
            pass
        results[tag] = result
    return results

In [54]:
tags_long = set([ent.text for ent in doc.ents if ent.label_ not in ['CARDINAL', 'DATE', 'MONEY', 'ORDINAL', 'PERCENT', 'PERSON', 'QUANTITY', 'TIME', 'LAW']])

In [55]:
tags_long

{'ANC',
 'ANM',
 'ARM',
 'ASGM',
 'ASM',
 'ASM’s',
 'AUC',
 'Actividad Minera de Oro',
 'Afro-Colombian',
 'Antioquia',
 'Antioquia ASGM',
 'Antioquia and Chocó',
 'Arteaga & Asociados',
 'Artisanal & Small-Scale Mining',
 'BGI',
 'Better Gold',
 'Better Gold Initiative',
 'Better Gold Initiative for Artisanal and Small-Scale Mining',
 'Bogota',
 'Bogotá D.C.',
 'Bolivia',
 'Chemonics Inc.',
 'Chocó',
 'Chocó Department',
 'Colombia',
 'Colombian',
 'Colombians',
 'Conflict-Affected',
 'Consulta',
 'Cordy',
 'Corte Constitucional',
 'Devolution of Claims for Formalization',
 'EIA',
 'ELN',
 'Echavarria, C., 2014',
 'El Bagre',
 'El Clan del Golfo',
 'Environmental Management',
 'Ficha Mineral – Oro',
 'Global Mercury Assessment',
 'Global Report on',
 'Gold Mining',
 'Grupos de Interés',
 'Henao',
 'Hruschka',
 'IDEAM',
 'International Development',
 'Istmina',
 'JAA',
 'K.',
 'La Guajira',
 'Law',
 'Legalization and Formalization in Colombia Formalization',
 'London',
 'MADS',
 'MME',

In [56]:
dropped_tags = [tag for tag in tags_long if tag not in tags]
dropped_tags

['Swiss Better Gold Association',
 'Regional Autonomous Corporations',
 'Small-Scale Mining',
 'Wotruba et.al',
 'ASM’s',
 'Environmental Management',
 'Meteorología',
 'El Clan del Golfo',
 'Devolution of Claims for Formalization',
 'the “Special Reserve Area',
 'the Constitutional Court',
 'MADS',
 'Task Force Titan',
 'Swiss Economic Cooperation and Development',
 'Paris',
 'Better Gold Initiative for Artisanal and Small-Scale Mining',
 'Law',
 'Actividad Minera de Oro',
 'USAID’s',
 'Special Mining Reserve Areas',
 'Swiss Better Gold Association Continuous Improvement Escalator for Artisanal and Small Miners',
 'AUC',
 'the San Juan',
 'Title',
 'La Guajira',
 'Switzerland',
 'the Colombian Ministry of Commerce',
 'Peru',
 'Post-Conflict Scenario1',
 'Universidad',
 'State',
 'the Ministry of Agriculture',
 'Swiss Agency for Development and Cooperation',
 'The Ministry of Mines and Energy',
 'Arteaga & Asociados',
 'Corte Constitucional',
 'Global Report on',
 'Better Gold Initiati

In [57]:
wiki_results = search_wiki(tags_long)
wiki_results

{'Swiss Better Gold Association': 'Bigger, Better, Faster, More!',
 'Regional Autonomous Corporations': 'None',
 'Small-Scale Mining': 'Artisanal mining',
 'Wotruba et.al': 'None',
 'the National Development Plan 2018 – 2022': 'None',
 'ASM’s': 'ASM International',
 'MPU': ['Vas-occlusive contraception',
  'MPU-401',
  'Microprocessor unit',
  'History of general-purpose CPUs',
  'Memory protection unit',
  'Minimum publishable unit',
  'Myanmar Payment Union',
  'IATA code: MPU',
  'Mișcarea Politică Unirea'],
 'World Political Review': 'Harvard Political Review',
 'Environmental Management': 'Environmental resource management',
 'Meteorología': 'Meteorology',
 'OECD': 'OECD',
 'El Clan del Golfo': 'Clan del Golfo',
 'Devolution of Claims for Formalization': 'English independence',
 'the “Special Reserve Area': 'Nature reserve',
 'the Constitutional Court': 'Constitutional court',
 'MADS': ['Mads (given name)',
  'MADS-box',
  'Metadata Authority Description Schema'],
 'Task Force Tit

In [58]:
wikipedia.page('Mozambique', auto_suggest=False).pageid

'19301'

In [59]:
import json
import urllib

api_key = open('../secrets/google_api.txt').read()

service_url = 'https://kgsearch.googleapis.com/v1/entities:search'

def search_google(tags, score=False):
    results = {}
    for tag in tags:
        query = tag
        params = {
            'query': query,
            'limit': 1,
            'indent': True,
            'key': api_key,
        }
        url = service_url + '?' + urllib.parse.urlencode(params)
        response = json.loads(urllib.request.urlopen(url).read())
        for element in response['itemListElement']:
            if element:
                if score:
                    print(tag + " | " + element['result']['name'] + ' (' + str(element['resultScore']) + ')')
                else:
                    results[tag] = element['result']['name']
            else:
                results[tag] = 'None'
    return results

In [60]:
google_results = search_google(tags_long, score=True)
# google_results

Swiss Better Gold Association | The Adventures of Tom Sawyer (0.0407535620033741)
MPU | Mpumalanga (636.1278076171875)
World Political Review | Donald Trump (0.826413631439209)
Environmental Management | Environmental management system (1782.614135742188)
Meteorología | Meteorology (1140.65380859375)
OECD | Organisation for Economic Co-operation and Development (4467.5595703125)
El Clan del Golfo | Clan del Golfo (25.76739120483398)
the “Special Reserve Area | Pride and Prejudice (1.930147886276245)
the Constitutional Court | Federal Constitutional Court (22.47382545471191)
MADS | Mads Mikkelsen (4724.3603515625)
Paris | Paris Saint-Germain F.C. (24350.201171875)
ELN | Elastin (1294.682861328125)
Law | Law (9136.5283203125)
The World Bank - Washington DC | World Bank (75.3290023803711)
AUC | Auction (3719.479736328125)
the San Juan | Theodore Roosevelt (8882.408203125)
Title | Title (2833.96240234375)
Switzerland | Switzerland (12754.037109375)
the Colombian Ministry of Commerce | Mini

In [61]:
for key in google_results.keys():
    if google_results[key] == wiki_results[key]:
        print(key + " " + " | " + google_results[key])
    elif type(wiki_results[key]) == type([]) and google_results[key] in wiki_results[key]:
        print(key + " " + "in list | " + google_results[key])
    else:
        print(key + " | False")

In [62]:
wiki_results['Chemonics Inc.']

'Chemonics'

In [63]:
# print all tags over/under specified score
from __future__ import print_function
import json
import urllib

api_key = open('../secrets/google_api.txt').read()

service_url = 'https://kgsearch.googleapis.com/v1/entities:search'


for tag in tags[:31]:
    query = tag
    params = {
        'query': query,
        'limit': 1,
        'indent': True,
        'key': api_key,
    }
    url = service_url + '?' + urllib.parse.urlencode(params)
    response = json.loads(urllib.request.urlopen(url).read())
    for element in response['itemListElement']:
        if element['resultScore']:
            print(tag + " | " + element['result']['name'] + ' (' + str(element['resultScore']) + ')')

ASM | ASMR (6137.53125)
Antioquia | Antioquia District (311.1219177246094)
BGI | Baldur's Gate III (1244.6767578125)
MME | Mitsubishi Motors Europe (316.1616821289062)
Swiss | Swiss International Air Lines (4496.4248046875)
ANM | ANM (1055.330200195312)
International Development | International development (2873.77685546875)
Mercury | Mercury (6359.38818359375)
the Swiss Better Gold Association | The Adventures of Tom Sawyer (0.8150712251663208)
OECD | Organisation for Economic Co-operation and Development (4467.5595703125)
Colombian | Colombians (689.3011474609375)
Better Gold | Sido (1.839211344718933)
ELN | Elastin (1294.682861328125)
USAID | United States Agency for International Development (244.7430877685547)
the Minamata Convention | Third meeting of the Conference of the Parties to the Minamata Convention on Mercury (13)
Chemonics Inc. | Chemonics (185.1817169189453)
Projekt-Consult GmbH | Internationale Projekt Consult GmbH (361.29833984375)
the “2020 WORLD BANK CONFERENCE ON 

In [64]:
import sys
from SPARQLWrapper import SPARQLWrapper, JSON

endpoint_url = "https://query.wikidata.org/sparql"
tag = "Mozambique"
query = f"""SELECT distinct ?item ?itemLabel ?itemDescription WHERE{{  
  ?item ?label "{tag}"@en.  
  ?article schema:about ?item .
  ?article schema:inLanguage "en" .
  ?article schema:isPartOf <https://en.wikipedia.org/>.	
  SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en". }}    
}}"""

print(query)

def get_results(endpoint_url, query):
    user_agent = "WDQS-example Python/%s.%s" % (sys.version_info[0], sys.version_info[1])
    # TODO adjust user agent; see https://w.wiki/CX6
    sparql = SPARQLWrapper(endpoint_url, agent=user_agent)
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    return sparql.query().convert()


results = get_results(endpoint_url, query)

for result in results["results"]['bindings']:
    print(result['itemDescription']['value'], result['item']['value'].split('/')[-1])

SELECT distinct ?item ?itemLabel ?itemDescription WHERE{  
  ?item ?label "Mozambique"@en.  
  ?article schema:about ?item .
  ?article schema:inLanguage "en" .
  ?article schema:isPartOf <https://en.wikipedia.org/>.	
  SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }    
}
sovereign state in Africa Q1029
style of vigorous music, developed by Pello el Afrokan (Pedro Izquierdo) in Cuba in 1963, and subsequently adapted by Eddie Palmieri in New York in the 1960s in a quite different form Q2720293
1965 film by Robert Lynn Q12126300
Wikimedia disambiguation page Q443360
song by Bob Dylan Q5793520


In [65]:
import sys
from SPARQLWrapper import SPARQLWrapper, JSON

endpoint_url = "https://query.wikidata.org/sparql"



def search_wikidata(tags, query):
    
    results_dict = {}

    def get_results(endpoint_url, tag):
        query = f"""SELECT distinct ?item ?itemLabel ?itemDescription WHERE{{  
          ?item ?label "{tag}"@en.  
          ?article schema:about ?item .
          ?article schema:inLanguage "en" .
          ?article schema:isPartOf <https://en.wikipedia.org/>.	
          SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en". }}    
        }}"""
        
        user_agent = "WDQS-example Python/%s.%s" % (sys.version_info[0], sys.version_info[1])
        # TODO adjust user agent; see https://w.wiki/CX6
        sparql = SPARQLWrapper(endpoint_url, agent=user_agent)
        sparql.setQuery(query)
        sparql.setReturnFormat(JSON)
        return sparql.query().convert()
    
    
    for tag in tags:
        results = get_results(endpoint_url, tag)
        
        results_list = []
        
        for result in results["results"]['bindings']:
            if 'itemLabel' in result.keys():
                results_list.append(result['itemLabel']['value'])                                     
            if 'itemDescription' in result.keys():
                results_list.append(result['itemDescription']['value'])
            if 'item' in result.keys():
                results_list.append(result['item']['value'].split('/')[-1])
                                     
        results_dict[tag] = results_list
    
    return results_dict

In [66]:
wdata_results = search_wikidata(tags_long, query)

In [67]:
wdata_results

{'Swiss Better Gold Association': [],
 'Regional Autonomous Corporations': [],
 'Small-Scale Mining': [],
 'Wotruba et.al': [],
 'the National Development Plan 2018 – 2022': [],
 'ASM’s': [],
 'MPU': ['MPU',
  'Wikimedia disambiguation page',
  'Q140059',
  'Memory protection unit',
  'computer hardware component for security',
  'Q55622769'],
 'World Political Review': [],
 'Environmental Management': [],
 'Meteorología': [],
 'OECD': ['Organisation for Economic Cooperation and Development',
  'international economic organisation',
  'Q41550'],
 'El Clan del Golfo': [],
 'Devolution of Claims for Formalization': [],
 'the “Special Reserve Area': [],
 'the Constitutional Court': [],
 'MADS': ['Metadata Authority Description Schema', 'XML schema', 'Q6822472'],
 'Task Force Titan': [],
 'Swiss Economic Cooperation and Development': [],
 'Paris': ['Paris',
  'town in Maine, USA',
  'Q934294',
  'Paris',
  'city in Missouri',
  'Q960025',
  'Paris',
  'city in Idaho',
  'Q984459',
  'Paris

In [68]:
print(list(tags_long)[:5])

['Swiss Better Gold Association', 'Regional Autonomous Corporations', 'Small-Scale Mining', 'Wotruba et.al', 'the National Development Plan 2018 – 2022']


In [69]:
import tagme
tagme.GCUBE_TOKEN = open('../secrets/tagme_token.txt').read()

In [70]:
annotations = tagme.annotate(' '.join(list(tags_long)[:5]))

In [71]:
for ann in annotations.get_annotations(0.1):
    print(ann)

Swiss -> Switzerland (score: 0.171812504529953)
Association -> Voluntary association (score: 0.15964217483997345)
Regional -> Regionalism (politics) (score: 0.16937711834907532)
Autonomous -> Autonomy (score: 0.14247506856918335)
Corporations -> Corporation (score: 0.1377449780702591)
Mining -> Mining (score: 0.1838894933462143)
et -> Tundra (score: 0.10005931556224823)
National Development Plan -> National Development Plan (score: 0.12883435189723969)


In [72]:
annotations.__str__()

'124msec, 11 annotations'

In [73]:
' '.join(list(tags_long)[:5])

'Swiss Better Gold Association Regional Autonomous Corporations Small-Scale Mining Wotruba et.al the National Development Plan 2018 – 2022'

In [74]:
' '.join([sent.text for sent in sents[5:10]])

'As the government of Colombia consolidates post-conflict conditions, the United States and Switzerland are providing assistance through the “Oro Legal” Activity of the United States Agency for International Development and “Better Gold Initiative for Artisanal and Small-Scale Mining,” a public-private partnership between the Swiss Better Gold Association and Swiss State Secretariat for Economic Affairs. A consultative process with a broad array of stakeholder groups yielded consensus on the principal obstacles to ASGM formalization, which create serious disincentives to entry into this process. To enable ASGM operators to transition to formality, a recommendation emerged for a National Unified Formalization Plan, which rests on six proposals that largely align with current policy and legislation. Context and problem statement Recent administrations of Colombia have highlighted the importance of legality, pointing to the mining sector as a particular area of interest for national devel

In [75]:
', '.join(list(tags_long))

'Swiss Better Gold Association, Regional Autonomous Corporations, Small-Scale Mining, Wotruba et.al, the National Development Plan 2018 – 2022, ASM’s, MPU, World Political Review, Environmental Management, Meteorología, OECD, El Clan del Golfo, Devolution of Claims for Formalization, the “Special Reserve Area, the Constitutional Court, MADS, Task Force Titan, Swiss Economic Cooperation and Development, Paris, ELN, Better Gold Initiative for Artisanal and Small-Scale Mining, Law, The World Bank - Washington DC, Actividad Minera de Oro, USAID’s, Special Mining Reserve Areas, Swiss Better Gold Association Continuous Improvement Escalator for Artisanal and Small Miners, AUC, the San Juan, Title, La Guajira, Switzerland, the Colombian Ministry of Commerce, Peru, Post-Conflict Scenario1, Universidad, State, the Ministry of Agriculture, Oro Legal, Swiss Agency for Development and Cooperation, The Ministry of Mines and Energy, Arteaga & Asociados, Corte Constitucional, Global Report on, Better

In [76]:
!python --version

Python 3.8.6


In [81]:
# Get relatedness between a pair of entities specified by title.
rels = tagme.relatedness_title(("Chemonics", "United states agency for international development"))
print ("Obama and italy have a semantic relation of", rels.relatedness[0].rel)

Obama and italy have a semantic relation of None


In [None]:
# Get relatedness between a pair of entities specified by Wikipedia ID.
rels = tagme.relatedness_wid((31717, 534366))
print ("IDs 31717 and 534366 have a semantic relation of ", rels.relatedness[0].rel)

# Get relatedness between three pairs of entities specified by title.
# The last entity does not exist, hence the value for that pair will be None.
rels = tagme.relatedness_title([("Barack_Obama", "Italy"),
                                ("Italy", "Germany"),
                                ("Italy", "BAD ENTITY NAME")])
for rel in rels.relatedness:
    print (rel)

# You can also build a dictionary
rels_dict = dict(rels)
print (rels_dict[("Barack Obama", "Italy")])