In [14]:
from tika import parser
import re
import os
import spacy
from spacy.pipeline import EntityRuler
from collections import OrderedDict
from spacy.lang.en.stop_words import STOP_WORDS as STOP_WORDS_EN
from spacy.lang.es.stop_words import STOP_WORDS as STOP_WORDS_ES
from spacy import displacy

STOP_WORDS = STOP_WORDS_EN | STOP_WORDS_ES

In [278]:
def read_text(f):
    """converts pdf or txt file to string"""
    _, file_extension = os.path.splitext(f)
    if file_extension == '.txt':
        with open(f, encoding="utf8") as reader:
            text = reader.read()
    elif file_extension == '.pdf':
        raw = parser.from_file(f)
        text = raw['content']
        if not text:
            print('No text found')
    else:
        print("Incorrect file extension")
    
#     # remove new lines  # NOT NEEDED WITH SPLIT/JOIN BELOW
#     text = re.sub('\n\n+', '\n\n', text)
#     text = re.sub('\n+\s\n+', '\n\n', text)  
    # drop punctuation except periods, apostrophes, and hyphens
    # (note some files use different characters than ASCII apostrophes)
    text = re.sub(r'[()/:"]', " ", text)
    # drop bullets (requires utf8 encoding)
    text = text.replace('\u2022', '')
    text = text.replace('\u2219', '')
    text = text.replace('\ufeff?', '')
    # drop periods when used in TOC, etc (i.e., multiple periods except elipsis)
    text = re.sub(r'\.\.\.\.+', '', text)  # four or more dots
    # replace elipses so it's not confused by spacy's sentence parser
    text = re.sub(r'\.\.\.', '---', text)  # three dots
    text = re.sub(r'\.\s\.\s\.\s', '---', text)  # three dots with spaces
    text = text.replace('\u2026', '---')  # elipse character
    # replace two dots
    text = re.sub(r'\.\.', '', text)  # two dots
    
    # remove excess spaces
    text = ' '.join(text.split())
    
    return text[:1000000]  # spacy nlp character limit is 1000000

In [279]:
report_folder = r'F:\data\essc-knowledge-base\reports_eng'

In [311]:
report = '04-14-Minnick-723_paper.pdf'

In [312]:
text = read_text(os.path.join(report_folder, report))
text

'Formalization of Artisanal and Small-Scale Gold Mining in Colombia FORMALIZATION OF ARTISANAL AND SMALL-SCALE GOLD MINING IN COLOMBIA A PROPOSAL FOR IMPROVING ENVIRONMENTAL, SOCIAL, AND ECONOMIC PERFORMANCE IN A POST-CONFLICT SCENARIO GREGORY MINNICK, PETER DOYLE, THOMAS HENTSCHEL Chemonics Inc., Projekt-Consult GmbH gminnick@chemonics.com Paper prepared for presentation at the “2020 WORLD BANK CONFERENCE ON LAND AND POVERTY” The World Bank - Washington DC, March 16-20, 2020 Copyright 2020 by author s . All rights reserved. Readers may make verbatim copies of this document for non-commercial purposes by any means, provided that this copyright notice appears on all such copies. Abstract Artisanal and small-scale gold mining ASGM is an important economic activity in many rural areas, with a long tradition in Colombia. High rates of informality undermine governance, security, and legal economies, and cause significant environmental and social impacts. As the government of Colombia consol

In [313]:
nlp = spacy.load("en_core_web_sm")
# ruler = EntityRuler(nlp, overwrite_ents=True).from_disk('data/inputs/entity-patterns.jsonl')
nlp.add_pipe("entity_ruler")

<spacy.pipeline.entityruler.EntityRuler at 0x96d7aa7380>

In [314]:
doc = nlp(text)

In [315]:
# reduce document to true sentences (subject, verb)
sents = []
for sent in doc.sents:
    POS = [token.pos_ for token in sent]
    if 'VERB' in POS and ('PROPN' in POS or 'NOUN' in POS):
        sents.append(sent)
print('\n\n'.join([sent.text for sent in sents]))

Formalization of Artisanal and Small-Scale Gold Mining in Colombia FORMALIZATION OF ARTISANAL AND SMALL-SCALE GOLD MINING IN COLOMBIA A PROPOSAL FOR IMPROVING ENVIRONMENTAL, SOCIAL, AND ECONOMIC PERFORMANCE IN A POST-CONFLICT SCENARIO GREGORY MINNICK, PETER DOYLE, THOMAS HENTSCHEL Chemonics Inc., Projekt-Consult GmbH

gminnick@chemonics.com Paper prepared for presentation at the “2020 WORLD BANK CONFERENCE ON LAND AND POVERTY” The World Bank - Washington DC, March 16-20, 2020

All rights reserved.

Readers may make verbatim copies of this document for non-commercial purposes by any means, provided that this copyright notice appears on all such copies.

High rates of informality undermine governance, security, and legal economies, and cause significant environmental and social impacts.

As the government of Colombia consolidates post-conflict conditions, the United States and Switzerland are providing assistance through the “Oro Legal” Activity of the United States Agency for Internatio

In [316]:
# No changes, just pass the joined sentence list instead of the full text

from heapq import nlargest
from string import punctuation

nlp.max_length = 1000000

def summarize(raw_docx):    
    raw_text = raw_docx
    docx = nlp(raw_text)
    stopwords = list(STOP_WORDS)
    # Build Word Frequency
    # word.text is tokenization in spacy
    word_frequencies = {}  
    for word in docx:  
        if word.text not in stopwords:
            if word.text not in word_frequencies.keys():
                word_frequencies[word.text] = 1
            else:
                word_frequencies[word.text] += 1

    maximum_frequncy = max(word_frequencies.values())

    for word in word_frequencies.keys():  
        word_frequencies[word] = (word_frequencies[word]/maximum_frequncy)
    
    # Sentence Tokens
    sentence_list = [ sentence for sentence in docx.sents ]

    # Calculate Sentence Score and Ranking
    sentence_scores = {}  
    for sent in sentence_list:  
        for word in sent:
            if word.text.lower() in word_frequencies.keys():
                if len(sent.text.split(' ')) < 30:
                    if sent not in sentence_scores.keys():
                        sentence_scores[sent] = word_frequencies[word.text.lower()]
                    else:
                        sentence_scores[sent] += word_frequencies[word.text.lower()]

    # Find N Largest
    summary_sentences = nlargest(7, sentence_scores, key=sentence_scores.get)
    final_sentences = [ w.text for w in summary_sentences ]
    summary = ' '.join(final_sentences)
    
    return summary

In [317]:
summarize(' '.join([sent.text for sent in sents]))

'Like cocaine, illegal mining poses a litany of security, governance, social, and environmental challenges DuPée, M. C., 2018 . National mining policies, laws, regulations, and norms across several ministries, agencies, and levels of government create a legal framework that is ill-adapted for ASGM. M. C. “Already a Scourge, Illegal Gold Mining in Colombia Is Getting Worse,” World Political Review, July 27, 2018. Theoretically straightforward, utilization of route hampered by overlapping claims, incomplete mining and cadastral records, red tape, and limited existence of unclaimed areas. As in many countries, once legality can be determined, the formalization process in Colombia is wrought with onerous, excessively costly requirements, and lengthy review and approval processes. Technical expertise, budget resources, and political capital are concentrated in national entities while regional and local capacity, resources, and political will vary widely. In Colombia, BGI for ASM is working 

In [321]:
import numpy as np
class TextRank4Keyword():
    """Extract keywords from text"""
    
    def __init__(self):
        self.d = 0.65 # damping coefficient, usually is .85
        self.min_diff = 1e-5 # convergence threshold
        self.steps = 10 # iteration steps
        self.node_weight = None # save keywords and its weight

    
    def set_stopwords(self, stopwords):  
        """Set stop words"""
        for word in STOP_WORDS.union(set(stopwords)):
            lexeme = nlp.vocab[word]
            lexeme.is_stop = True
    
    def sentence_segment(self, doc, not_entity_types, lower=False):
        """Store those words only in cadidate_pos"""
        sentences = []
        for sent in doc.sents:
            selected_words = []
            for entity in sent.ents:  # entities instead of words
                # Store words only with cadidate POS tag
                if entity.label_ not in not_entity_types \
                    and entity.text not in STOP_WORDS:
                    # use entity id if present
                    if entity.ent_id_:  # isinstance(entity.ent_id, str):
                        tag = entity.ent_id_
                    else:
                        tag = entity.text
                    if lower is True:
                        selected_words.append(tag.lower())
                    else:
                        selected_words.append(tag)
                    if entity.label_=="ALT":
                        pass
            sentences.append(selected_words)
        return sentences
        
    def get_vocab(self, sentences):
        """Get all tokens"""
        vocab = OrderedDict()
        i = 0
        for sentence in sentences:
            for word in sentence:
                if word not in vocab:
                    vocab[word] = i
                    i += 1
        return vocab
    
    def get_token_pairs(self, window_size, sentences):
        """Build token_pairs from windows in sentences"""
        token_pairs = list()
        for sentence in sentences:
            for i, word in enumerate(sentence):
                for j in range(i+1, i+window_size):
                    if j >= len(sentence):
                        break
                    pair = (word, sentence[j])
                    if pair not in token_pairs:
                        token_pairs.append(pair)
        return token_pairs
        
    def symmetrize(self, a):
        return a + a.T - np.diag(a.diagonal())
    
    def get_matrix(self, vocab, token_pairs):
        """Get normalized matrix"""
        # Build matrix
        vocab_size = len(vocab)
        g = np.zeros((vocab_size, vocab_size), dtype='float')
        for word1, word2 in token_pairs:
            i, j = vocab[word1], vocab[word2]
            g[i][j] = 1
            
        # Get Symmetric matrix
        g = self.symmetrize(g)
        
        # Normalize matrix by column
        norm = np.sum(g, axis=0)
        g_norm = np.divide(g, norm, where=norm!=0) # this is ignore the 0 element in norm
        
        return g_norm

    
    def get_keywords(self, number=10):
        """Print top number keywords"""
        keywords = {}
        node_weight = OrderedDict(sorted(self.node_weight.items(), 
                                         key=lambda t: t[1], reverse=True))
        keywords = {k: node_weight[k] for k in list(node_weight)[:number]}
        return keywords
        # for i, (key, value) in enumerate(node_weight.items()):
        #     # keywords.append(f'{i+1}) {key} - {str(value)}')
        
        
    def analyze(self, text, 
                not_entity_types=['DATE', 'TIME', 'PERCENT',
                                   'MONEY', 'QUANTITY', 'ORDINAL',
                                   'CARDINAL', 'PERSON', 'GPE'], 
                window_size=4, lower=False, stopwords=set(STOP_WORDS)):
        """Main function to analyze text"""
        
        # Set stop words
        self.set_stopwords(stopwords)
        
        # Pare text by spaCy
        doc = nlp(text)
        
        # Filter sentences
        sentences = self.sentence_segment(doc, not_entity_types, lower) # list of list of words
        
        # Build vocabulary
        vocab = self.get_vocab(sentences)
        
        # Get token_pairs from windows
        token_pairs = self.get_token_pairs(window_size, sentences)
        
        # Get normalized matrix
        g = self.get_matrix(vocab, token_pairs)
        
        # Initialization for weight(pagerank value)
        pr = np.array([1] * len(vocab))
        
        # Iteration
        previous_pr = 0
        for epoch in range(self.steps):
            pr = (1-self.d) + self.d * np.dot(g, pr)
            if abs(previous_pr - sum(pr))  < self.min_diff:
                break
            else:
                previous_pr = sum(pr)

        # Get weight for each node
        node_weight = dict()
        for word, index in vocab.items():
            node_weight[word] = pr[index]
        
        self.node_weight = node_weight

In [322]:
tr4w = TextRank4Keyword()
tr4w.analyze(' '.join([sent.text for sent in sents]), window_size=8, lower=False)
keywords = tr4w.get_keywords(30)
keywords

{'ASGM': 4.6475800588696545,
 'ASM': 2.5818163303684747,
 'Antioquia': 2.2686434493943595,
 'BGI': 1.945806178563064,
 'MME': 1.8210616743885546,
 'Swiss': 1.806235067494491,
 'NUFP': 1.4763070523782211,
 'Mercury': 1.4557283067746432,
 'International Development': 1.3696903536079184,
 'Colombian': 1.2407858828825313,
 'Better Gold': 1.2282100567652776,
 'ANM': 1.2204689475512156,
 'Oro Legal': 1.1571966632861512,
 'ELN': 1.1401448371947778,
 'the National Development Plan 2018 – 2022': 1.0604040986309788,
 'the Minamata Convention': 1.0103579364042725,
 'Chemonics Inc.': 1.0,
 'Projekt-Consult GmbH': 1.0,
 'the “2020 WORLD BANK CONFERENCE ON LAND AND POVERTY”': 1.0,
 'The World Bank - Washington DC': 1.0,
 'United States Agency': 1.0,
 'International Development USAID': 1.0,
 'Swiss Economic Cooperation and Development SECO': 1.0,
 'the USAID Oro Legal Activity': 1.0,
 'SECO Better Gold Initiative': 1.0,
 'UNEP': 1.0,
 'ASGM Cordy': 1.0,
 'Environmental Impact Studies': 1.0,
 'Spanish

In [325]:
tags = list(keywords.keys())
tags

['ASGM',
 'ASM',
 'Antioquia',
 'BGI',
 'MME',
 'Swiss',
 'NUFP',
 'Mercury',
 'International Development',
 'Colombian',
 'Better Gold',
 'ANM',
 'Oro Legal',
 'ELN',
 'the National Development Plan 2018 – 2022',
 'the Minamata Convention',
 'Chemonics Inc.',
 'Projekt-Consult GmbH',
 'the “2020 WORLD BANK CONFERENCE ON LAND AND POVERTY”',
 'The World Bank - Washington DC',
 'United States Agency',
 'International Development USAID',
 'Swiss Economic Cooperation and Development SECO',
 'the USAID Oro Legal Activity',
 'SECO Better Gold Initiative',
 'UNEP',
 'ASGM Cordy',
 'Environmental Impact Studies',
 'Spanish',
 'MPU']

In [365]:
import wikipedia
def search_wiki(tags):
    results = {}
    for tag in tags:
        try:
            result = wikipedia.page(tag, auto_suggest=False).title
        except wikipedia.exceptions.DisambiguationError as e:
            result = e.options
        except wikipedia.exceptions.PageError as e:
            try:
                result = wikipedia.page(tag, auto_suggest=True).title
            except wikipedia.exceptions.PageError as e:
                result = 'None'
        except:
            pass
        results[tag] = result
    return results

In [366]:
tags_long = set([ent.text for ent in doc.ents if ent.label_ not in ['CARDINAL', 'DATE', 'MONEY', 'ORDINAL', 'PERCENT', 'PERSON', 'QUANTITY', 'TIME']])

In [367]:
wiki_results = search_wiki(tags_long)
wiki_results

{'Better Gold Initiative for Artisanal and Small-Scale Mining': 'Artisanal mining',
 'Ficha Mineral – Oro, 2108': 'None',
 'State': ['State Magazine',
  'The State (newspaper)',
  'Our State',
  'States Records',
  'The State (band)',
  'State (Todd Rundgren album)',
  'States (album)',
  'Klinik',
  'The State (album)',
  'The State (1993 TV series)',
  'The State (2017 TV series)',
  'The States (TV series)',
  'State (polity)',
  'Sovereign state',
  'List of sovereign states',
  'Constituent state',
  'Federated state',
  'States and union territories of India',
  'States and territories of Australia',
  'States of Brazil',
  'States of Germany',
  'States of Mexico',
  'States of Nigeria',
  'States of South Sudan',
  'States of Sudan',
  'States of Austria',
  'States and federal territories of Malaysia',
  'States of Micronesia',
  'States and regions of Somalia',
  'States of Venezuela',
  'U.S. state',
  'Rechtsstaat',
  'Nation-state',
  'The Estates',
  'United States Depart

In [354]:
import json
import urllib

api_key = open('../secrets/google_api.txt').read()

service_url = 'https://kgsearch.googleapis.com/v1/entities:search'

def search_google(tags):
    results = {}
    for tag in tags:
        query = tag
        params = {
            'query': query,
            'limit': 1,
            'indent': True,
            'key': api_key,
        }
        url = service_url + '?' + urllib.parse.urlencode(params)
        response = json.loads(urllib.request.urlopen(url).read())
        for element in response['itemListElement']:
            if element:
                results[tag] = element['result']['name']
            else:
                results[tag] = 'None'
    return results

In [355]:
google_results = search_google(tags)
google_results

{'ASM': 'ASMR',
 'Antioquia': 'Antioquia District',
 'BGI': "Baldur's Gate III",
 'MME': 'Mitsubishi Motors Europe',
 'Swiss': 'Swiss International Air Lines',
 'Mercury': 'Mercury',
 'International Development': 'International development',
 'Colombian': 'Colombians',
 'Better Gold': 'Sido',
 'ANM': 'ANM',
 'ELN': 'Elastin',
 'the Minamata Convention': 'Third meeting of the Conference of the Parties to the Minamata Convention on Mercury',
 'Chemonics Inc.': 'Chemonics',
 'Projekt-Consult GmbH': 'Internationale Projekt Consult GmbH',
 'the “2020 WORLD BANK CONFERENCE ON LAND AND POVERTY”': 'International Monetary Fund',
 'The World Bank - Washington DC': 'World Bank',
 'United States Agency': 'United States Environmental Protection Agency',
 'International Development USAID': 'United States Agency for International Development',
 'Swiss Economic Cooperation and Development SECO': 'Tenders: Installation of 738 pieces of LED floodlight luminaires',
 'UNEP': 'United Nations Environment Pr

In [369]:
for key in google_results.keys():
    if google_results[key] == wiki_results[key]:
        print(key + " " + " | " + google_results[key])
    elif type(wiki_results[key]) == type([]) and google_results[key] in wiki_results[key]:
        print(key + " " + "in list | " + google_results[key])
    else:
        print(False)

False
Antioquia in list | Antioquia District
False
MME in list | Mitsubishi Motors Europe
Swiss in list | Swiss International Air Lines
False
International Development  | International development
Colombian in list | Colombians
False
False
False
False
Chemonics Inc.  | Chemonics
False
False
False
False
International Development USAID  | United States Agency for International Development
False
UNEP  | United Nations Environment Programme
False
False
False


In [364]:
wiki_results['Chemonics Inc.']

'Chemonics(Suggestion)'