In [2]:
import os
import re

import yake
from rake_nltk import Rake
from tika import parser

In [3]:
def read_text(f, max_chars=1000000):
    """Read a .txt or .pdf file and return its text cleaned up as one string.

    Parameters
    ----------
    f : str
        Path to the file. The extension (matched case-insensitively) selects
        the reader: '.txt' is read as UTF-8, '.pdf' is parsed with tika.
    max_chars : int or None, optional
        Maximum length of the returned string. The default of 1000000 is the
        spaCy nlp character limit noted by the original author. Pass None to
        disable truncation.

    Returns
    -------
    str
        The cleaned text with all whitespace runs collapsed to single spaces.
        An empty string is returned (after printing a message) when the
        extension is unsupported or the PDF yields no text.
    """
    _, file_extension = os.path.splitext(f)
    file_extension = file_extension.lower()  # accept '.PDF', '.Txt', ...

    if file_extension == '.txt':
        with open(f, encoding="utf8") as reader:
            text = reader.read()
    elif file_extension == '.pdf':
        raw = parser.from_file(f)
        text = raw['content']
        if not text:
            # The parser returned no content; bail out here instead of
            # passing None to the regex calls below.
            print('No text found')
            return ''
    else:
        # Previously execution continued with `text` unassigned and crashed.
        print("Incorrect file extension")
        return ''

    # drop punctuation except periods, apostrophes, and hyphens
    # (note some files use different characters than ASCII apostrophes)
    text = re.sub(r'[()/:"]', " ", text)
    # drop bullets (requires utf8 encoding)
    text = text.replace('\u2022', '')
    text = text.replace('\u2219', '')
    # drop byte-order marks: first the literal BOM + '?' sequence the original
    # code targeted, then any bare BOM (which the literal match missed)
    text = text.replace('\ufeff?', '')
    text = text.replace('\ufeff', '')
    # drop periods when used in TOC, etc (i.e., multiple periods except ellipsis)
    text = re.sub(r'\.\.\.\.+', '', text)  # four or more dots
    # replace ellipses so they are not confused by spacy's sentence parser
    text = re.sub(r'\.\.\.', '---', text)  # three dots
    text = re.sub(r'\.\s\.\s\.\s', '---', text)  # three dots with spaces
    text = text.replace('\u2026', '---')  # ellipsis character
    # remove any remaining pair of dots
    text = re.sub(r'\.\.', '', text)  # two dots

    # remove excess whitespace (newlines included)
    text = ' '.join(text.split())

    return text[:max_chars]

In [4]:
# Folder that holds the report files to analyse.
# NOTE(review): absolute, machine-specific Windows path — adjust before running elsewhere.
report_folder = r'D:\data\essc-knowledge-base\reports_eng'

In [5]:
# File name (inside report_folder) of the report that is read into `text` below.
report = '04-14-Minnick-723_paper.pdf'

In [6]:
# Load the selected report and clean it into a single string via read_text.
text = read_text(os.path.join(report_folder, report))

In [10]:
# Extractor built with the library defaults (not used further in this cell).
kw_extractor = yake.KeywordExtractor()

# Settings for the tuned extractor.
language, deduplication_threshold, numOfKeywords = "en", 0.9, 30
custom_kw_extractor = yake.KeywordExtractor(
    lan=language,
    dedupLim=deduplication_threshold,
    top=numOfKeywords,
    features=None,
)

# Run YAKE on the report text; each result prints as a (keyword, score) tuple.
keywords = custom_kw_extractor.extract_keywords(text)
for kw in keywords:
    print(kw)

('scenario gregory minnick', 0.0004760500127513386)
('gold', 0.001556957971529783)
('antioquia asgm operators', 0.005657256687719865)
('small-scale', 0.013614461744106172)
('bgi', 0.02040323062924821)
('key', 0.0706393236716569)
('eia', 0.10229885591651727)
('mpu', 0.11078386897942487)
('due', 0.13961465996933056)
('projekt-consult gmbh', 0.23282441769080334)
('16-20', 0.43457390939012097)
('n’t', 0.4353449597600033)
('iván', 0.5868716588242664)
('paz', 0.6046274344464135)
('flow', 0.8630308175829986)


In [11]:
# RAKE keyword extraction on the same report text, for comparison with the
# YAKE results. `Rake` is imported in the notebook's import cell so that all
# dependencies are declared in one place.
# NOTE(review): Rake() is built with its defaults; the captured output is
# dominated by Spanish phrases, so the default stopword language may not suit
# this mixed-language report — confirm against the rake-nltk documentation.
rake_nltk_var = Rake()
rake_nltk_var.extract_keywords_from_text(text)
# Phrases are returned as a ranked list of plain strings (no scores).
keyword_extracted = rake_nltk_var.get_ranked_phrases()
print(keyword_extracted)

['estudio de la cadena del mercurio en colombia con énfasis en la actividad minera de oro', 'consulta con grupos de interés para una propuesta', 'plan nacional de desarrollo 2018 – 2022 pacto por colombia', 'government identifies 29 “ mercury hot spots ” located', 'la política de formalización minera de oro', '7 “ neutral stakeholder ,” neither expanding', 'small miners asm source swiss better gold association figure 2', 'expand asgm formalization national unified formalization plan nufp', 'national unified formalization plan nufp whose objective', 'mostly female subsistence “ barequeros ” benefit', 'swiss better gold association continuous improvement escalator', '“ oro legal ” activity supports asgm operators', '11 mining formalization projects comprising 135 mpus', 'asociación de agromineros del cauca alone', 'references agencia nacional minera anc', 'vulnerable populations hentschel et al ., 2002', 'censo minero departamental 2010 – 2011', 'thomas hentschel chemonics inc ., projekt