### This notebook searches a folder of PDF files for keywords and returns the sentence in which each keyword occurs
#### ***implemented using spacy***

In [1]:
import pandas as pd
import numpy as np
import os
import io

from pdfminer.converter import TextConverter
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfpage import PDFPage

import spacy

#### Load the search keywords and the PDF documents as lists

In [3]:
# Keywords to look for in the documents. They are written here in their usual
# spelling and stored lower-cased.
searchList = [
    keyword.lower()
    for keyword in (
        'FinTech', 'TechFin', 'digital', 'technology', 'Blockchain',
        'distributed', 'Bitcoin', 'ICO', 'cryptocurrency', 'mobile', 'online',
        'cyber', 'InsurTech', 'RegTech', 'micro',
    )
]

In [5]:
# Folder holding the PDF files to search.
# NOTE(review): machine-specific absolute path. Because this is a raw string,
# each `\\` below is two literal backslashes; the literal is deliberately left
# unchanged here - consider moving it to a configurable, relative location.
input_folder = r'U:\\My Documents\\Python\\Text Mining\\_archive\PDF Extraction\files'

# Keep only entries with a .pdf extension (anything else would be handed to the
# PDF parser further down) and sort them, because os.listdir returns entries in
# arbitrary order and the row order of the final output follows this list.
file_name = sorted(x for x in os.listdir(input_folder) if x.lower().endswith('.pdf'))
file_path = [os.path.join(input_folder, x) for x in file_name]

def extract_text_from_pdf(pdf_path):
    """Extract the text of every page of a PDF file as a single string.

    Parameters
    ----------
    pdf_path : str
        Path of the PDF file; it is opened in binary mode.

    Returns
    -------
    str or None
        The text written by the converter for all pages, or ``None`` when
        no text at all was produced.

    Notes
    -----
    Exceptions raised while opening or parsing the file propagate to the
    caller; the converter and the in-memory buffer are closed in every case.
    """
    resource_manager = PDFResourceManager()
    fake_file_handle = io.StringIO()
    converter = TextConverter(resource_manager, fake_file_handle)

    try:
        page_interpreter = PDFPageInterpreter(resource_manager, converter)

        with open(pdf_path, 'rb') as fh:
            for page in PDFPage.get_pages(fh,
                                          caching=True,
                                          check_extractable=True):
                page_interpreter.process_page(page)

        # Read the buffer before it is closed in the finally clause.
        text = fake_file_handle.getvalue()
    finally:
        # Close open handles even when opening or parsing the PDF failed.
        converter.close()
        fake_file_handle.close()

    # An empty extraction is reported as None, as before.
    return text or None

if __name__ == '__main__':
    # One extraction result per entry of file_path, in the same order.
    doc_list = [extract_text_from_pdf(pdf_path) for pdf_path in file_path]
    print(len(doc_list))

27


In [6]:
# The spaCy model package has to be installed beforehand.
# `python -m spacy download <model>` did not work on the server; download the
# model package manually and install it with pip instead.

# Load the medium English model without the tagger; the parser and the entity
# recogniser stay in the pipeline.
nlp = spacy.load('en_core_web_md', disable = ['tagger'])
print(nlp.pipe_names)

# extract_text_from_pdf returns None when a PDF yields no text, and nlp.pipe
# only accepts strings. Substitute an empty string rather than dropping the
# entry, so that new_doc_list stays aligned one-to-one with file_name.
new_doc_list = list(nlp.pipe(text or '' for text in doc_list))

['parser', 'ner']


In [7]:
# Create pattern Doc objects and add them to the matcher
from spacy.matcher import PhraseMatcher

# searchList is lower-cased, but PhraseMatcher compares the verbatim token text
# by default, so 'FinTech', 'Blockchain', 'ICO', ... in a document would never
# match. Matching on the LOWER attribute makes the search case-insensitive.
# (The `attr` argument needs spaCy >= 2.1.)
matcher = PhraseMatcher(nlp.vocab, attr='LOWER')

# Patterns only need tokenising, so use make_doc instead of the full pipeline.
patterns = [nlp.make_doc(term) for term in searchList]
matcher.add('Fintech', None, *patterns)

# For each document: the text of the sentence containing each keyword match.
# A sentence appears once per match, so it can repeat within a document.
pos_sent =[]

for doc in new_doc_list:
    matches = matcher(doc)
    pos_sent.append([doc[start: end].sent.text for match_id, start, end in matches])

pos_sent

[['Gradual opening of the financial sector to foreign investors could improve services and transfer technology and know how.',
  'Also, gradual entry of global banks into Ethiopia would facilitate transmission of know-how and technology and help address CBR losses.',
  'A gradual process of opening the financial sector to foreign participation would improve financial services availability while increasing FDI and encouraging transmission of technology and know-how.',
  'Fuel and commodity exporters as well as small and micro-states were excluded from the sample.',
  'The near-term export outlook reflects the positive impact of earlier infrastructure investments including the Hawassa Industrial Park, the new railway line to Djibouti, and hydropower facilities and electricity transmission lines that have come online.'],
 ['In addition, legislation regulating and widening the range of activities of micro-financial institutions (MFIs) is currently before parliament, and the CBK is working 

In [8]:
# Spot check: first matched sentence of the second document, with leading and
# trailing whitespace removed.
pos_sent[1][0].strip()

'In addition, legislation regulating and widening the range of activities of micro-financial institutions (MFIs) is currently before parliament, and the CBK is working on regulation allowing for the creation of investment funds.'

In [60]:
from spacy import displacy
# Visualise the named entities in tokens 10-99 of the fourth document.
displacy.render(new_doc_list[3][10:100], style='ent')

In [61]:
# Print text, entity id and label for each entity in tokens 10-99 of the
# fourth document.
preview_span = new_doc_list[3][10:100]
for entity in preview_span.ents:
    print(entity.text, entity.ent_id, entity.label_)

BOLIVIA 0 GPE
2018 0 DATE
IMF 0 ORG
every year 0 DATE
the 2018 Article IV 0 EVENT
Bolivia 0 GPE
the Executive Board 0 ORG
November 9 0 DATE
2018 0 DATE


In [65]:
# Lower-cased text of every GPE entity found within the first 100 tokens of
# each document.
country = [
    entity.lower_
    for parsed_doc in new_doc_list
    for entity in parsed_doc[0:100].ents
    if entity.label_ == 'GPE'
]
len(country)

49

In [80]:
# For every document keep its fourth and fifth sentence (slice 3:5), stored
# as `title`.
title = []
for parsed_doc in new_doc_list:
    sentences = list(parsed_doc.sents)
    title.append(sentences[3:5])

# One title entry per document.
assert len(title) == len(new_doc_list)

In [81]:
# One row per file: its title sentences, its list of matched sentences and its name.
output = pd.DataFrame({'title': title, 'sentences': pos_sent, 'file': file_name})

# Reshape the per-file sentence lists to long form: one row per matched
# sentence, indexed by the row label of the file it came from.
output_long = (
    output
    .apply(lambda row: pd.Series(row['sentences']), axis = 1)  # one column per sentence
    .stack()                                                   # wide -> long
    .reset_index(level=1, drop = True)                         # drop the sentence position
    .rename('context')
    .to_frame()
)

In [82]:
# Attach every matched sentence to the row of its file by joining on the index.
# The outer join keeps rows that exist on only one side.
final_output = output.merge(
    output_long,
    how = 'outer',
    left_index=True,
    right_index=True,
)

In [83]:
# Write the title, file name and matched sentence of every row to output.csv
# in the working directory.
final_output.to_csv('output.csv', columns=['title', 'file', 'context'])