### This file searches PDF files for keywords and returns the sentences surrounding each match
#### ***implemented using spacy***

In [1]:
import pandas as pd
import numpy as np
import os
import io

from pdfminer.converter import TextConverter
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfpage import PDFPage

import spacy

#### Load the search words and the documents as lists

In [2]:
# Keywords to look for in the documents, normalised to lower case.
searchList = list(map(str.lower, (
    'FinTech', 'TechFin', 'digital', 'technology', 'Blockchain',
    'distributed', 'Bitcoin', 'ICO', 'cryptocurrency', 'mobile', 'online',
    'cyber', 'InsurTech', 'RegTech', 'micro',
)))

In [3]:
# Folder that holds the PDF files to be searched.
input_folder = r'U:\\My Documents\\Python\\Text Mining\\_archive\PDF Extraction\files'

# Entry names inside the folder, and the full path of each entry (same order).
file_name = os.listdir(input_folder)
file_path = list(map(lambda entry: os.path.join(input_folder, entry), file_name))

def extract_text_from_pdf(pdf_path):
    '''Return the text of all pages of a PDF as a single string.

    Each page is run through a pdfminer page interpreter whose TextConverter
    writes into an in-memory text buffer.

    pdf_path: path of the PDF file to read.

    Returns the extracted text, or None when the extracted text is empty.
    Errors from open() or from pdfminer are not caught and propagate to the
    caller; the converter and the buffer are closed in either case.
    '''
    resource_manager = PDFResourceManager()
    fake_file_handle = io.StringIO()
    converter = TextConverter(resource_manager, fake_file_handle)

    try:
        page_interpreter = PDFPageInterpreter(resource_manager, converter)

        with open(pdf_path, 'rb') as fh:
            for page in PDFPage.get_pages(fh,
                                          caching=True,
                                          check_extractable=True):
                page_interpreter.process_page(page)

        # Read the buffer before it is closed in the finally clause.
        text = fake_file_handle.getvalue()
    finally:
        # Release the handles even when a page fails to parse.
        converter.close()
        fake_file_handle.close()

    # Preserve the existing contract: empty output is reported as None.
    return text if text else None

if __name__ == '__main__':
    # Extract the text of every PDF, in the same order as file_path,
    # and report how many documents were read.
    doc_list = [extract_text_from_pdf(path) for path in file_path]
    print(len(doc_list))

27


#### Create custom component and extension attributes for phrase match

In [17]:
# `!python -m spacy download en_core_web_sm` does not work on the server;
# download the model package manually and install it with pip instead.

# Alternative kept from an earlier version (plain English pipeline):
# from spacy.lang.en import English
# nlp = English()

# Load the 'en_core_web_md' model (not the small one) and list its pipeline components.
nlp = spacy.load('en_core_web_md')
print(nlp.pipe_names)

['tagger', 'parser', 'ner']


In [18]:
# Build one pattern Doc per keyword and register them all under one rule name.
from spacy.matcher import PhraseMatcher

# searchList is lower-cased, so compare on the LOWER token attribute. The
# default comparison is on the exact token text, which would never match
# keywords written as 'FinTech', 'Blockchain', 'ICO', ... in the documents.
# NOTE: the `attr` argument requires spaCy >= 2.1.
matcher = PhraseMatcher(nlp.vocab, attr='LOWER')

# make_doc only tokenizes, which is all the matcher needs for its patterns.
patterns = [nlp.make_doc(term) for term in searchList]
matcher.add('Fintech', None, *patterns)

In [19]:
from spacy.tokens import Span

def match_component(doc):
    '''Pipeline component that replaces doc.ents with the keyword matches.

    Every match reported by the module-level ``matcher`` becomes a Span
    labelled 'fintech_related'. The resulting list is assigned to doc.ents,
    so any entities set by earlier components are discarded.

    Returns the same doc object.
    '''
    keyword_spans = []
    for _, start, end in matcher(doc):
        keyword_spans.append(Span(doc, start, end, label='fintech_related'))

    # Assigning to doc.ents overwrites whatever was there before.
    doc.ents = keyword_spans
    return doc

# Append the keyword matcher as the last pipeline step, so it runs after the
# components already loaded, then show the resulting pipeline.
# NOTE(review): passing a bare function to add_pipe looks like the spaCy v2
# calling convention — confirm before upgrading spaCy.
nlp.add_pipe(match_component, last = True)
print('Pipeline: ', nlp.pipe_names)


Pipeline:  ['tagger', 'parser', 'ner', 'match_component']


In [20]:
# Smoke test on the first document: disable the tagger, parser and NER so that
# only match_component runs, then display the matched keyword spans.
test = nlp(doc_list[0], disable = ['tagger','parser','ner'])
test.ents

(technology, technology, technology, micro, online)

####  Extension attributes for sentence number

##### Create property extension attributes for getting sentence id of span

In [21]:
def get_sent_id(span):
    '''Return the index of the sentence containing ``span`` within its own doc.

    The sentence is looked up in ``span.doc.sents`` by its start token offset,
    so the result does not depend on any global variable and is not confused
    by two sentences that happen to have the same text.

    span: the span whose enclosing sentence is wanted.

    Returns the zero-based sentence index, or None if no sentence of the doc
    starts at the same offset as ``span.sent``.
    '''
    sent_start = span.sent.start

    for i, sent in enumerate(span.doc.sents):
        if sent.start == sent_start:
            return i

    return None

# Expose the lookup as the computed attribute span._.sent_id.
# force = True replaces an existing registration of the same name (e.g. when this cell is re-run).
Span.set_extension('sent_id', getter = get_sent_id, force = True)

##### Create method extension attribute for getting sentences from doc

In [22]:
from spacy.tokens import Doc

def get_sentences(doc, start_id, end_id):
    '''Join the text of the doc's sentences in the slice [start_id:end_id].

    The indices are applied as an ordinary Python slice to the list of
    sentences of ``doc``; the selected sentence texts are joined with a
    single space. An empty slice yields an empty string.
    '''
    selected = list(doc.sents)[start_id:end_id]

    texts = []
    for sent in selected:
        texts.append(sent.text)

    return ' '.join(texts)

# Expose the helper as the method doc._.get_sent(start_id, end_id).
# force = True replaces an existing registration of the same name (e.g. when this cell is re-run).
Doc.set_extension('get_sent', method = get_sentences, force = True)

#### Process document list with spacy pipeline

In [23]:
# Run every extracted text through the full pipeline, including match_component.
# NOTE(review): extract_text_from_pdf returns None when a PDF yields no text;
# confirm that nlp.pipe tolerates such entries or filter them beforehand.
new_doc_list = list(nlp.pipe(doc_list))

In [24]:
#test
# new_doc_list[0][168:170]._.sent_id
# new_doc_list[0]._.get_sent(56,58)
# doc0 = new_doc_list[0]
# doc0.ents[1].label_

In [25]:
# For each document, collect the context (previous, matching and next sentence)
# of every keyword hit. pos_sent[i] holds the contexts found in new_doc_list[i].
pos_sent = []

for doc in new_doc_list:
    doc_para = []

    for ent in doc.ents:
        # Only keyword hits are of interest; skip any other entity label.
        if ent.label_ != 'fintech_related':
            continue

        sent_id = ent._.sent_id
        if sent_id is None:
            # The enclosing sentence could not be located; nothing to quote.
            continue

        # Clamp at 0: for a hit in the first sentence, sent_id - 1 would be -1,
        # which slices from the END of the sentence list and loses the context.
        start_id = max(sent_id - 1, 0)
        paragraphs = doc._.get_sent(start_id = start_id, end_id = sent_id + 2)
        doc_para.append(paragraphs)

    pos_sent.append(doc_para)

pos_sent

[['3 through banks could deepen financial intermediation, reduce opportunities for corruption and improve the business climate. Gradual opening of the financial sector to foreign investors could improve services and transfer technology and know how. Directors also noted that continued efforts are required to strengthen the AML/CFT framework.',
  'Ethiopia would benefit from structural reforms such as channeling the payment of taxes through banks, which would reduce taxpayers’ and administration’s costs, increase financial intermediation and development, and reduce opportunities for corruption. Also, gradual entry of global banks into Ethiopia would facilitate transmission of know-how and technology and help address CBR losses. Joining the African Continental Free Trade Agreement and accelerating progress on WTO accession would also improve access to foreign markets and support exports.',
  'Allowing the payment of taxes through banks could yield significant benefits by deepening financ

#### save output with titles

In [26]:
# For each document keep its sentences at positions 3 and 4 (a list of sentence spans) as the title.
# NOTE(review): assumes the title sits at those sentence positions in every PDF — confirm.
title = [list(d.sents)[3:5] for d in new_doc_list]
# Sanity check: one title entry per processed document.
assert len(new_doc_list)==len(title)

In [43]:
# One row per document: its title, the list of keyword contexts and the file name.
output = pd.DataFrame(data={'title': title, 'sentences': pos_sent, 'file': file_name})

# Expand each document's list of contexts into one row per context. The second
# index level created by stack() is dropped, so every context keeps the row
# index of the document it came from.
output_long = output.apply(lambda x: pd.Series(x['sentences']), axis = 1).stack().reset_index(level=1, drop = True)

output_long.name = 'context'

output_long = pd.DataFrame(output_long)

# Join on the document index so each context row carries the document's columns.
# how = 'outer' also keeps documents that have no context at all.
final_output = pd.merge(output, output_long, how = 'outer', left_index=True, right_index=True)

# Remove duplicates: keep one row per distinct context text, with the first
# title and file seen for it, order the rows by file and write them to output.csv.
# NOTE(review): rows without a context presumably drop out in the groupby — confirm.
final_output.groupby('context').aggregate({'title':'first', 'file':'first'}).sort_values('file').to_csv('output.csv')

#### Country NER isn't able to capture all countries

In [10]:
from spacy import displacy
# Render the entities of the first seven sentences of the document at index 4.
displacy.render(list(new_doc_list[4].sents)[0:7], style='ent')

  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)


In [11]:
# Print text, entity id and label of each entity within tokens 10-99 of the document at index 3.
for ent in new_doc_list[3][10:100].ents:
    print(ent.text, ent.ent_id, ent.label_)

BOLIVIA 0 GPE
2018 0 DATE
IMF 0 ORG
every year 0 DATE
the 2018 Article IV 0 EVENT
Bolivia 0 GPE
the Executive Board 0 ORG
November 9 0 DATE
2018 0 DATE


In [12]:
# Lower-cased text of every GPE entity found within the first 100 tokens of each document.
# NOTE(review): match_component assigns only the keyword spans to doc.ents, so with
# that component in the pipeline no GPE entity would remain here. The recorded
# output presumably comes from a run without it — confirm.
country = [ent.lower_ for d in new_doc_list for ent in d[0:100].ents if ent.label_ =='GPE' ]
len(country)

49