In [1]:
from os import listdir
from os.path import isfile, join
import string
import numpy as np
import pandas as pd
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
import requests



In [2]:
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from io import StringIO

def convert_pdf_to_txt(path, maxpages=0):
    """Extract the text content of a PDF file as one string.

    Args:
        path: Filesystem path of the PDF to read.
        maxpages: Forwarded unchanged to ``PDFPage.get_pages`` as the page
            limit. The default of 0 is pdfminer's "no limit" value.

    Returns:
        Everything the ``TextConverter`` wrote while the pages were processed.

    Raises:
        Whatever ``open`` or pdfminer raise for an unreadable or
        non-extractable document; all handles are released before the
        exception propagates.
    """
    rsrcmgr = PDFResourceManager()
    with StringIO() as retstr:
        device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            # `with` guarantees the file is closed even if a page fails to parse.
            with open(path, 'rb') as fp:
                pages = PDFPage.get_pages(
                    fp,
                    set(),  # empty page-number set: no page filtering
                    maxpages=maxpages,
                    password="",
                    caching=True,
                    check_extractable=True,
                )
                for page in pages:
                    interpreter.process_page(page)
        finally:
            device.close()
        # Read the buffer before the enclosing `with` closes it.
        return retstr.getvalue()

In [3]:
# Set TEST_FLAG to True to run the notebook on the single sample document only.
TEST_FLAG = False
DATA_DIR = './data/'
# Directory entries that must never be part of the regular input set.
EXCLUDED_FILES = {'.DS_Store', 'test.pdf'}

if TEST_FLAG:
    files = [join(DATA_DIR, 'test.pdf')]
else:
    # Filter instead of list.remove(): remove() raises ValueError when an
    # entry (e.g. '.DS_Store' on a non-macOS machine) is not present.
    files = [join(DATA_DIR, name) for name in listdir(DATA_DIR)
             if name not in EXCLUDED_FILES]
files

['./data/FC038032234_20130716.pdf',
 './data/FC038032235_20130716.pdf',
 './data/FC038032122_20130715.pdf',
 './data/FC038032237_20130716.pdf',
 './data/FC038031951_20130712.pdf']

In [4]:
# Map each PDF path to the text extracted from it.
files_text = {}
# Page limit handed to convert_pdf_to_txt as its `maxpages` argument.
num_pages = 3
for f in files:
    print(f)
    text = convert_pdf_to_txt(f, num_pages)
    files_text[f] = text

./data/FC038032234_20130716.pdf
./data/FC038032235_20130716.pdf
./data/FC038032122_20130715.pdf
./data/FC038032237_20130716.pdf
./data/FC038031951_20130712.pdf


In [5]:
# For every document: drop characters outside string.printable, split the
# text on newlines, strip each piece and discard the ones that end up empty.
files_lines = {}
for f, raw_text in files_text.items():
    text = ''.join(ch for ch in raw_text if ch in string.printable)
    stripped_pieces = (piece.strip() for piece in text.split('\n'))
    lines = [piece for piece in stripped_pieces if piece]
    files_lines[f] = lines

In [6]:
def _merge_wrapped_lines(lines):
    """Join physical lines that continue the previous one into logical lines.

    A line is treated as a continuation of the text accumulated so far when it
    starts with a lowercase letter, an opening parenthesis or a digit, or when
    the accumulated text ends with a colon.

    Args:
        lines: Iterable of stripped text lines.

    Returns:
        A new list of merged lines; empty input gives an empty list.
    """
    merged = []
    current = None
    for line in lines:
        if not line:
            # Nothing to inspect; also protects the line[0] lookups below.
            continue
        if current is None:
            current = line
            continue
        first = line[0]
        if first.islower() or first == '(' or first.isdigit() or current[-1] == ':':
            current += ' ' + line
        else:
            merged.append(current)
            current = line
    # Flush the last accumulated line, otherwise the end of the document is lost.
    if current is not None:
        merged.append(current)
    return merged


# Process each document's OWN lines. Reassigning existing keys while iterating
# is safe because the dict's size does not change.
# NOTE: this rewrites files_lines in place, so re-running this cell merges the
# already-merged lines again; re-run the cell that builds files_lines first.
for f in files_lines:
    files_lines[f] = _merge_wrapped_lines(files_lines[f])

In [7]:
# Display the per-file line lists for manual inspection.
files_lines

{'./data/FC038032234_20130716.pdf': ['COMMERZBANK AKTIENGESELLSCHAFT',
  'Frankfurt am Main',
  'ISIN DE000CZ5N5S8',
  'Final Terms dated 12 July 2013 relating to',
  'TURBO Warrants',
  'CALL relating to the',
  'CAC40 Index* to be publicly offered in the French Republic and to be admitted to trading on Euronext Paris S.A. with respect to the',
  'Base Prospectus dated 8 May 2013 relating to',
  'TURBO Warrants',
  'Unlimited TURBO Warrants',
  '______________________________',
  '* "CAC40" and "CAC" are registered trademarks of Euronext N.V. or its subsidiaries.',
  'INTRODUCTION',
  'These Final Terms have been prepared for the purpose of Article 5 (4) of Directive 2003/71/EC (the  Prospectus  Directive")  as  amended  (which  includes  the  amendments  made  by  Directive 2010/73/EU (the "2010 PD Amending Directive") to the extent that such amendments have been implemented in a relevant Member State of the European Economic Area), as implemented by the relevant provisions of the EU

In [8]:
# Smoke test: send one sample query to the local parse server and show the
# raw JSON reply.
payload = {'query': 'test', 'project': 'current'}
# Without a timeout the cell hangs indefinitely when the server is unreachable.
r = requests.post("http://localhost:5000/parse", json=payload, timeout=10)
# Fail with a clear HTTP error instead of a confusing JSON decoding error.
r.raise_for_status()
r.json()

{'intent': {'name': 'currency', 'confidence': 0.8520165557197518},
 'entities': [],
 'intent_ranking': [{'name': 'currency', 'confidence': 0.8520165557197518},
  {'name': 'price', 'confidence': 0.05233856546720778},
  {'name': 'date', 'confidence': 0.0496056537996836},
  {'name': 'size', 'confidence': 0.04603922501335707}],
 'text': 'test',
 'project': 'current',
 'model': 'model_20190201-132659'}

In [9]:
# Minimum confidence an extracted entity needs in order to be kept.
CUTOFF = 0.5
# One dict per document (same order as files_lines), mapping entity -> value.
docs_data = []

# A single session reuses the HTTP connection for the many per-line requests.
with requests.Session() as session:
    for f in files_lines:
        print(f)
        lines = files_lines[f]
        doc_entities = {}

        for line in lines:
            payload = {'query': line, 'project': 'current'}
            # The timeout keeps one stuck request from blocking the whole run.
            r = session.post("http://localhost:5000/parse", json=payload, timeout=30)
            if r.status_code != 200:
                # Best effort: skip the line, but make the skip visible.
                print('  skipped line (HTTP {}): {!r}'.format(r.status_code, line[:60]))
                continue

            response = r.json()

            for entityObj in response['entities']:
                if entityObj['confidence'] < CUTOFF:
                    continue
                entity = entityObj['entity']
                value = entityObj['value']

                # A later occurrence of the same entity overwrites an earlier one.
                doc_entities[entity] = value
        docs_data.append(doc_entities)

docs_data

./data/FC038032234_20130716.pdf
./data/FC038032235_20130716.pdf
./data/FC038032122_20130715.pdf
./data/FC038032237_20130716.pdf
./data/FC038031951_20130712.pdf


[{'date': '15 july 2013', 'size': '1,000,000', 'currency': 'eur'},
 {'date': '15 july 2013', 'size': '1,000,000', 'currency': 'eur'},
 {'date': '15 july 2013', 'size': '1,000,000', 'currency': 'eur'},
 {'date': '15 july 2013', 'size': '1,000,000', 'currency': 'eur'},
 {'date': '15 july 2013', 'size': '1,000,000', 'currency': 'eur'}]

In [10]:
# Fixed column order for the per-document entity rows.
all_entities = ['date', 'currency', 'price', 'size']
# One row per document; an entity the document lacks is represented by None.
docs = [
    [found.get(entity_name) for entity_name in all_entities]
    for found in docs_data
]
docs

[['15 july 2013', 'eur', None, '1,000,000'],
 ['15 july 2013', 'eur', None, '1,000,000'],
 ['15 july 2013', 'eur', None, '1,000,000'],
 ['15 july 2013', 'eur', None, '1,000,000'],
 ['15 july 2013', 'eur', None, '1,000,000']]

In [11]:
# Presence matrix: 1 where a value was extracted, 0 where the slot is None.
matrix = [[0 if cell is None else 1 for cell in row] for row in docs]
matrix

[[1, 1, 0, 1], [1, 1, 0, 1], [1, 1, 0, 1], [1, 1, 0, 1], [1, 1, 0, 1]]

In [12]:
# Tabulate the presence matrix: one row per document, one column per entity
# name in all_entities.
df = pd.DataFrame(matrix, columns=all_entities)
df

Unnamed: 0,date,currency,price,size
0,1,1,0,1
1,1,1,0,1
2,1,1,0,1
3,1,1,0,1
4,1,1,0,1
