In [14]:
from os import listdir
from os.path import isfile, join
import string
import numpy as np
import pandas as pd
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
import requests

In [15]:
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from io import StringIO

def convert_pdf_to_txt(path, maxpages=0):
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    codec = 'utf-8'
    laparams = LAParams()
    device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
    fp = open(path, 'rb')
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    password = ""
    caching = True
    pagenos = set()
    for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages, password=password,caching=caching, check_extractable=True):
        interpreter.process_page(page)
    fp.close()
    device.close()
    result = retstr.getvalue()
    retstr.close()
    return result

In [29]:
DATA_DIR = './data/'
files = listdir(DATA_DIR)
files.remove('.DS_Store')
files.remove('test.pdf')
files = list(map(lambda x: join(DATA_DIR, x), files))
files[:5]

['./data/FC038033130_20130725.pdf',
 './data/FC038032234_20130716.pdf',
 './data/FC038032344_20130717.pdf',
 './data/FC038033996_20130801.pdf',
 './data/FC038032345_20130717.pdf']

In [31]:
files_text = {}
num_pages = 3
for f in files:
    print(f)
    text = convert_pdf_to_txt(f, 3)
    files_text[f] = text

./data/FC038033130_20130725.pdf
./data/FC038032234_20130716.pdf
./data/FC038032344_20130717.pdf
./data/FC038033996_20130801.pdf
./data/FC038032345_20130717.pdf
./data/FC038033013_20130724.pdf
./data/FC038033136_20130725.pdf
./data/FC038032235_20130716.pdf
./data/FC038032991_20130724.pdf
./data/FC038032491_20130719.pdf
./data/FC038032238_20130716.pdf
./data/FC038032338_20130717.pdf
./data/FC038034154_20130802.pdf
./data/FC038032490_20130719.pdf
./data/FC038032603_20130719.pdf
./data/FC038033299_20130726.pdf
./data/FC038033573_20130730.pdf
./data/FC038032492_20130719.pdf
./data/FC038033138_20130725.pdf
./data/FC038032122_20130715.pdf
./data/FC038032751_20130722.pdf
./data/FC038032875_20130723.pdf
./data/FC038033998_20130801.pdf
./data/FC038032493_20130719.pdf
./data/FC038032873_20130723.pdf
./data/FC038033572_20130730.pdf
./data/FC038033295_20130726.pdf
./data/FC038032237_20130716.pdf
./data/FC038034226_20130805.pdf
./data/FC038033134_20130725.pdf
./data/FC038033995_20130801.pdf
./data/F

In [46]:
files_lines = {}
for f in files_text:
    text = files_text[f]
    text = ''.join(x for x in text if x in string.printable)
    lines = text.split('\n')
    lines = list(map(lambda x: x.strip(), lines))
    lines = list(filter(None, lines))
    files_lines[f] = lines

In [47]:
for f in files_lines:
    saved = None
    data = []
    for line in lines:
        if saved == None:
            saved = line
            continue
        if line[0].islower() or line[0]=='(' or line[0].isdigit() or saved[-1]==':':
            saved += ' ' + line
        else:
            data.append(saved)
            saved = line
    files_lines[f] = data

In [48]:
files_lines

{'./data/FC038033130_20130725.pdf': ['COMMERZBANK AKTIENGESELLSCHAFT',
  'Frankfurt am Main',
  'ISIN DE000CZ5N8J1',
  'Final Terms dated 17 July 2013 relating to',
  'TURBO Warrants',
  'CALL relating to shares of Accor S.A. to be publicly offered in the French Republic and to be admitted to trading on Euronext Paris S.A. with respect to the',
  'Base Prospectus dated 8 May 2013 relating to',
  'TURBO Warrants',
  'Unlimited TURBO Warrants',
  'INTRODUCTION',
  'These Final Terms have been prepared for the purpose of Article 5 (4) of Directive 2003/71/EC (the  Prospectus  Directive")  as  amended  (which  includes  the  amendments  made  by  Directive 2010/73/EU (the "2010 PD Amending Directive") to the extent that such amendments have been implemented in a relevant Member State of the European Economic Area), as implemented by the relevant provisions of the EU member states, in connection with Regulation 809/2004 of the',
  'European Commission and must be read in conjunction with th

In [51]:
payload = {'query': 'test', 'project': 'current'}
r = requests.post("http://localhost:5000/parse", json=payload)
r.json()

{'intent': {'name': 'currency', 'confidence': 0.8989496388491887},
 'entities': [],
 'intent_ranking': [{'name': 'currency', 'confidence': 0.8989496388491887},
  {'name': 'date', 'confidence': 0.0364827000492327},
  {'name': 'price', 'confidence': 0.03307530342559886},
  {'name': 'size', 'confidence': 0.03149235767597991}],
 'text': 'test',
 'project': 'current',
 'model': 'model_20190201-130854'}

In [64]:
CUTOFF = 0.5
docs_data = []

for f in files_lines:
    print(f)
    lines = files_lines[f]
    doc_entities = {}
    
    for line in lines:
        payload = {'query': line, 'project': 'current'}
        r = requests.post("http://localhost:5000/parse", json=payload)
        if r.status_code != 200:
            continue

        response = r.json()
        
        for entityObj in response['entities']:
#             print(line)
#             print(entityObj)
            if (entityObj['confidence'] < CUTOFF):
                continue
            entity = entityObj['entity']
            value = entityObj['value']
            
            doc_entities[entity] = value
    docs_data.append(doc_entities)

docs_data

./data/FC038033130_20130725.pdf
Page 2
{'start': 5, 'end': 6, 'value': '2', 'entity': 'currency', 'confidence': 0.23361205530783266, 'extractor': 'ner_crf'}
Currency of the Issue: EUR
{'start': 23, 'end': 26, 'value': 'eur', 'entity': 'currency', 'confidence': 0.8851153273872928, 'extractor': 'ner_crf'}
./data/FC038032234_20130716.pdf
Page 2
{'start': 5, 'end': 6, 'value': '2', 'entity': 'currency', 'confidence': 0.23361205530783266, 'extractor': 'ner_crf'}
Currency of the Issue: EUR
{'start': 23, 'end': 26, 'value': 'eur', 'entity': 'currency', 'confidence': 0.8851153273872928, 'extractor': 'ner_crf'}
./data/FC038032344_20130717.pdf
Page 2
{'start': 5, 'end': 6, 'value': '2', 'entity': 'currency', 'confidence': 0.23361205530783266, 'extractor': 'ner_crf'}
Currency of the Issue: EUR
{'start': 23, 'end': 26, 'value': 'eur', 'entity': 'currency', 'confidence': 0.8851153273872928, 'extractor': 'ner_crf'}
./data/FC038033996_20130801.pdf
Page 2
{'start': 5, 'end': 6, 'value': '2', 'entity':

KeyboardInterrupt: 

In [54]:
all_entities = ['date', 'currency', 'price', 'size']
docs = []
for doc_entities in docs_data:
    temp = []
    for entity in all_entities:
        if entity not in doc_entities:
            temp.append(None)
        else:
            temp.append(doc_entities[entity])
    docs.append(temp)
docs

[['18 july 2013', 'eur', None, None],
 ['18 july 2013', 'eur', None, None],
 ['18 july 2013', 'eur', None, None],
 ['18 july 2013', 'eur', None, None],
 ['18 july 2013', 'eur', None, None],
 ['18 july 2013', 'eur', None, None],
 ['18 july 2013', 'eur', None, None],
 ['18 july 2013', 'eur', None, None],
 ['18 july 2013', 'eur', None, None],
 ['18 july 2013', 'eur', None, None],
 ['18 july 2013', 'eur', None, None],
 ['18 july 2013', 'eur', None, None],
 ['18 july 2013', 'eur', None, None],
 ['18 july 2013', 'eur', None, None],
 ['18 july 2013', 'eur', None, None],
 ['18 july 2013', 'eur', None, None],
 ['18 july 2013', 'eur', None, None],
 ['18 july 2013', 'eur', None, None],
 ['18 july 2013', 'eur', None, None],
 ['18 july 2013', 'eur', None, None],
 ['18 july 2013', 'eur', None, None],
 ['18 july 2013', 'eur', None, None],
 ['18 july 2013', 'eur', None, None],
 ['18 july 2013', 'eur', None, None],
 ['18 july 2013', 'eur', None, None],
 ['18 july 2013', 'eur', None, None],
 ['18 july 2

In [55]:
matrix = list(map(lambda x: list(map(lambda y: 1 if y!=None else 0, x)), docs))
matrix

[[1, 1, 0, 0],
 [1, 1, 0, 0],
 [1, 1, 0, 0],
 [1, 1, 0, 0],
 [1, 1, 0, 0],
 [1, 1, 0, 0],
 [1, 1, 0, 0],
 [1, 1, 0, 0],
 [1, 1, 0, 0],
 [1, 1, 0, 0],
 [1, 1, 0, 0],
 [1, 1, 0, 0],
 [1, 1, 0, 0],
 [1, 1, 0, 0],
 [1, 1, 0, 0],
 [1, 1, 0, 0],
 [1, 1, 0, 0],
 [1, 1, 0, 0],
 [1, 1, 0, 0],
 [1, 1, 0, 0],
 [1, 1, 0, 0],
 [1, 1, 0, 0],
 [1, 1, 0, 0],
 [1, 1, 0, 0],
 [1, 1, 0, 0],
 [1, 1, 0, 0],
 [1, 1, 0, 0],
 [1, 1, 0, 0],
 [1, 1, 0, 0],
 [1, 1, 0, 0],
 [1, 1, 0, 0],
 [1, 1, 0, 0],
 [1, 1, 0, 0],
 [1, 1, 0, 0],
 [1, 1, 0, 0],
 [1, 1, 0, 0],
 [1, 1, 0, 0]]

In [57]:
df = pd.DataFrame(matrix, columns=all_entities)
df

Unnamed: 0,date,currency,price,size
0,1,1,0,0
1,1,1,0,0
2,1,1,0,0
3,1,1,0,0
4,1,1,0,0
5,1,1,0,0
6,1,1,0,0
7,1,1,0,0
8,1,1,0,0
9,1,1,0,0
