In [24]:
import string
import numpy as np
import pandas as pd
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
import requests



In [2]:
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from io import StringIO

def convert_pdf_to_txt(path):
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    codec = 'utf-8'
    laparams = LAParams()
    device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
    fp = open(path, 'rb')
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    password = ""
    maxpages = 0
    caching = True
    pagenos=set()
    for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages, password=password,caching=caching, check_extractable=True):
        interpreter.process_page(page)
    fp.close()
    device.close()
    str = retstr.getvalue()
    retstr.close()
    return str

In [5]:
test_file = './data/test.pdf'
text = convert_pdf_to_txt(test_file)
text

' Goal\u200b : delete all manual setup to build a goal-oriented bot (document creation, price \nestimation, etc…) \n \nProblem\u200b : the manual setup of those kind of chatbots is time-consuming and error-prone.  \nNo scripted solutions seem to be efficient enough to reduce setup duration and errors.  \nWe always turn around a useless question : is it better to use filters or gotoressources / bots.  \nCan be explained deeper if needed.  \n \nHypothesis\u200b : switching the way we see the problem could offer a solution.  \nThe final goal of these chatbots is to perform an action.  \nLet’s say the action is creation a contract for a bank’s corporate customers (guarantee \ncontracts). \n \nLet’s also say we have access to 10k+ of those contracts. \n \nHere’s our hypothesis (so the question for Dino) :  \n \n\n1) Know the list of attributes needed \n2) Turn those 10k+ pdf files into text files (OCR, some open source stuff should exist) \n3) Manually train the NLU to extract entities (nee

In [22]:
text = ''.join(x for x in text if x in string.printable)
lines = text.split('\n')
lines = list(map(lambda x: x.strip(), lines))
lines = list(filter(None, lines))
lines

['Goal : delete all manual setup to build a goal-oriented bot (document creation, price',
 'estimation, etc)',
 'Problem : the manual setup of those kind of chatbots is time-consuming and error-prone.',
 'No scripted solutions seem to be efficient enough to reduce setup duration and errors.',
 'We always turn around a useless question : is it better to use filters or gotoressources / bots.',
 'Can be explained deeper if needed.',
 'Hypothesis : switching the way we see the problem could offer a solution.',
 'The final goal of these chatbots is to perform an action.',
 'Lets say the action is creation a contract for a banks corporate customers (guarantee',
 'contracts).',
 'Lets also say we have access to 10k+ of those contracts.',
 'Heres our hypothesis (so the question for Dino) :',
 '1) Know the list of attributes needed',
 '2) Turn those 10k+ pdf files into text files (OCR, some open source stuff should exist)',
 '3) Manually train the NLU to extract entities (needed attributes) fro

In [23]:
saved = None
data = []
for line in lines:
    if saved == None:
        saved = line
        continue
    if line[0].islower():
        saved += line
    else:
        data.append(saved)
        saved = line
        
data

['Goal : delete all manual setup to build a goal-oriented bot (document creation, priceestimation, etc)',
 'Problem : the manual setup of those kind of chatbots is time-consuming and error-prone.',
 'No scripted solutions seem to be efficient enough to reduce setup duration and errors.',
 'We always turn around a useless question : is it better to use filters or gotoressources / bots.',
 'Can be explained deeper if needed.',
 'Hypothesis : switching the way we see the problem could offer a solution.',
 'The final goal of these chatbots is to perform an action.',
 'Lets say the action is creation a contract for a banks corporate customers (guaranteecontracts).',
 'Lets also say we have access to 10k+ of those contracts.',
 'Heres our hypothesis (so the question for Dino) :',
 '1) Know the list of attributes needed',
 '2) Turn those 10k+ pdf files into text files (OCR, some open source stuff should exist)',
 '3) Manually train the NLU to extract entities (needed attributes) from a sample

In [47]:
payload = {'query': 'test', 'project': 'current'}
r = requests.post("http://localhost:5000/parse", json=payload)
r.json()

{'intent': None,
 'entities': [{'start': 0,
   'end': 4,
   'value': 'test',
   'entity': 'name',
   'confidence': 0.8660116913795302,
   'extractor': 'ner_crf'}],
 'intent_ranking': [],
 'text': 'test',
 'project': 'current',
 'model': 'model_20190131-142422'}

In [53]:
doc_entities = {}
for line in lines:
    payload = {'query': line, 'project': 'current'}
    r = requests.post("http://localhost:5000/parse", json=payload)
    if r.status_code != 200:
        continue
        
    response = r.json()
    for entityObj in response['entities']:
        entity = entityObj['entity']
        value = entityObj['value']
        doc_entities[entity] = value

In [54]:
doc_entities

{'name': 'bot'}

In [56]:
docs_data = [doc_entities]

In [68]:
all_entities = ['name', 'value', 'currency']
docs = []
for doc_entities in docs_data:
    temp = []
    for entity in all_entities:
        if entity not in doc_entities:
            temp.append(None)
        else:
            temp.append(doc_entities[entity])
    docs.append(temp)
docs

[['bot', None, None]]

In [69]:
matrix = list(map(lambda x: list(map(lambda y: 1 if y!=None else 0, x)), docs))
matrix

[[1, 0, 0]]

In [71]:
df = pd.DataFrame(matrix, columns=['name', 'value', 'currency'])
df

Unnamed: 0,name,value,currency
0,1,0,0
