In [986]:
from operator import itemgetter
import pandas as pd
import fitz
import json
from bs4 import BeautifulSoup
def fonts(doc, granularity=False):
    """Extracts fonts and their usage in PDF documents.
    :param doc: PDF document to iterate through
    :type doc: <class 'fitz.fitz.Document'>
    :param granularity: also use 'font', 'flags' and 'color' to discriminate text
    :type granularity: bool
    :rtype: [(font_size, count), (font_size, count}], dict
    :return: most used fonts sorted by count, font style information
    """
    styles = {}
    font_counts = {}

    for page in doc:
        blocks = page.getText("dict")["blocks"]
        for b in blocks:  # iterate through the text blocks
            if b['type'] == 0:  # block contains text
                for l in b["lines"]:  # iterate through the text lines
                    for s in l["spans"]:  # iterate through the text spans
                        if granularity:
                            identifier = "{0}_{1}_{2}_{3}".format(s['size'], s['flags'], s['font'], s['color'])
                            styles[identifier] = {'size': s['size'], 'flags': s['flags'], 'font': s['font'],
                                                  'color': s['color']}
                        else:
                            identifier = "{0}".format(s['size'])
                            styles[identifier] = {'size': s['size'], 'font': s['font']}

                        font_counts[identifier] = font_counts.get(identifier, 0) + 1  # count the fonts usage

    font_counts = sorted(font_counts.items(), key=itemgetter(1), reverse=True)

    if len(font_counts) < 1:
        raise ValueError("Zero discriminating fonts found!")

    return font_counts, styles


In [987]:
import camelot

In [988]:
doc = fitz.open('/home/jupyter/fpr_pdf3.pdf')

In [989]:
import fitz  # this is pymupdf

with fitz.open('/home/jupyter/fpr_pdf3.pdf') as doc:
    text = ""
    for page in doc:
        text += page.getText()
        #print(text)

#print(text)

In [990]:
doc = fitz.open('/home/jupyter/fpr_pdf3.pdf')

In [991]:
font_counts, styles = fonts(doc, granularity=False)

In [992]:
font_counts, styles

([('9.960000038146973', 724),
  ('9.0', 18),
  ('14.039999961853027', 6),
  ('12.960000038146973', 4),
  ('11.039999961853027', 3),
  ('6.480000019073486', 2),
  ('21.959999084472656', 1)],
 {'21.959999084472656': {'size': 21.959999084472656, 'font': 'ArialMT'},
  '9.960000038146973': {'size': 9.960000038146973, 'font': 'ArialMT'},
  '12.960000038146973': {'size': 12.960000038146973, 'font': 'ArialMT'},
  '14.039999961853027': {'size': 14.039999961853027, 'font': 'ArialMT'},
  '9.0': {'size': 9.0, 'font': 'ArialMT'},
  '6.480000019073486': {'size': 6.480000019073486, 'font': 'ArialMT'},
  '11.039999961853027': {'size': 11.039999961853027,
   'font': 'BCDFEE+Calibri'}})

In [993]:
def font_tags(font_counts, styles):
    """Returns dictionary with font sizes as keys and tags as value.
    :param font_counts: (font_size, count) for all fonts occuring in document
    :type font_counts: list
    :param styles: all styles found in the document
    :type styles: dict
    :rtype: dict
    :return: all element tags based on font-sizes
    """
    p_style = styles[font_counts[0][0]]  # get style for most used font by count (paragraph)
    p_size = p_style['size']  # get the paragraph's size

    # sorting the font sizes high to low, so that we can append the right integer to each tag 
    font_sizes = []
    for (font_size, count) in font_counts:
        font_sizes.append(float(font_size))
    font_sizes.sort(reverse=True)

    # aggregating the tags for each font size
    idx = 0
    size_tag = {}
    for size in font_sizes:
        idx += 1
        if size == p_size:
            idx = 0
            size_tag[size] = '<p>'
        if size > p_size:
            size_tag[size] = '<h{0}>'.format(idx)
        elif size < p_size:
            size_tag[size] = '<s{0}>'.format(idx)

    return size_tag

In [994]:
def headers_para(doc, size_tag):
    """Scrapes headers & paragraphs from PDF and return texts with element tags.
    :param doc: PDF document to iterate through
    :type doc: <class 'fitz.fitz.Document'>
    :param size_tag: textual element tags for each size
    :type size_tag: dict
    :rtype: list
    :return: texts with pre-prended element tags
    """
    header_para = []  # list with headers and paragraphs
    first = True  # boolean operator for first header
    previous_s = {}  # previous span

    for page in doc:
        blocks = page.getText("dict")["blocks"]
        for b in blocks:  # iterate through the text blocks
            if b['type'] == 0:  # this block contains text
                # REMEMBER: multiple fonts and sizes are possible IN one block
                block_string = ""  # text found in block
                for l in b["lines"]:  # iterate through the text lines
                    for s in l["spans"]:  # iterate through the text spans
                        if s['text'].strip():  # removing whitespaces:
                            if first:
                                previous_s = s
                                first = False
                                block_string = size_tag[s['size']] + s['text']
                            else:
                                if s['size'] == previous_s['size']:

                                    if block_string and all((c == "|") for c in block_string):
                                        # block_string only contains pipes
                                        block_string = size_tag[s['size']] + s['text']
                                    if block_string == "":
                                        # new block has started, so append size tag
                                        block_string = size_tag[s['size']] + s['text']
                                    else:  # in the same block, so concatenate strings
                                        block_string += " " + s['text']

                                else:
                                    header_para.append(block_string)
                                    block_string = size_tag[s['size']] + s['text']

                                previous_s = s

                    # new block started, indicating with a pipe
                    block_string += "|"

                header_para.append(block_string)
    return header_para


In [995]:
#doc = fitz.open('/home/jupyter/FPR_281020_3.pdf')
document = '/home/jupyter/fpr_pdf3.pdf'
doc = fitz.open(document)
toc = doc.getToC()

font_counts, styles = fonts(doc, granularity=False)

size_tag = font_tags(font_counts, styles)

elements = headers_para(doc, size_tag)

with open("doc2.json", 'w') as json_out:
    json.dump(elements, json_out)

In [996]:
tables = camelot.read_pdf(document , flavor='stream')

In [997]:
noway = tables[2].df

#df.to_csv()

In [998]:
noway

Unnamed: 0,0,1
0,xisting policy,
1,This will Field_15 the following Field_16:,
2,R\nepeat for multiple policies,
3,Name,Elnaz Nobari
4,Product,potatoe Level Life Cover tomatoe
5,Sum assured,£13
6,Term,200 years 10 millenium
7,Premium,£400 a month
8,In trust,Yes
9,Guaranteed insurability,No


In [999]:
way = noway[noway[1] != '']
way.columns = ["Field","Value"]
print(way)

                     Field                             Value
3                     Name                      Elnaz Nobari
4                  Product  potatoe Level Life Cover tomatoe
5              Sum assured                               £13
6                     Term            200 years 10 millenium
7                  Premium                      £400 a month
8                 In trust                               Yes
9  Guaranteed insurability                                No


### fitz.Document.getXmlMetadata('/home/jupyter/FPR_281020_3.pdf')

In [1000]:
#!/usr/bin/python

#from bs4 import BeautifulSoup

with open('doc2.json', 'r') as f:

    contents = f.read()

    soup = BeautifulSoup(contents, 'lxml')

    #print(soup.find('ul', attrs={ 'id' : 'mylist'}))
    #print(soup.find('ul', id='h1'))
    root = soup.body
        #root = soup.body
    for tag in soup.find_all('h1'):
        out_name = f'{tag.name}: {tag.text}'
    #root_childs = [e.name for e in root.descendants if e.name is not None]
    #print(root_childs)
out_name

'h1: My protection recommendations |", "", "'

In [1001]:
#contents

In [1002]:
page = doc[3]

In [1003]:
#g = page.getTextPage()

In [1004]:
#g2 = g.extractBLOCKS(3)

In [1005]:
k2 = page.getText("dict")
#doc.TextPage.extractText()

In [1006]:
#k2

In [1007]:
for p in k2['blocks']:
    #print( p['bbox'])
    if p['type'] == 0:
        for pp in p['lines']:
            for ppp in pp['spans']:
                print(ppp['text'])
                print(ppp['bbox'])
        #print(pp['spans']['bbox'])
    #print(p)
    #print(p['label'])
    #print(p['words'])
    #print(p['linking'])
    #print(p['id'])

    #print('')

Lifestyle - critical illness 
(72.02400207519531, 70.32801055908203, 211.4928741455078, 88.13504791259766)
Lifestyle protection for you 
(72.02400207519531, 86.08699035644531, 239.39312744140625, 105.3779525756836)
Summarise the customer’s needs and personal reasons for reviewing this area. Keep concise and 
(72.02400207519531, 112.68348693847656, 507.31427001953125, 123.81067657470703)
take care not to repeat information from elsewhere in the report.  
(72.02400207519531, 123.35299682617188, 359.9588928222656, 137.0380401611328)
INSERT_39 
(72.02400207519531, 135.71298217773438, 128.18887329101562, 149.3980255126953)
Repeat the following product details for each recommended policy. Ensure all details match the 
(72.02400207519531, 148.19296264648438, 495.307861328125, 161.8780059814453)
illustration(s). For joint cases, add customer names if applicable. 
(72.02400207519531, 160.55300903320312, 359.4788818359375, 174.23805236816406)
Name 
(77.42400360107422, 173.08279418945312, 107.428

In [1008]:
#soup = bs4.BeautifulSoup(k2,'xml')

In [1009]:
#soup = BeautifulSoup(xml_data, 'xml')
docx2 = 'Protection recommendations_1.docx'
document2 = zipfile.ZipFile(docx2)
xml_data = document2.read('word/document.xml')
soup = BeautifulSoup(xml_data, 'xml')

In [1011]:
#soup

In [1013]:
#with open("docsoup.json", 'w') as json_out:
    #json.dump(soup, json_out)

In [1014]:
#xml_input = open(k2,"r",encoding="utf-8")

ocr_lines = soup.findAll("span", {"class": "ocr_line"})
#We will save coordinates of line and the text contained in the line in lines_structure list
lines_structure = []
for line in ocr_lines:
    line_text = line.text.replace("\n"," ").strip()
    title = line['title']
    #The coordinates of the bounding box
    x1,y1,x2,y2 = map(int, title[5:title.find(";")].split())
    lines_structure.append({"x1":x1,"y1":y1,"x2":x2,"y2":y2,"text": line_text})

In [1015]:
lines_structure

[]

In [1016]:
import bs4
#xml_input = open("output_hocr.hocr","r",encoding="utf-8")
soup = bs4.BeautifulSoup(kxml,'lxml')
ocr_lines = soup.findAll("span", {"class": "ocr_line"})
#We will save coordinates of line and the text contained in the line in lines_structure list
lines_structure = []
for line in ocr_lines:
    line_text = line.text.replace("\n"," ").strip()
    title = line['title']
    #The coordinates of the bounding box
    x1,y1,x2,y2 = map(int, title[5:title.find(";")].split())
    lines_structure.append({"x1":x1,"y1":y1,"x2":x2,"y2":y2,"text": line_text})

NameError: name 'kxml' is not defined

In [None]:
lines_structure

In [1017]:
with open('/home/jupyter/doc.json') as json_file:
    data = json.load(json_file)

In [1018]:
datpdf = pd.DataFrame.from_dict(data)

In [1019]:
datpdf.head(50)

Unnamed: 0,0
0,||
1,|
2,<s2>HSBC UK Bank plc. Registered in England & ...
3,<s3>Authorised by the Prudential Regulation Au...
4,
5,<h1>Financial Planning Report |
6,
7,<h5>Elnaz Nobari ||||||||||||||||||
8,<p>Prepared by || INSERT adviser name and post...
9,|


In [1020]:
p = ('/home/jupyter/FPR_01112020_all.xlsm')
df = pd.read_excel(
    p ,header = None
)

In [1021]:
len(df.columns)

12

In [1022]:
for i in range(0,len(df.columns)-1):
    df[i] = df[i].str.replace("\r", "")
#df.applymap(lambda x: x.replace("\r", ""))

In [1023]:
UniqueNames = df[2].unique()

In [1024]:
def unique_non_null(s):
    return s.dropna().unique()

In [1025]:
uns = unique_non_null(df[2])

In [1026]:
uns

array(['Introduction', 'My protection recommendations',
       'Money for investment', 'My SELECT recommendations',
       'My estate planning recommendations', 'Your agreed risk level',
       'Reason for your agreed risk level', 'SELECT information',
       'Other financial planning areas', 'My other recommendations ',
       'What this will cost you', 'Risks and implications ',
       'What else you need to know', 'Documents you should read',
       'Next steps'], dtype=object)

In [1027]:
import docx
from bs4 import BeautifulSoup
import zipfile

In [1028]:
docx2 = 'Protection recommendations_1.docx'

In [1029]:
bestand = docx.Document(docx2)
tabellen = bestand.tables

In [1030]:
alletabellen = []     
for i, tabel in enumerate(tabellen):
    for row in tabellen[i].rows:
        for cell in row.cells:
            for paragraph in cell.paragraphs:
                alletabellen.append(paragraph.text)

#get data from all the dropdown lists
import lxml.etree
docx2 = 'Protection recommendations_1.docx'
document2 = zipfile.ZipFile(docx2)
xml_data = document2.read('word/document.xml')

tree = lxml.etree.fromstring(xml_data)
document2.close()

In [1031]:
from docx.api import Document
document = Document('Protection recommendations_1.docx')
table = document.tables

In [1032]:
import docx
 
# open connection to Word Document
doc = docx.Document('Protection recommendations_1.docx')
 
# read in each paragraph in file
result = [[p.text] for p in doc.paragraphs]

In [1033]:
result

[['My protection recommendations'],
 ['In this section I’ll explain my recommendations to meet your needs in the area of ‘Protecting you and your family’.'],
 ['Lifestyle - life'],
 ['Lifestyle protection for your family'],
 ['Summarise the customer’s needs and personal reasons for reviewing this area. Keep concise and take care not to repeat information from elsewhere in the report. '],
 ['INSERT_2'],
 ['Repeat the following product details for each recommended policy. Ensure all details match the illustration(s). For joint cases, add customer names if applicable.'],
 ['This will  your need to  if , to help INSERT_5 meet  financial commitments and maintain  lifestyle.'],
 [''],
 ['Multiple needs covered by single policy'],
 ['This policy will be used to cover needs with differing . I’ve recommended this, rather than separate policies, .'],
 [''],
 ['Multiple policies but single policy cheaper'],
 ['You could cover your needs with a single policy with a lower total cost. However, you p

In [1034]:
from docx2python import docx2python
 
# extract docx content
doc_result = docx2python('Protection recommendations_1.docx')
doc_result.body

[[[['My protection recommendations',
    'In this section I’ll explain my recommendations to meet your needs in the area of ‘Protecting you and your family’.',
    'Lifestyle - life',
    'Lifestyle protection for your family',
    'Summarise the customer’s needs and personal reasons for reviewing this area. Keep concise and take care not to repeat information from elsewhere in the report. ',
    'INSERT_2',
    'Repeat the following product details for each recommended policy. Ensure all details match the illustration(s). For joint cases, add customer names if applicable.']]],
 [[['Name'], ['INSERT_3']],
  [['Product'], ['S_Field_1_0 Field_1 policy type']],
  [['Field_2'], ['Field_3']],
  [['Term'], ['Field_4']],
  [['Premium'], ['£INSERT_4 a month']],
  [['Payable on'], ['Field_5']],
  [['Trust recommended'], ['Field_6']]],
 [[['',
    'This will Field_7 your need to Field_8 if Field_9, to help INSERT_5 meet Field_10 financial commitments and maintain Field_11 lifestyle.',
    '',
  

In [1035]:
pd.DataFrame(doc_result.body[6][1:])

In [1036]:
pd.DataFrame(doc_result.body[1][1:]).\
                            applymap(lambda val: val[0].strip("\t"))

Unnamed: 0,0,1
0,Product,S_Field_1_0 Field_1 policy type
1,Field_2,Field_3
2,Term,Field_4
3,Premium,£INSERT_4 a month
4,Payable on,Field_5
5,Trust recommended,Field_6


In [1037]:
import pandas as pd
pd.DataFrame(doc_result.body[1][1:])

Unnamed: 0,0,1
0,[Product],[S_Field_1_0 Field_1 policy type]
1,[Field_2],[Field_3]
2,[Term],[Field_4]
3,[Premium],[£INSERT_4 a month]
4,[Payable on],[Field_5]
5,[Trust recommended],[Field_6]


In [1040]:
from bs4 import BeautifulSoup
from html import escape
from html.parser import HTMLParser

class MyHTMLParser(HTMLParser):
    def __init__(self):
        super().__init__()
        self.__t = 0
        self.lines = []
        self.__current_line = ''
        self.__current_tag = ''

    @staticmethod
    def __attr_str(attrs):
        return ' '.join('{}="{}"'.format(name, escape(value)) for (name, value) in attrs)

    def handle_starttag(self, tag, attrs):
        if tag != self.__current_tag:
            self.lines += [self.__current_line]

        self.__current_line = '\t' * self.__t + '<{}>'.format(tag + (' ' + self.__attr_str(attrs) if attrs else ''))
        self.__current_tag = tag
        self.__t += 1

    def handle_endtag(self, tag):
        self.__t -= 1
        if tag != self.__current_tag:
            self.lines += [self.__current_line]
            self.lines += ['\t' * self.__t + '</{}>'.format(tag)]
        else:
            self.lines += [self.__current_line + '</{}>'.format(tag)]

        self.__current_line = ''

    def handle_data(self, data):
        self.__current_line += data

    def get_parsed_string(self):
        return '\n'.join(l for l in self.lines if l)


In [1042]:
parser = MyHTMLParser()
#print('custom html parser:')
#print('*' * 80)
parser.feed(str(soup))
#print(parser.get_parsed_string())

In [1043]:
soup = BeautifulSoup(xml_data, 'xml')
gegevens = soup.findAll('listItem')     #search dropdownlists (n = 12)

dropdownlist = []
dropdownlistdata = []

#for i in gegevens:
    #print(i)#dropdownlist.append(i.find('value'))

In [1044]:
cbox = []
pbox = []
divs = soup.find_all('sdt')
for div in divs:
    kaleh = div('sdtPr')
    pacheh = div('sdtContent')
    if (len(kaleh) == len(pacheh)):
        for i in range(0,len(kaleh)-1):
            k1 = kaleh[i]('comboBox')
            p1 = pacheh[i]('r')
            if k1!=[] and  p1!=[] :
                for kav in k1:

                    k2 =  [num["w:value"] for num in kav]
                    cbox.append(k2)
###here p1[0] was chosen as manual inspection 
                for div in p1[0]('t'):
                    pres = (div.text)
                    pbox.append(pres)


In [1045]:
 dropdown_result = [list(x) for x in zip(cbox, pbox)]

In [1046]:
for res22 in dropdown_result:
    res22[0].pop(0)

In [1047]:
from fuzzywuzzy import fuzz

In [1048]:
k_tables = []
k_all_tbl = []
divs = soup.find_all('tbl')
#print(divs)
for i in range(0,len(divs)):
    k_tables = []
    tbl = divs[i]
    #print(tbl, "____")
    rows = divs[i]('tr')
    #print(rows)
    for ii in range(0, len(divs[i]('tr'))):
        #row in rows:
        columns = divs[i]('tr')[ii]('tc')
        k2 =  [num("t")[0].contents for num in columns]
        k_tables.append(k2)
    #print(k_tables,"___________________________")
    k_all_tbl.append(k_tables)
print(k_all_tbl)

[[[['Name'], ['INSERT_3']], [['Product'], ['S_Field_1_0']], [['Field_2'], ['Field_3']], [['Term'], ['Field_4']], [['Premium'], ['£']], [['Payable on'], ['Field_5']], [['Trust recommended'], ['Field_6']]], [[['Name'], ['INSERT_9']], [['Product'], ['INSERT_10']], [['Sum assured'], ['£']], [['Term'], ['INSERT_13']], [['Premium'], ['£']], [['In trust'], ['Field_17']], [['Guaranteed insurability option available'], ['Field_18']]], [[['Name'], ['INSERT_40']], [['Product'], ['S_Field_34_33']], [['Sum assured'], ['£']], [['Term'], ['Field_30']], [['Premium'], ['£']]], [[['Name'], ['INSERT_46']], [['Product'], ['INSERT_47']], [['Sum assured'], ['£']], [['Term'], ['INSERT_50']], [['Premium'], ['£']], [['Guaranteed insurability option available'], ['S_FIELD_43_42']]], [[['Name'], ['INSERT_67']], [['Product'], ['S_Field_50_49']], [['Benefit amount'], ['£']], [['Term'], ['Field_44']], [['Premium'], ['£']]], [[['Name'], ['INSERT_73']], [['Product'], ['INSERT_74']], [['Benefit amount'], ['£']], [['Te

In [1050]:
for ik in range(0,len(k_all_tbl)):
    #print(1)
    for ikk in range(0,len(k_all_tbl[ik])):
        for ikkk in range(0,(len (k_all_tbl[ik][ikk]))):
        #print(2)
            for res1 in dropdown_result:
                #print (3)
                #print(k_all_tbl[ik][ikk][0][0], res1[1] )

                if k_all_tbl[ik][ikk][ikkk][0] == res1[1]:
                    k_all_tbl[ik][ikk][ikkk][0] = res1[0]


In [1051]:
#k_all_tbl.replace({'[':''}, regex=True)

In [1052]:
way

Unnamed: 0,Field,Value
3,Name,Elnaz Nobari
4,Product,potatoe Level Life Cover tomatoe
5,Sum assured,£13
6,Term,200 years 10 millenium
7,Premium,£400 a month
8,In trust,Yes
9,Guaranteed insurability,No


In [1053]:
df = pd.DataFrame()
dataframe = []
#dataframe.append(None)
for ilk in range(0,len(k_all_tbl)):
    
    df = pd.DataFrame(k_all_tbl[ilk])
    df.columns = ['Field','Value']
    for cols in df:
        for irow in range(0,len(df[cols])):
            while isinstance(df[cols][irow], list) == True and len(df[cols][irow])==1:
                    df[cols][irow] = df[cols][irow][0]
                    continue
                    if isinstance(df[cols][irow], list) == False or len(df[cols][irow])>1 :
                        break
    dataframe.append(df)

In [1054]:
import itertools
import collections
from collections.abc import Iterable

In [1055]:
list_df = dataframe[0].values.tolist()
flatten = lambda *n: (e for a in n
        for e in (flatten(*a) if isinstance(a, (tuple, list)) else (a,)))

#l1 = (list(flatten(list_df)))
list_df
    

[['Name', 'INSERT_3'],
 ['Product', ['Aegon', 'Aviva', 'HSBC', 'Zurich']],
 [['Sum assured', 'Benefit amount'],
  ['£ii_Field_52_1', '£ii_Field_53_2 a month']],
 ['Term', ['ii_Field_62_1 years', 'to age ii_Field_63_2', 'Whole of life']],
 ['Premium', '£'],
 ['Payable on', ['Death', 'First death']],
 ['Trust recommended', 'Field_6']]

In [1056]:
target = way.values.tolist()
for i in range(0,len(dataframe)):
    list_df=[]
    
    list_df = dataframe[i].values.tolist()
    flatten = lambda *n: (e for a in n
        for e in (flatten(*a) if isinstance(a, (tuple, list)) else (a,)))

    l1 = (list(flatten(list_df)))
    l2 = list(flatten(target))
    var1 = set(l1) & set(l2)
    print(var1)

{'Term', 'Name', 'Product', 'Sum assured', 'Premium'}
{'Term', 'Yes', 'In trust', 'Name', 'No', 'Product', 'Sum assured', 'Premium'}
{'Term', 'Name', 'Product', 'Sum assured', 'Premium'}
{'Term', 'Name', 'Product', 'Sum assured', 'Premium'}
{'Name', 'Product', 'Term', 'Premium'}
{'Term', 'Yes', 'Name', 'No', 'Product', 'Premium'}
{'Term', 'Name', 'Product', 'Sum assured', 'Premium'}
{'Term', 'Yes', 'In trust', 'Name', 'No', 'Product', 'Sum assured', 'Premium'}
set()


In [1070]:
dataframe[0].Field

0                             Name
1                          Product
2    [Sum assured, Benefit amount]
3                             Term
4                          Premium
5                       Payable on
6                Trust recommended
Name: Field, dtype: object

In [1083]:
myset1 = set(way.Field)
#print(myset1)
pdf_list = list(myset1)
tmpl_list=[]
tmpl_list = list(dataframe[0].Field)

#for gg in range(0,len(tmpl_list)):
    #if isinstance(tmpl_list[gg], list) == False:
        #tmpl_list[gg]= list(tmpl_list[gg])
        
    
import itertools
#lst = [['a'], ['b', 'c'], ['d', ['e', 'f']]]
outp = list(itertools.product(tmpl_list))
out = []
for i in outp:
    temp = []
    for j in i:
        if isinstance(j, list):
            for k in j:
                temp.append(k)
        else:
            temp.append(j)
    out.append(temp)
print(out)

tmpl_list2  = out

outp = list(itertools.product(*tmpl_list2))
out = []
for i in outp:
    temp = []
    for j in i:
        if isinstance(j, list):
            for k in j:
                temp.append(k)
        else:
            temp.append(j)
    out.append(temp)
print(out)

[['Name'], ['Product'], ['Sum assured', 'Benefit amount'], ['Term'], ['Premium'], ['Payable on'], ['Trust recommended']]
[['Name', 'Product', 'Sum assured', 'Term', 'Premium', 'Payable on', 'Trust recommended'], ['Name', 'Product', 'Benefit amount', 'Term', 'Premium', 'Payable on', 'Trust recommended']]


In [1082]:
out

[['N', 'P', 'Sum assured', 'T', 'P', 'P', 'T'],
 ['N', 'P', 'Sum assured', 'T', 'P', 'P', 'r'],
 ['N', 'P', 'Sum assured', 'T', 'P', 'P', 'u'],
 ['N', 'P', 'Sum assured', 'T', 'P', 'P', 's'],
 ['N', 'P', 'Sum assured', 'T', 'P', 'P', 't'],
 ['N', 'P', 'Sum assured', 'T', 'P', 'P', ' '],
 ['N', 'P', 'Sum assured', 'T', 'P', 'P', 'r'],
 ['N', 'P', 'Sum assured', 'T', 'P', 'P', 'e'],
 ['N', 'P', 'Sum assured', 'T', 'P', 'P', 'c'],
 ['N', 'P', 'Sum assured', 'T', 'P', 'P', 'o'],
 ['N', 'P', 'Sum assured', 'T', 'P', 'P', 'm'],
 ['N', 'P', 'Sum assured', 'T', 'P', 'P', 'm'],
 ['N', 'P', 'Sum assured', 'T', 'P', 'P', 'e'],
 ['N', 'P', 'Sum assured', 'T', 'P', 'P', 'n'],
 ['N', 'P', 'Sum assured', 'T', 'P', 'P', 'd'],
 ['N', 'P', 'Sum assured', 'T', 'P', 'P', 'e'],
 ['N', 'P', 'Sum assured', 'T', 'P', 'P', 'd'],
 ['N', 'P', 'Sum assured', 'T', 'P', 'a', 'T'],
 ['N', 'P', 'Sum assured', 'T', 'P', 'a', 'r'],
 ['N', 'P', 'Sum assured', 'T', 'P', 'a', 'u'],
 ['N', 'P', 'Sum assured', 'T', 'P', 'a'

In [1079]:
tmpl_list

[['N', 'a', 'm', 'e'],
 ['P', 'r', 'o', 'd', 'u', 'c', 't'],
 ['Sum assured', 'Benefit amount'],
 ['T', 'e', 'r', 'm'],
 ['P', 'r', 'e', 'm', 'i', 'u', 'm'],
 ['P', 'a', 'y', 'a', 'b', 'l', 'e', ' ', 'o', 'n'],
 ['T',
  'r',
  'u',
  's',
  't',
  ' ',
  'r',
  'e',
  'c',
  'o',
  'm',
  'm',
  'e',
  'n',
  'd',
  'e',
  'd']]

In [960]:
wrd_tbl_0 = pd.DataFrame(k_all_tbl[0])
wrd_tbl_0.columns = ['Field','Value']
#wrd_tbl_0[0] = wrd_tbl_0[0].str.get(0)
#wrd_tbl_0[0] = wrd_tbl_0[0].str.get(0)

In [827]:
#loop to remove all lists from inside dataframe and replace with values 
for cols in wrd_tbl_0:
    for irow in range(0,len(wrd_tbl_0[cols])):
        while isinstance(wrd_tbl_0[cols][irow], list) == True:
            wrd_tbl_0[cols][irow] = wrd_tbl_0[cols][irow][0]
            continue
            if isinstance(wrd_tbl_0[cols][irow], list) == False :
                break
                
        #while type(row) != str:
           # row = row[0]
          #i += 1

In [891]:
len(reviews)

9

In [925]:
way

Unnamed: 0,Field,Value
3,Name,Elnaz Nobari
4,Product,potatoe Level Life Cover tomatoe
5,Sum assured,£13
6,Term,200 years 10 millenium
7,Premium,£400 a month
8,In trust,Yes
9,Guaranteed insurability,No


In [945]:
way.Field[3]

'Name'

In [946]:
fuzz.ratio(dataframe[0].Field[0], way.Field[3])

100

In [218]:
import xmltodict

with open(soup) as fd:
    doc = xmltodict.parse(fd.read())

TypeError: expected str, bytes or os.PathLike object, not BeautifulSoup

In [357]:
data = []
for label in soup.select('comboBox'):
    data.append({ label.text.strip(): label.find_next_sibling().text.strip() })

AttributeError: 'NoneType' object has no attribute 'text'

In [221]:
for product_div in soup.find_all('w', {'w': 'value'}):
    product_tag = product_div.find('listItem')
    if product_tag:
        print (product_tag.attrs['id'])

In [227]:
all_tags = [tag.name for tag in soup.find_all()]
myset = set(all_tags)
print(myset)

{'tblW', 'color', 'pStyle', 'tblGrid', 'r', 'tc', 'sdtEndPr', 'id', 'listItem', 'b', 'tcPr', 'tblLook', 'vertAlign', 'pPr', 'lastRenderedPageBreak', 'rStyle', 'tblStyle', 'docPart', 'sdt', 'pgSz', 'rPr', 'tcBorders', 'showingPlcHdr', 'tblBorders', 'insideV', 'placeholder', 'tr', 'document', 'rFonts', 'sectPr', 'pgMar', 'tcW', 't', 'p', 'cols', 'tbl', 'body', 'tblPr', 'left', 'gridCol', 'shd', 'trHeight', 'br', 'docGrid', 'sdtPr', 'comboBox', 'trPr', 'right', 'top', 'bottom', 'iCs', 'repeatingSection', 'sdtContent', 'repeatingSectionItem', 'highlight'}


In [407]:
head_tag = soup.sdt
head_tag
for child in head_tag.children:
    k2 = (child, "_____________________________")

In [418]:
#soup.find_all('sdt')[6].contents[2]['w:t']#'t')

dali =[]
mozart = []
beeth = []
for i in range(0,len(soup.findAll('sdt'))):
    #try:
        lolz = soup.findAll('sdt')[i].contents[2]
        dali = lolz("t")
        #beeth = [num["w:t"] for num in lolz]
        mozart.append(dali)

In [420]:
#soup.findAll('sdt')[0].contents[2]
for row in tr:
    print(row.find('td', {'class': 'price'}).text)

<w:sdtContent><w:sdt><w:sdtPr><w:rPr><w:b/></w:rPr><w:id w:val="2052105906"/><w:placeholder><w:docPart w:val="FA5837C7B0AF452AA8A6CBBA1D252A1B"/></w:placeholder><w15:repeatingSectionItem/></w:sdtPr><w:sdtEndPr><w:rPr><w:b w:val="0"/></w:rPr></w:sdtEndPr><w:sdtContent><w:tbl><w:tblPr><w:tblStyle w:val="TableGrid"/><w:tblW w:type="auto" w:w="0"/><w:tblBorders><w:top w:color="auto" w:space="0" w:sz="0" w:val="none"/><w:left w:color="auto" w:space="0" w:sz="0" w:val="none"/><w:bottom w:color="auto" w:space="0" w:sz="0" w:val="none"/><w:right w:color="auto" w:space="0" w:sz="0" w:val="none"/><w:insideV w:color="auto" w:space="0" w:sz="0" w:val="none"/></w:tblBorders><w:tblLook w:firstColumn="1" w:firstRow="1" w:lastColumn="0" w:lastRow="0" w:noHBand="0" w:noVBand="1" w:val="04A0"/></w:tblPr><w:tblGrid><w:gridCol w:w="2589"/><w:gridCol w:w="6437"/></w:tblGrid><w:tr w14:paraId="06FE69F0" w14:textId="77777777" w:rsidR="00F01ED7" w:rsidRPr="002E1A73" w:rsidTr="00DB6E7E"><w:tc><w:tcPr><w:tcW w:t

In [429]:
#mozart

In [716]:
#soup.find_all('sdt')[3].contents

#dali =[]
#mozart = []
#beeth = []
#for i in range(3,len(soup.findAll('sdt'))):
    #try:
        #lolz = soup.findAll('comboBox')[i].contents
        #dali = [num["w:value"] for num in lolz]
        #beeth = [num["w:t"] for num in lolz]
        #mozart.append([beeth, dali])

In [335]:
i_tag = soup.find_all('comboBox')[11]

for i in soup.find_all('comboBox'):
    #print(child)
    my_text = str(i.previousSibling).strip()
    print(my_text, "-------------------------") 

In [356]:
import re
maxdepth = 1
indent_size = 2
x = (soup.prettify())

x2 = xmltodict.parse(x, process_namespaces=True)

for key in x2:
   k =(json.dumps(x2[key], indent=4, default=str))

In [270]:
print(head_tag)

None


In [315]:
t1 = soup.findAll('comboBox')[90].contents#['listItem']
#title_tag = t1[0]
t1

[<w:listItem w:value="Choose an item."/>,
 <w:listItem w:displayText="the premiums are the lowest of all the companies on our list of recommended insurers." w:value="the premiums are the lowest of all the companies on our list of recommended insurers."/>,
 <w:listItem w:displayText="the features and benefits of the policy meet your specific requirements and the premiums are the lowest of all the companies on our list of recommended insurers." w:value="the features and benefits of the policy meet your specific requirements and the premiums are the lowest of all the companies on our list of recommended insurers."/>,
 <w:listItem w:displayText="the features and benefits of the policy meet your specific requirements." w:value="the features and benefits of the policy meet your specific requirements."/>,
 <w:listItem w:displayText="ii_Field_995_1 other reason." w:value="ii_Field_995_1 other reason."/>]

In [324]:
dali =[]
mozart = []
for i in range(0,len(soup.findAll('comboBox'))):
        lolz = soup.findAll('comboBox')[i].contents
        dali = [num["w:value"] for num in lolz]
        mozart.append(dali)
    #t1 = soup.findAll('comboBox')[i].contents
    #for j in range(0, len(soup.findAll('comboBox')[i].contents)):
        #for ii in in range(1, len(soup.findAll('comboBox')[i].contents[j])):
   
    
        
    

In [325]:
mozart

[['Choose an item.', 'Aegon', 'Aviva', 'HSBC', 'Zurich'],
 ['Choose an item.',
  'Level Life Cover',
  'Family Income Benefit',
  'Whole of Life Cover'],
 ['Choose an item.', 'Sum assured', 'Benefit amount'],
 ['Choose an item.', '£ii_Field_52_1', '£ii_Field_53_2 a month'],
 ['Choose an item.',
  'ii_Field_62_1 years',
  'to age ii_Field_63_2',
  'Whole of life'],
 ['Choose an item.', 'Death', 'First death'],
 ['Choose an item.', 'Yes', 'No'],
 ['Choose an item.', 'meet', 'partially meet'],
 ['Choose an item.',
  'replace your earned income',
  'provide a lump sum',
  'provide a monthly income'],
 ['Choose an item.', 'you die', 'either of you dies'],
 ['Choose an item.', 'their', 'his', 'her'],
 ['Choose an item.', 'their', 'his', 'her'],
 ['Choose an item.', 'sums assured and terms', 'sums assured', 'terms'],
 ['Choose an item.',
  'as the total cost is lower',
  'as you feel a single policy is more convenient, despite costing more',
  'because ii_Field_154_1'],
 ['Choose an item.', '

In [289]:
len(soup.findAll('comboBox'))

91

In [287]:
len(t1)

5

In [317]:
t1[1]

<w:listItem w:displayText="the premiums are the lowest of all the companies on our list of recommended insurers." w:value="the premiums are the lowest of all the companies on our list of recommended insurers."/>

In [255]:
t1.attrs

{}

In [251]:
soup.comboBox.lisItem

In [243]:
t = soup.findAll('listItem')[2]
type(t)
t.text
type(t.attrs)
t.attrs #['w:value']

{'w:displayText': 'Aviva', 'w:value': 'Aviva'}

In [None]:
links = soup.find_all('w')

In [None]:
for link in links:
    print(link.attrs['listItem'])

In [None]:
soup.listItem.value

In [None]:
import argparse
from xml.etree import ElementTree


# ------------------------------------------------------------------------------
def reformat(
        input_xml: str,
        output_xml: str,
):
    tree = ElementTree.parse(input_xml)

    # remove extraneous newlines and whitespace from text elements
    for element in tree.getiterator():
        if element.text:
            element.text = element.text.strip()

    # write the updated XML into the annotations output directory
    tree.write(output_xml)


# ------------------------------------------------------------------------------
if __name__ == "__main__":

    # parse the command line arguments
    args_parser = argparse.ArgumentParser()
    args_parser.add_argument(
        "--in",
        required=True,
        type=str,
        help="file path of original XML",
    )
    args_parser.add_argument(
        "--out",
        required=True,
        type=str,
        help="file path of reformatted XML",
    )
    args = vars(args_parser.parse_args())

    reformat(
        args["in"],
        args["out"],
    )

In [None]:
 #with open(r'Protection recommendations_1.docx', encoding="utf8", errors='ignore') as f:
    #soup5 = BeautifulSoup(f, 'lxml')

In [None]:
all_tags = [tag.name for tag in soup.find_all()]

In [None]:
myset = set(all_tags)
print(myset)

In [None]:
for row in soup.find_all('column'):
    print(row.text)

In [None]:
import lxml.etree
document = zipfile.ZipFile('Protection recommendations_1.docx')
xml_content = document.read('word/document.xml')
#tree = lxml.etree.parse(xml_content)

In [None]:
from lxml import etree
from io import BytesIO

myString = xml_content

tree = etree.parse(BytesIO(myString))

In [None]:
pi = tree.xpath("//processing-instruction()")[0] 

In [None]:
gegevans

In [428]:
#soup.findAll('tbl')
#print (soup.prettify())

In [None]:
soup.findAll('listItem') 

In [None]:
#convert to string for if statements
number = str(dropdownlist[0])
job = str(dropdownlist[1])
vehicle = str(dropdownlist[7])

if number == '<w:result w:val="1"/>' :
    dropdownlistdata.append('0,3')
elif number == '<w:result w:val="2"/>' :
    dropdownlistdata.append('1,2')
elif number == '<w:result w:val="3"/>' :
    dropdownlistdata.append('onbekend')
else:
    dropdownlistdata.append('geen')

if job  == '<w:result w:val="1"/>' :
    dropdownlistdata.append('nee')
else:
    dropdownlistdata.append('ja')

if vehicle == '<w:result w:val="1"/>' :
    dropdownlistdata.append('nee')
else:
    dropdownlistdata.append('ja')

#show data
print (alletabellen)
print (dropdownlistdata)

In [None]:
Sec_title = list(df[2])
Sec_title  = [x for x in Sec_title if str(x) != 'nan']

In [None]:
hl_title = list (df[1])
hl_title  = [x for x in hl_title if str(x) != 'nan']

In [None]:
Sec_title

In [None]:
hl_title

In [None]:
import os
import sys
scriptpath = "/home/jupyter/elmodules"
sys.path.append(os.path.abspath(scriptpath))

In [None]:
import data_func

In [None]:
import PyPDF2
#import data_func
import csv

reader = PyPDF2.PdfFileReader('/home/jupyter/FPR_281020_3.pdf')

print(reader.documentInfo)

num_of_pages = reader.numPages
print('Number of pages: ' + str(num_of_pages))

In [None]:
title_list = []
pagenum_list = []
title_formatted_list = []
for item in table_of_contents_raw:
        title, pagenum = \
            data_func.split_to_title_and_pagenum(item)
        if title != None:
            title_list.append(title)
            pagenum_list.append(pagenum)
            title_formatted_list.append(
                data_func.convert_title_to_filename(title))
            
# for page_list, we need to add the last page as well
pagenum_list.append(num_of_pages + 1)

In [None]:

for i in range(1, len(Sec_title)):
    title_formatted = Sec_title[i]
    #page_start = pagenum_list[i] - 1
    #page_end = pagenum_list[i+1] - 2
    
    writer = PyPDF2.PdfFileWriter()

    for page in range(0,num_of_pages + 1):
        writer.addPage(reader.getPage(page))
    
    output_filename = './data/original/pdfs/' + title_formatted + '.pdf'

    with open(output_filename, 'wb') as output:
        writer.write(output)