In [2]:
!pip install pikepdf
!pip install fitz

Collecting pikepdf
  Downloading pikepdf-4.4.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.0 MB)
[K     |████████████████████████████████| 2.0 MB 33.8 MB/s 
Installing collected packages: pikepdf
Successfully installed pikepdf-4.4.1
Collecting fitz
  Downloading fitz-0.0.1.dev2-py2.py3-none-any.whl (20 kB)
Collecting nipype
  Downloading nipype-1.7.0-py3-none-any.whl (3.2 MB)
[K     |████████████████████████████████| 3.2 MB 35.3 MB/s 
[?25hCollecting configobj
  Downloading configobj-5.0.6.tar.gz (33 kB)
Collecting configparser
  Downloading configparser-5.2.0-py3-none-any.whl (19 kB)
Collecting pyxnat
  Downloading pyxnat-1.4.tar.gz (78 kB)
[K     |████████████████████████████████| 78 kB 7.3 MB/s 
Collecting simplejson>=3.8.0
  Downloading simplejson-3.17.6-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (130 kB)
[K     |████████████████████████████████| 130 kB 56.8 MB/s 
Collecting rdflib>=5.0.0
  Downloading rdfl

In [2]:
!pip install PyMuPDF

Collecting PyMuPDF
  Downloading PyMuPDF-1.19.4-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (8.7 MB)
[K     |████████████████████████████████| 8.7 MB 21.8 MB/s 
[?25hInstalling collected packages: PyMuPDF
Successfully installed PyMuPDF-1.19.4


In [10]:
import pandas as pd
from operator import itemgetter
import fitz
import json
import pikepdf
import glob
import time
from tqdm import tqdm

def get_urls(document):
    pdf_file = pikepdf.Pdf.open(document)
    urls = []
    # iterate over PDF pages
    for page in pdf_file.pages: 
        try:
            for annots in page.get("/Annots"):
                try:
                    uri = annots.get("/A").get("/URI")
                    #print(uri)
                    if uri is not None:
                        urls.append(str(uri))
                except AttributeError:
                    pass
        except TypeError:
            pass
    return urls


def fonts(doc, granularity=False):
    """Extracts fonts and their usage in PDF documents.
    :param doc: PDF document to iterate through
    :type doc: <class 'fitz.fitz.Document'>
    :param granularity: also use 'font', 'flags' and 'color' to discriminate text
    :type granularity: bool
    :rtype: [(font_size, count), (font_size, count}], dict
    :return: most used fonts sorted by count, font style information
    """
    styles = {}
    font_counts = {}

    for page in doc:
        blocks = page.getText("dict")["blocks"]
        for b in blocks:  # iterate through the text blocks
            if b['type'] == 0:  # block contains text
                for l in b["lines"]:  # iterate through the text lines
                    for s in l["spans"]:  # iterate through the text spans
                        if granularity:
                            identifier = "{0}_{1}_{2}_{3}".format(s['size'], s['flags'], s['font'], s['color'])
                            styles[identifier] = {'size': s['size'], 'flags': s['flags'], 'font': s['font'],
                                                  'color': s['color']}
                        else:
                            identifier = "{0}".format(s['size'])
                            styles[identifier] = {'size': s['size'], 'font': s['font']}

                        font_counts[identifier] = font_counts.get(identifier, 0) + 1  # count the fonts usage

    font_counts = sorted(font_counts.items(), key=itemgetter(1), reverse=True)

    if len(font_counts) < 1:
        raise ValueError("Zero discriminating fonts found!")

    return font_counts, styles


def font_tags(font_counts, styles):
    """Returns dictionary with font sizes as keys and tags as value.
    :param font_counts: (font_size, count) for all fonts occuring in document
    :type font_counts: list
    :param styles: all styles found in the document
    :type styles: dict
    :rtype: dict
    :return: all element tags based on font-sizes
    """
    p_style = styles[font_counts[0][0]]  # get style for most used font by count (paragraph)
    p_size = p_style['size']  # get the paragraph's size

    # sorting the font sizes high to low, so that we can append the right integer to each tag
    font_sizes = []
    for (font_size, count) in font_counts:
        font_sizes.append(float(font_size))
    font_sizes.sort(reverse=True)

    # aggregating the tags for each font size
    idx = 0
    size_tag = {}
    for size in font_sizes:
        idx += 1
        if size == p_size:
            idx = 0
            size_tag[size] = '<p>'
        if size > p_size:
            size_tag[size] = '<h{0}>'.format(idx)
        elif size < p_size:
            size_tag[size] = '<s{0}>'.format(idx)

    return size_tag


def headers_para(doc, size_tag):
    """Scrapes headers & paragraphs from PDF and return texts with element tags.
    :param doc: PDF document to iterate through
    :type doc: <class 'fitz.fitz.Document'>
    :param size_tag: textual element tags for each size
    :type size_tag: dict
    :rtype: list
    :return: texts with pre-prended element tags
    """
    header_para = []  # list with headers and paragraphs
    first = True  # boolean operator for first header
    previous_s = {}  # previous span

    for page in doc:
        blocks = page.getText("dict")["blocks"]
        for b in blocks:  # iterate through the text blocks
            if b['type'] == 0:  # this block contains text

                # REMEMBER: multiple fonts and sizes are possible IN one block

                block_string = ""  # text found in block
                for l in b["lines"]:  # iterate through the text lines
                    for s in l["spans"]:  # iterate through the text spans
                        if s['text'].strip():  # removing whitespaces:
                            if first:
                                previous_s = s
                                first = False
                                block_string = size_tag[s['size']] + s['text']
                            else:
                                if s['size'] == previous_s['size']:

                                    if block_string and all((c == "|") for c in block_string):
                                        # block_string only contains pipes
                                        block_string = size_tag[s['size']] + s['text']
                                    if block_string == "":
                                        # new block has started, so append size tag
                                        block_string = size_tag[s['size']] + s['text']
                                    else:  # in the same block, so concatenate strings
                                        block_string += " " + s['text']

                                else:
                                    header_para.append(block_string)
                                    block_string = size_tag[s['size']] + s['text']

                                previous_s = s

                    # new block started, indicating with a pipe
                    block_string += "|"

                header_para.append(block_string)

    return header_para


def get_cx_tags():

    document = r'/content/386353_493110N WTW_redacted.pdf'
    #selected_docs = ['rx70252.pdf','rx63069.pdf','rx76207.pdf','rx66176.pdf','rx2435.pdf','rx51665.pdf']
    # selected_docs= ex_df.filename.tolist()
    # path= r'/home/maghosh/share/MRC/data/all_docs'
    # filename_list=[]
    # for each in selected_docs:
    #     filename_list.append(path + "/"+each)
        
#     ############ for extra file #######################
#     path= r'/home/dichowd/share/MRC/Algo_Code/Final_workspace'
#     filename_list = glob.glob(path + "/*.pdf")
    ##############################

    # no_of_files=len(filename_list)
    result_df = pd.DataFrame()
    start_time = time.time()
    # for j in tqdm(range(0,no_of_files)):
    for j in tqdm(range(0,1)):    
        try:
            result_dict={}
            #filename=filename_list[j]
            filename = document
            doc = fitz.open(filename)
            urls = get_urls(filename)
            content_id = filename.strip(path).strip('rx').split('.')[0]
            font_counts, styles = fonts(doc, granularity=False)

            size_tag = font_tags(font_counts, styles)

            elements = headers_para(doc, size_tag)
            head_txt = [i.replace('|','').strip('<h1>') for i in elements if i.startswith('<h') ]
            result_dict['content_id'] = content_id
            result_dict["content_headers"] = head_txt
            re_url= re.findall(r'(https?://\S+/|www.\S+)', ' '.join(elements))

            result_dict['additional_links'] = list(set(urls + re_url))
            result_dict['text'] = " ".join(elements).replace('|','')
            result_df = result_df.append(result_dict, ignore_index=True)
        except Exception as inst:
            print(type(inst))    # the exception instance
            continue
            result_dict["filename"]=os.path.basename(filename)
    return  result_df 



if __name__ == '__main__':
    output = get_cx_tags()

    

100%|██████████| 1/1 [00:00<00:00, 33.24it/s]

<class 'NameError'>





In [13]:
print(output)

Empty DataFrame
Columns: []
Index: []


In [27]:
from operator import itemgetter
import fitz
import json


def fonts(doc, granularity=False):
    """Extracts fonts and their usage in PDF documents.
    :param doc: PDF document to iterate through
    :type doc: <class 'fitz.fitz.Document'>
    :param granularity: also use 'font', 'flags' and 'color' to discriminate text
    :type granularity: bool
    :rtype: [(font_size, count), (font_size, count}], dict
    :return: most used fonts sorted by count, font style information
    """
    styles = {}
    font_counts = {}

    for page in doc:
        blocks = page.getText("dict")["blocks"]
        for b in blocks:  # iterate through the text blocks
            if b['type'] == 0:  # block contains text
                for l in b["lines"]:  # iterate through the text lines
                    for s in l["spans"]:  # iterate through the text spans
                        if granularity:
                            identifier = "{0}_{1}_{2}_{3}".format(s['size'], s['flags'], s['font'], s['color'])
                            styles[identifier] = {'size': s['size'], 'flags': s['flags'], 'font': s['font'],
                                                  'color': s['color']}
                        else:
                            identifier = "{0}".format(s['size'])
                            styles[identifier] = {'size': s['size'], 'font': s['font']}

                        font_counts[identifier] = font_counts.get(identifier, 0) + 1  # count the fonts usage

    font_counts = sorted(font_counts.items(), key=itemgetter(1), reverse=True)

    if len(font_counts) < 1:
        raise ValueError("Zero discriminating fonts found!")

    return font_counts, styles


def font_tags(font_counts, styles):
    """Returns dictionary with font sizes as keys and tags as value.
    :param font_counts: (font_size, count) for all fonts occuring in document
    :type font_counts: list
    :param styles: all styles found in the document
    :type styles: dict
    :rtype: dict
    :return: all element tags based on font-sizes
    """
    p_style = styles[font_counts[0][0]]  # get style for most used font by count (paragraph)
    p_size = p_style['size']  # get the paragraph's size

    # sorting the font sizes high to low, so that we can append the right integer to each tag
    font_sizes = []
    for (font_size, count) in font_counts:
        font_sizes.append(float(font_size))
    font_sizes.sort(reverse=True)

    # aggregating the tags for each font size
    idx = 0
    size_tag = {}
    for size in font_sizes:
        idx += 1
        if size == p_size:
            idx = 0
            size_tag[size] = '<p>'
        if size > p_size:
            size_tag[size] = '<h{0}>'.format(idx)
        elif size < p_size:
            size_tag[size] = '<s{0}>'.format(idx)

    return size_tag


def headers_para(doc, size_tag):
    """Scrapes headers & paragraphs from PDF and return texts with element tags.
    :param doc: PDF document to iterate through
    :type doc: <class 'fitz.fitz.Document'>
    :param size_tag: textual element tags for each size
    :type size_tag: dict
    :rtype: list
    :return: texts with pre-prended element tags
    """
    header_para = []  # list with headers and paragraphs
    first = True  # boolean operator for first header
    previous_s = {}  # previous span

    for page in doc:
        blocks = page.getText("dict")["blocks"]
        for b in blocks:  # iterate through the text blocks
            if b['type'] == 0:  # this block contains text

                # REMEMBER: multiple fonts and sizes are possible IN one block

                block_string = ""  # text found in block
                for l in b["lines"]:  # iterate through the text lines
                    for s in l["spans"]:  # iterate through the text spans
                        if s['text'].strip():  # removing whitespaces:
                            if first:
                                previous_s = s
                                first = False
                                block_string = size_tag[s['size']] + s['text']
                            else:
                                if s['size'] == previous_s['size']:

                                    if block_string and all((c == "|") for c in block_string):
                                        # block_string only contains pipes
                                        block_string = size_tag[s['size']] + s['text']
                                    if block_string == "":
                                        # new block has started, so append size tag
                                        block_string = size_tag[s['size']] + s['text']
                                    else:  # in the same block, so concatenate strings
                                        block_string += " " + s['text']

                                else:
                                    header_para.append(block_string)
                                    block_string = size_tag[s['size']] + s['text']

                                previous_s = s

                    # new block started, indicating with a pipe
                    block_string += "|"

                header_para.append(block_string)

    return header_para


def main():

    document = '/content/386353_493110N WTW_redacted.pdf'
    doc = fitz.open(document)

    font_counts, styles = fonts(doc, granularity=False)

    size_tag = font_tags(font_counts, styles)

    elements = headers_para(doc, size_tag)
    final_text = '\n'.join(elements)
    final_text = final_text.replace('|','').replace('..','').replace('  ',' ')

    with open("doc.json", 'w') as json_out:
        json.dump(elements, json_out)

    return final_text


if __name__ == '__main__':
    output_text = main()

Deprecation: 'getText' removed from class 'Page' after v1.19 - use 'get_text'.


In [28]:
print(output_text)





<h3>Underwriting report, 



<p>Report date: September 2019 

<s2>1 


<p>Contents 
<p>Report purpose and limitations 3
<p>1. Executive summary 4
<p>2. Company Profile .4
<p>3. Location/insured premises5
<p>4. Activities on site .7
<p>5. Working hours 9
<p>6. Routines and general safety .9
<p>7. Fire safety 11
<p>8. Fire detection and alarm system . 13
<p>9. Burglar alarm and protection . 18
<p>10. IT systems 18
<p>11. External production equipment and storage . 18
<p>12. Utilities 19
<p>13. Estimated building values 20
<p>14. Business interruption . 22
<p>15. Loss Estimates 22
<p>16. Reccommendations 24
<p>17. Photo documentation . 26
<p>18. Appendix Fire technical drawing . 31
<p>19. Appendix NatCat . 32



<s6>- CONFIDENTIALITY – 

<s2>This document belongs to  and any alteration or distribution is not allowed without a prior accept or 
<s2>authorization by .


<p>Disclaimer 

<s1>The purpose of this report is to provide underwriters with underwriting information and to assist t

In [None]:
def find_str(s, char):
    index = 0
    if char in s:
        c = char[0]
        for ch in s:
            if ch == c:
                if s[index:index+len(char)] == char:
                    return index
            index += 1
    return -1


def find_tags(s, char):
    index = 0
    index_list = []
    if char in s:
        c = char[0]
        for ch in s:
            if ch == c:
                if s[index:index+len(char)] == char:
                    index_list.append(index)
            index += 1
        return index_list
    return -1

def list_insert(list, n):
      
    # Searching for the position
    for i in range(len(list)):
        if list[i] > n:
            index = i
            break
      
    # Inserting n in the list
    list = list[:i] + [n] + list[i:]
    return list

def get_index(objects,elem):    
    previous = next_ = None
    l = len(objects)
    for index, obj in enumerate(objects):
        if obj == elem:
            if index > 0:

                previous = objects[index - 1]
            if index == 0:
                previous = objects[index]
            if index < (l - 1):
                next_ = objects[index + 1]
            if index == (l - 1):
                next_ = objects[index]
    
    return int(previous),int(elem),int(next_)

def get_idx_para(a_list,elem):
    for index, elem in enumerate(a_list):
        if (index+1 < len(a_list) and index - 1 >= 0): #Check index bounds

            prev_el = str(a_list[index-1])
            curr_el = str(elem)
            if index+4 < len(a_list) :
                next_el = str(a_list[index+4])
            else:
                next_el = str(a_list[-1])
            #print(prev_el, curr_el, next_el)
            return int(prev_el), int(curr_el), int(next_el)
    return -1