In [24]:
import os
import re
from collections import Counter
import json
import pymupdf


def get_toc(doc):
    toc = doc.get_toc()  # format: [level, title, page]
    toc_with_end = []

    for i, (level, title, start_page) in enumerate(toc):
        # Look ahead for the next section at same or higher level
        end_page = doc.page_count  # default: end of document

        for j in range(i + 1, len(toc)):
            next_level, _, next_start = toc[j]
            if next_level <= level:
                end_page = toc[j][2]
                break

        toc_with_end.append(
            (level,
            title,
            start_page,
            end_page)
        )

    return toc_with_end
    

# Getting the info from PDF
def info_extract(doc):
    # Get Table of Contents
    toc = get_toc(doc)
    sec_names = [toc[i][1].lower() for i in range(len(toc))]
    
    # Extracting the sections 
    sections = []
    for sec in toc:
        if sec[0]==2:
            sections.append(sec[1].lower()) 

    # Defining page height from first page
    pheight = doc[0].rect.height
    # Defining frame height 
    pframe = 50
    
    # Computing dominant text size throughout the document
    main_size = get_main_size(doc)
    main_font = get_main_font(doc)

    return (toc, pheight, pframe, main_size, main_font)


# Function returning the main text's font of a document
def get_main_size(doc):
    font_sizes = []
    for page in doc:
        text_dict = page.get_text("dict")
        blocks = text_dict["blocks"]
        for block in blocks:
            for line in block.get("lines", []):
                for span in line.get("spans", []):
                    if not re.match(r'[\s\t]+', span["text"]): font_sizes.append(span["size"])
    size_count = Counter(font_sizes)
    dominant_size = size_count.most_common(2)
    if (dominant_size[1][0] > dominant_size[0][0] and dominant_size[1][1] > dominant_size[0][0]/2):
        main_size = round(dominant_size[1][0]) 
    else:
        main_size = round(dominant_size[0][0]) 

    return main_size


def get_main_font(doc):
    fonts = []
    for page in doc:
        text_dict = page.get_text("dict")
        blocks = text_dict["blocks"]
        for block in blocks:
            for line in block.get("lines", []):
                for span in line.get("spans", []):
                    if not re.match(r'[\s\t]+', span["text"]): fonts.append(span["font"])
    font_count = Counter(fonts)
    dominant_font = font_count.most_common(1)

    return dominant_font


def tocL2tocD(toc_list):
    root = {}
    stack = [(0, root)]  # stack of (level, current_dict)

    for level, title, startpage, endpage in toc_list:
        current_dict = {}
        while stack and level <= stack[-1][0]:
            stack.pop()
        stack[-1][1][(level, title, startpage, endpage)] = {"_page": (startpage, endpage), "_sub": current_dict}
        stack.append((level, current_dict))

    def cleanup(d):
        return {
            k: cleanup(v["_sub"]) if v["_sub"] else {"_page": v["_page"]}
            for k, v in d.items()
        }

    return cleanup(root)


def get_parent(secname, toc_dict, parent=None):
    for key, value in toc_dict.items():
        if key == secname:
            return parent
            
        if isinstance(value, dict):
            child_dict = value
            result = get_parent(secname, child_dict, key)
            if result:
                return result
    return None


def get_ancestry(target, d, path=None):
    if path is None:
        path = []

    for key, value in d.items():
        current_path = path + [key]

        if key == target:
            return current_path

        # Check if value is a dict and has further nested items
        if isinstance(value, dict):
            child_dict = value if "_page" not in value else {}
            result = get_ancestry(target, child_dict, current_path)
            if result:
                return result

    return None  # Target not found

In [139]:
# Function extracting and structuring the text from PDF
def text_extract(doc, sections):
    
    toc, pheight, pframe, main_size, main_font = info_extract(doc)
    toc_dict = tocL2tocD(toc)

    main_text = {}

    print(main_size, main_font)

    for section in sections:
        start_page = section[2]
        end_page = section[3]
        level = section[0]
        parent = get_parent(section, toc_dict)

        
        parents_siblings_sections = [sec for sec in toc[toc.index(section):] if (sec[0]<level or 
                                                                                 (sec[0]==level and get_parent(sec, toc_dict) == parent))]
        next_section = parents_siblings_sections[parents_siblings_sections.index(section)+1]

        print(parents_siblings_sections)
        if (parent in sections) and (parent is not None):
            continue

        key = ""
        value = ""
        pblocks = []

        for i in range(start_page, end_page+1):
            page = doc.load_page(i-1)
            pblocks.append(page.get_text("dict")["blocks"])

        split_title = False
        
        for blocks in pblocks:
            bflag = False
            for block in blocks:
                if bflag: break
                # Removing header and footer blocks
                if block["bbox"][1] > pframe and block["bbox"][3] < pheight-pframe:
                    for line in block.get("lines", []):
                        span = {'text': '', 'font': '', 'size': 0}
                        for s in line.get("spans", []):
                            if not re.match(r'[\s\t]+', s["text"]):
                                if span["text"] == '': 
                                    span["text"] = s["text"]
                                    span["font"] = s["font"]
                                    span["size"] = s["size"]
                                else: 
                                    if s["font"] == span["font"] and s["size"] == span["size"]:
                                        span["text"] = span["text"] + ' ' + s["text"]                                      
                                    else:
                                        break

                        #span["text"] = span["text"].replace('\xa0', '').replace('\u2003', ' ')
                        # Excluding figure and table captions
                        if (re.match(r'Fig(ure)?\.(\s)?(\d+)?(\w+)?(\s+)?:', span["text"]) or 
                            re.match(r'Table(\s)?(\d+)?(\w+)?(\s+)?:', span["text"])):
                            bflag = True
                            break
                        # Check if the content if the line corresponds to the title
                        elif (re.match(rf"((chapter)?(part)?(\d+\.\s?)+)?{section[1].lower().strip()}", f"{span["text"].lower().strip('.:')}") # Check if line text corresponds to section title
                              and (span["font"] != main_font
                                   or span["size"] != main_size) # Check if line font is different from main text font
                              and block.get("lines", []).index(line) < 4
                              and key == ''  # Check if key is empty
                             ):
                            key = section[1]
                            value = ""
                                 
                            continue
                        # In case the title spans multiple lines in the text, check if the whole title can be found in first line
                        elif (section[1].lower().strip().startswith(span["text"].lower()) # Check if section name begins with line text
                              and len(span["text"].lower().split()) > 1  # Avoid single words like 'the' to be taken as false positive
                              and len(section[1].split()) > 2  # Apply the above condition only if the section title is longer than 2 words
                              and (span["font"] != main_font
                                   or span["size"] != main_size) # Check if line font is different from main text font                              and block.get("lines", []).index(line) < 4
                              and key == '' # Check if key is empty
                             ):
                            key = section[1]
                            value = ""
                            split_title = True
                            title = span["text"].lower().strip()
                            continue
                        # Avoid încluding the following title lines to be taken as main text
                        elif split_title:
                            title = title + ' ' + span["text"].lower().strip()
                            if title == section[1].lower().strip():
                                split_title = False
                                continue
                            else:
                                continue

                        # Stop adding text to the dictionary when encountering the next section
                        elif (any([sec[1].lower().strip().startswith(span["text"].lower().strip('.:'))   # Check if line starts with the name 
                                   for sec in parents_siblings_sections[parents_siblings_sections.index(section)+1:]]) # of a following same-level section
                              and len(span["text"].lower().split()) > 2 # Avoid single words like 'the' to be taken as false positive
                             ):
                            if (key != "" and value != ""):
                                bflag = True
                                break
                            else:
                                bflag = True
                                break
                        # Same as above, but in the case where the section title 
                        elif (any([span["text"].lower().strip('.:').startswith(sec[1].lower().strip()) 
                                   for sec in parents_siblings_sections[parents_siblings_sections.index(section)+1:]])
                              and (span["font"] != main_font
                                   or span["size"] != main_size) # Check if line font is different from main text font
                             ):
                            if (key != "" and value != ""):
                                bflag = True
                                break
                            else:
                                bflag = True
                                break

                        # Get the block's text as dict value based on the font size
                        for span in line.get("spans", []):
                            if span["text"].lower().strip() in ("references", "bibliography"):
                                pflag = True
                                break
                            # Introducing a tolerance of font size of 0.5 for small variations in the text body
                            if (round(span["size"]) == main_size and 
                                not any([sec[1].lower() == "materials and methods" for sec in get_ancestry(section, toc_dict)])  
                                and key != ""):
                                text = span["text"]
                                # Repairing lines
                                if (len(value) > 1 and (value[-1] == "-" or value[-1] == "ﬁ" or value[-1] == 'ﬂ') 
                                    or (text == 'ﬁ' or text == 'ﬂ')):
                                    if value[-1] == '-': value = value[:-1] 
                                    value = value + text
                                else: 
                                    value = value + ' ' + text 
                            # Text in an article "Materials and Methods" section can have a smaller font size
                            elif (round(span["size"]) >= main_size-1 and round(span["size"]) <= main_size
                                and any([sec[1].lower() == "materials and methods" for sec in get_ancestry(section, toc_dict)])):
                                text = span["text"]
                                # Repairing lines
                                if (len(value) > 1 and (value[-1] == "-" or value[-1] == "ﬁ" or value[-1] == 'ﬂ') 
                                    or (text == 'ﬁ' or text == 'ﬂ')):
                                    value = value + text   
                                else: 
                                    value = value + ' ' + text 
                                
                if (key != "" and value != ""):
                    # Removing references
                    value = re.sub(r'(;\s)?\[\s(\d(\s,\s+)?)+\s\]', '', value.strip())
                    value = re.sub(r'(;\s)?\[\s\d+\s–\s\d+\s\]', '', value.strip())
                    # Formatting spaces surrounding commas, dots, and parentheses
                    value = re.sub(r'\s,\s', ', ', value.strip())
                    value = re.sub(r'\s\.\s', '. ', value.strip())
                    value = re.sub(r'\(\s', '(', value.strip())
                    value = re.sub(r'\s\)', ')', value.strip())
                    # Removing multiple spaces (strip method fails)
                    value = re.sub(r'\s+', ' ', value.strip())
                    # Replacing the 'ﬁ' and 'ﬂ' characters with correct "fi" string
                    value = re.sub(r'ﬁ', 'fi', value.strip())
                    value = re.sub(r'ﬂ', 'fl', value.strip())
                    value = value.replace('\xad ', '').replace('\xa0', '').replace('\u2003', ' ')
                
                    main_text[key] = value
        
    return main_text

In [124]:
doc = pymupdf.open("/home/diomir0/Documents/books/comp_sci/Eldad Eilam - Reversing_ secrets of reverse engineering-Wiley (2005).pdf")

In [125]:
toc = get_toc(doc)
[print(i, sec) for i, sec in enumerate(toc)]

0 (1, 'Cover', 1, 9)
1 (1, 'Foreword', 9, 15)
2 (1, 'Contents', 15, 25)
3 (1, 'Introduction', 25, 31)
4 (2, 'Reverse Engineering and Low-Level Software', 26, 27)
5 (2, 'How This Book Is Organized', 27, 29)
6 (2, 'Who Should Read this Book', 29, 29)
7 (2, 'Tools and Platforms', 29, 30)
8 (2, 'What’s on the Web Site', 30, 30)
9 (2, 'Where to Go from Here?', 30, 31)
10 (1, 'Part 1. Reversing 101', 31, 169)
11 (2, 'Foundations', 33, 55)
12 (3, 'What Is Reverse Engineering?', 33, 34)
13 (3, 'Software Reverse Engineering: Reversing', 34, 34)
14 (3, 'Reversing Applications', 34, 39)
15 (3, 'Low-Level Software', 39, 43)
16 (3, 'The Reversing Process', 43, 44)
17 (3, 'The Tools', 44, 47)
18 (3, 'Is Reversing Legal?', 47, 53)
19 (3, 'Code Samples & Tools', 53, 53)
20 (3, 'Conclusion', 53, 55)
21 (2, 'Low-Level Software', 55, 99)
22 (3, 'High-Level Perspectives', 56, 67)
23 (3, 'Low-Level Perspectives', 67, 74)
24 (3, 'Assembly Language 101', 74, 83)
25 (3, 'A Primer on Compilers and Compilation'

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,

In [140]:
sections = [toc[11]]
#print(info_extract(doc))
text_extract(doc, sections)

10 [('Palatino-Roman', 17377)]
[(2, 'Foundations', 33, 55), (2, 'Low-Level Software', 55, 99), (2, 'Windows Fundamentals', 99, 139), (2, 'Reversing Tools', 139, 169), (1, 'Part 2. Applied Reversing', 169, 337), (1, 'Part 3. Cracking', 337, 451), (1, 'Part 4. Beyond Disassembly', 451, 509), (1, 'Appendices', 509, 591), (1, 'Index', 591, 619)]


{'Foundations': 'C HAPTE R 1 cases, the information is owned by someone who isn’t willing to share them. In other cases, the information has been lost or destroyed. Traditionally, reverse engineering has been about taking shrink-wrapped products and physically dissecting them to uncover the secrets of their design. Such secrets were then typically used to make similar or better products. In many industries, reverse engineering involves examining the product under a microscope or taking it apart and figuring out what each piece does. Not too long ago, reverse engineering was actually a fairly popular hobby, practiced by a large number of people (even if it wasn’t referred to as reverse engineering). Remember how in the early days of modern electronics, many people were so amazed by modern appliances such as the radio and television set that it became common practice to take them apart and see what goes on inside? That was reverse engineering. Of course, advances in the electronics indus

In [117]:
page = doc.load_page(32)
for block in page.get_text("dict")["blocks"]:
    for line in block.get("lines", []):
        for spans in line.get("spans", []):
            if not re.match(r'[\s\t]+', spans["text"]): print(spans)

{'size': 17.93280029296875, 'flags': 20, 'bidi': 0, 'char_flags': 24, 'font': 'Times-Bold', 'color': 2301728, 'alpha': 255, 'ascender': 0.6990000009536743, 'descender': -0.20499999821186066, 'text': '3', 'origin': (82.91999816894531, 101.8800048828125), 'bbox': (82.91999816894531, 88.01382446289062, 91.88639831542969, 105.94662475585938)}
{'size': 17.93280029296875, 'flags': 20, 'bidi': 0, 'char_flags': 24, 'font': 'Times-Bold', 'color': 2301728, 'alpha': 255, 'ascender': 0.6990000009536743, 'descender': -0.20499999821186066, 'text': 'Growth of Functions', 'origin': (148.67958068847656, 101.8800048828125), 'bbox': (148.67958068847656, 88.01382446289062, 307.5947265625, 105.94662475585938)}
{'size': 10.909099578857422, 'flags': 4, 'bidi': 0, 'char_flags': 16, 'font': 'Times-Roman', 'color': 2301728, 'alpha': 255, 'ascender': 0.6990000009536743, 'descender': -0.21699999272823334, 'text': 'The order of growth of the running time of an algorithm, deﬁned in Chapter 2,', 'origin': (148.67999