In [1]:
import os
import re
from collections import Counter
import json
import pymupdf


def get_toc(doc):
    toc = doc.get_toc()  # format: [level, title, page]
    toc_with_end = []

    for i, (level, title, start_page) in enumerate(toc):
        # Look ahead for the next section at same or higher level
        end_page = doc.page_count  # default: end of document

        for j in range(i + 1, len(toc)):
            next_level, _, next_start = toc[j]
            if next_level <= level:
                end_page = toc[j][2]
                break
        title = re.sub(r'(\\u200[0-9a])+', ' ', re.sub(r'(\\xa[d0])+', '',title))
        toc_with_end.append(
            (level,
            title,
            start_page,
            end_page)
        )

    return toc_with_end
    

# Getting the info from PDF
def info_extract(doc):
    # Get Table of Contents
    toc = get_toc(doc)
    sec_names = [toc[i][1].lower() for i in range(len(toc))]
    
    # Extracting the sections 
    sections = []
    for sec in toc:
        if sec[0]==2:
            sections.append(sec[1].lower()) 

    # Defining page height from first page
    pheight = doc[0].rect.height
    
    # Defining frame height 
    pframe = 50
    
    # Computing dominant text size throughout the document
    main_size = get_main_size(doc)
    main_font = get_main_font(doc)

    return (toc, pheight, pframe, main_size, main_font)


# Function returning the main text's font of a document
def get_main_size(doc):
    font_sizes = []
    for page in doc:
        text_dict = page.get_text("dict")
        blocks = text_dict["blocks"]
        for block in blocks:
            for line in block.get("lines", []):
                for span in line.get("spans", []):
                    if not re.match(r'[\s\t]+', span["text"]): font_sizes.append(round(span["size"]))
    size_count = Counter(font_sizes)
    dominant_size = size_count.most_common(2)
    if (dominant_size[1][0] > dominant_size[0][0] and dominant_size[1][1] > dominant_size[0][0]/2):
        main_size = round(dominant_size[1][0]) 
    else:
        main_size = round(dominant_size[0][0]) 

    return main_size


def get_main_font(doc):
    fonts = []
    for page in doc:
        text_dict = page.get_text("dict")
        blocks = text_dict["blocks"]
        for block in blocks:
            for line in block.get("lines", []):
                for span in line.get("spans", []):
                    if not re.match(r'[\s\t]+', span["text"]): fonts.append(span["font"])
    font_count = Counter(fonts)
    dominant_font = font_count.most_common(1)[0]
    if type(dominant_font) == tuple:
        dominant_font = dominant_font[0]
    
    return dominant_font


def tocL2tocD(toc_list):
    root = {}
    stack = [(0, root)]  # stack of (level, current_dict)

    for level, title, startpage, endpage in toc_list:
        current_dict = {}
        while stack and level <= stack[-1][0]:
            stack.pop()
        stack[-1][1][(level, title, startpage, endpage)] = {"_page": (startpage, endpage), "_sub": current_dict}
        stack.append((level, current_dict))

    def cleanup(d):
        return {
            k: cleanup(v["_sub"]) if v["_sub"] else {"_page": v["_page"]}
            for k, v in d.items()
        }

    return cleanup(root)


def get_parent(secname, toc_dict, parent=None):
    for key, value in toc_dict.items():
        if key == secname:
            return parent
            
        if isinstance(value, dict):
            child_dict = value
            result = get_parent(secname, child_dict, key)
            if result:
                return result
    return None


def get_ancestry(target, d, path=None):
    if path is None:
        path = []

    for key, value in d.items():
        current_path = path + [key]

        if key == target:
            return current_path

        # Check if value is a dict and has further nested items
        if isinstance(value, dict):
            child_dict = value if "_page" not in value else {}
            result = get_ancestry(target, child_dict, current_path)
            if result:
                return result

    return None  # Target not found


def get_span(line):
    spans = []
    span = {}
    for s in line.get("spans", []):
        s["font"] = re.sub(r'\+.*', '', s["font"])
        if not re.match(r'[\s\t]+$', s["text"]):
            s["text"] = re.sub(r'.*(\\u200[0-9a])+.*', ' ', re.sub(r'.*(\\xa[d0])+.*', '',s["text"]))
            if span == {}: 
                span = {'text': s["text"], 
                        'font': s["font"], 
                        'size': s["size"]}
            else: 
                if re.match(r"[A-Z]$", span["text"].strip(' ,.:?!')): 
                    span["text"] = span["text"] + s["text"] 
                    span["size"] = s["size"]  
                    span["font"] = s["font"]
                    
                elif span["font"] != s["font"] or span["size"] != s["size"]: 
                    if re.match(r"[A-Z]$", s["text"]):
                        span["text"] = span["text"] + ' ' + s["text"]
                    else:
                        spans.append(span.copy())
                        span = {'text': s["text"], 
                                'font': s["font"], 
                                'size': s["size"]}

                elif (re.match(r"(\s)?[ﬁﬂ—](\s)?$", s["text"]) 
                      or re.match(r"[ﬁﬂ—]$", span["text"][-1])
                     ):
                    span["text"] = span["text"] + s["text"]
                    
                elif (span["font"] == s["font"] and
                      span["size"] == s["size"]
                     ):
                    if (' ' in (s["text"][0], span["text"][-1]) 
                        or (len(span["text"]) > 2 and re.match(r"^.\s[A-Z]$", span["text"][-3:]))
                       ):
                        span["text"] = span["text"] + s["text"] 
                    else:
                        span["text"] = span["text"] + ' ' + s["text"] 
                else:
                    continue
    if span != {}: spans.append(span)
    for span in spans: 
        span["text"] = span["text"].strip()
        span["text"] = re.sub(r'\s+([.,:?!])', r'\1', span["text"])
        span["text"] = re.sub(r'(\\u200[0-9a])+', ' ', span["text"])
        # Removing references
        span["text"] = re.sub(r'(;\s)?\[\s(\d(\s,\s+)?)+\s\]', '', span["text"])
        span["text"] = re.sub(r'(;\s)?\[\s\d+\s–\s\d+\s\]', '', span["text"])
        # Formatting spaces surrounding commas, dots, and parentheses
        span["text"] = re.sub(r'\(\s', '(', span["text"])
        span["text"] = re.sub(r'\s\)', ')', span["text"])
        # Removing multiple spaces (strip method fails)
        span["text"] = re.sub(r'\s+', ' ', span["text"])
        # Replacing the 'ﬁ' and 'ﬂ' characters with correct "fi" string
        span["text"] = re.sub(r'ﬁ', 'fi', span["text"])
        span["text"] = re.sub(r'ﬂ', 'fl', span["text"])
    return spans
                

In [2]:
# Function extracting and structuring the text from PDF
def text_extract(doc, sections):
    
    toc, pheight, pframe, main_size, main_font = info_extract(doc)
    toc_dict = tocL2tocD(toc)

    main_text = {}

    for section in sections:
        start_page = section[2]
        end_page = section[3]
        level = section[0]
        parent = get_parent(section, toc_dict)
        title_format = {'font': main_font, 'size': main_size}
        
        parents_siblings_sections = [sec for sec in toc[toc.index(section):] if (sec[0]<level or 
                                                                                 (sec[0]==level and get_parent(sec, toc_dict) == parent))]
        next_section = parents_siblings_sections[parents_siblings_sections.index(section)+1]

        if (parent in sections) and (parent is not None):
            continue

        key = ""
        value = ""
        
        pblocks = []
        for i in range(start_page, end_page+1):
            page = doc.load_page(i-1)
            pblocks.append(page.get_text("dict")["blocks"])

        skip = False
        stop = False
        for blocks in pblocks:
            if stop: break
            skip = False
            for block in blocks:
                if skip or stop: break  
                # Removing header and footer blocks
                if (("pdf" in doc.metadata["format"].lower() and block["bbox"][1] > pframe and block["bbox"][3] < pheight-pframe) or
                    "epub" in doc.metadata["format"].lower()):
                    for line in block.get("lines", []):
                        inline_title = False
                        spans = get_span(line)
                        # Check if line span is empty
                        if len(spans) > 0: 
                                                                                    
                            # Detect captions
                            if (blocks[blocks.index(block)-1]['type'] == 1
                                and (spans[0]["font"] != main_font or spans[0]["size"] < main_size)
                               ): 
                                break
                            
                            # Detect titles
                            if (spans[0]["font"] != main_font or spans[0]["size"] != main_size
                                and block.get("lines", []).index(line) < 4):
                                
                                # Detect current section's title
                                if ((re.search(rf"{re.escape(spans[0]['text'].lower().strip(',.:?!'))}", section[1].lower().strip(',.:?!')) 
                                     or re.search(rf"{re.escape(section[1].lower().strip(',.:?!'))}", spans[0]['text'].lower().strip(',.:?!')))
                                    and key == ''
                                   ):
                                    key = section[1]
                                    value = ''
                                    if len(spans) <= 1: continue
                                    else: inline_title = True
                                        
                                # Detect next section's title
                                elif ((re.search(rf"{re.escape(spans[0]['text'].lower().strip(',.:?!'))}", next_section[1].lower().strip(',.:?!')) or 
                                       re.search(rf"{re.escape(next_section[1].lower().strip(',.:?!'))}", spans[0]['text'].lower().strip(',.:?!')))
                                      and key != ''
                                     ):
                                    stop = True
                                    break
                                    
                                # Detect bibliogrqphy or references section (in case they are not in toc)
                                elif spans[0]["text"].lower().strip() in ("references", "bibliography"):
                                    stop = True
                                    break
                                
                                # Remove footer blocks from the text
                                elif (re.match(r'\s?\d+\s?$', spans[0]["text"]) 
                                      and [span["size"] < main_size for span in spans]):
                                    skip = True
                                    break

                            #else:
                            if key != '' and spans[0]["text"].lower() not in key.lower():
                                if inline_title: spans = spans[1:]
                                for span in spans: 
                                    if re.match(r'\s?\d+\s?$', span["text"]) and span["size"] < main_size:
                                        continue
                                    elif len(value) > 1 and re.match(r'-', value[-1]): 
                                        value = value + span["text"]
                                    else: 
                                        value = value + ' ' + span["text"] 
                                
                if (key != "" and value != ""):
                    main_text[key] = re.sub(r'[\xad\xa0]\s', '', value)
        
    return main_text

In [3]:
doc = pymupdf.open("/home/diomir0/Documents/books/med_sci/Anne Harrington - Mind Fixers - Psychiatry’s Troubled Search for the Biology of Mental Illness (2019).epub")

In [4]:
toc = get_toc(doc)
[print(i, sec) for i, sec in enumerate(toc)]

0 (1, 'Cover', 1, 2)
1 (1, 'Title', 2, 4)
2 (1, 'Contents', 4, 6)
3 (1, 'Introduction: Our Biological Enthusiasms', 6, 14)
4 (1, 'Part I: Doctors’ Stories', 14, 148)
5 (2, 'Chapter 1: Betting On Anatomy', 15, 45)
6 (2, 'Chapter 2: Biology In Disarray', 45, 87)
7 (2, 'Chapter 3: A Fragile Freudian Triumph', 87, 119)
8 (2, 'Chapter 4: Crisis And Revolt', 119, 148)
9 (1, 'Part II: Disease Stories', 148, 252)
10 (2, 'Chapter 5: Schizophrenia', 149, 192)
11 (2, 'Chapter 6: Depression', 192, 225)
12 (2, 'Chapter 7: Manic-Depression', 225, 252)
13 (1, 'Part III: Unfinished Stories', 252, 276)
14 (2, 'Chapter 8: False Dawn', 253, 276)
15 (1, 'Afterthoughts', 276, 282)
16 (1, 'Notes', 282, 352)
17 (1, 'Guide to Further Reading', 352, 363)
18 (1, 'Index', 363, 409)
19 (1, 'Acknowledgments', 409, 411)
20 (1, 'Also by Anne Harrington', 411, 412)
21 (1, 'Copyright', 412, 412)


[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None]

In [100]:
sections = [toc[12]]
#print(info_extract(doc))
text_extract(doc, sections)

