In [104]:
import os
import re
from collections import Counter
import json
import pymupdf


def get_toc(doc):
    toc = doc.get_toc()  # format: [level, title, page]
    toc_with_end = []

    for i, (level, title, start_page) in enumerate(toc):
        # Look ahead for the next section at same or higher level
        end_page = doc.page_count  # default: end of document

        for j in range(i + 1, len(toc)):
            next_level, _, next_start = toc[j]
            if next_level <= level:
                end_page = toc[j][2]
                break
        title = re.sub(r'(\\u200[0-9a])+', ' ', re.sub(r'(\\xa[d0])+', '',title))
        toc_with_end.append(
            (level,
            title,
            start_page,
            end_page)
        )

    return toc_with_end
    

# Getting the info from PDF
def info_extract(doc):
    # Get Table of Contents
    toc = get_toc(doc)
    sec_names = [toc[i][1].lower() for i in range(len(toc))]
    
    # Extracting the sections 
    sections = []
    for sec in toc:
        if sec[0]==2:
            sections.append(sec[1].lower()) 

    # Defining page height from first page
    pheight = doc[0].rect.height
    # Defining frame height 
    pframe = 50
    
    # Computing dominant text size throughout the document
    main_size = get_main_size(doc)
    main_font = get_main_font(doc)

    return (toc, pheight, pframe, main_size, main_font)


# Function returning the main text's font of a document
def get_main_size(doc):
    font_sizes = []
    for page in doc:
        text_dict = page.get_text("dict")
        blocks = text_dict["blocks"]
        for block in blocks:
            for line in block.get("lines", []):
                for span in line.get("spans", []):
                    if not re.match(r'[\s\t]+', span["text"]): font_sizes.append(round(span["size"]))
    size_count = Counter(font_sizes)
    dominant_size = size_count.most_common(2)
    if (dominant_size[1][0] > dominant_size[0][0] and dominant_size[1][1] > dominant_size[0][0]/2):
        main_size = round(dominant_size[1][0]) 
    else:
        main_size = round(dominant_size[0][0]) 

    return main_size


def get_main_font(doc):
    fonts = []
    for page in doc:
        text_dict = page.get_text("dict")
        blocks = text_dict["blocks"]
        for block in blocks:
            for line in block.get("lines", []):
                for span in line.get("spans", []):
                    if not re.match(r'[\s\t]+', span["text"]): fonts.append(span["font"])
    font_count = Counter(fonts)
    dominant_font = font_count.most_common(1)

    return dominant_font


def tocL2tocD(toc_list):
    root = {}
    stack = [(0, root)]  # stack of (level, current_dict)

    for level, title, startpage, endpage in toc_list:
        current_dict = {}
        while stack and level <= stack[-1][0]:
            stack.pop()
        stack[-1][1][(level, title, startpage, endpage)] = {"_page": (startpage, endpage), "_sub": current_dict}
        stack.append((level, current_dict))

    def cleanup(d):
        return {
            k: cleanup(v["_sub"]) if v["_sub"] else {"_page": v["_page"]}
            for k, v in d.items()
        }

    return cleanup(root)


def get_parent(secname, toc_dict, parent=None):
    for key, value in toc_dict.items():
        if key == secname:
            return parent
            
        if isinstance(value, dict):
            child_dict = value
            result = get_parent(secname, child_dict, key)
            if result:
                return result
    return None


def get_ancestry(target, d, path=None):
    if path is None:
        path = []

    for key, value in d.items():
        current_path = path + [key]

        if key == target:
            return current_path

        # Check if value is a dict and has further nested items
        if isinstance(value, dict):
            child_dict = value if "_page" not in value else {}
            result = get_ancestry(target, child_dict, current_path)
            if result:
                return result

    return None  # Target not found


def get_span(line):
    spans = []
    span = {'text': '', 'font': '', 'size': 0}
    for s in line.get("spans", []):
        if not re.match(r'[\s\t]+', s["text"]):
            s["text"] = re.sub(r'(\\u200[0-9a])+', ' ', re.sub(r'(\\xa[d0])+', '',s["text"]))
            if span["text"] == '': 
                span["text"] = s["text"]
                span["font"] = s["font"]
                span["size"] = s["size"]
            else: 
                if (s["font"] == span["font"] and 
                    (s["size"] == span["size"] or (s["size"] < span["size"] and len(s["text"]) <= 3))):
                    span["text"] = span["text"] + ' ' + s["text"] 
                    span["size"] = s["size"]
                elif (line.get("spans", [])[line.get("spans", []).index(s)-1] != ' ' and 
                      span["font"] == s["font"] and 
                      s["size"] < span["size"]):
                    span["text"] = span["text"] + s["text"]
                    span["size"] = s["size"]
                else:
                    break
    span["text"] = re.sub(r'\s+([.,:?!])', r'\1', span["text"])
    return span


def detect_captions(span):
    if (re.match(r'Fig(ure)?\.(\s)?(\d+)?(\w+)?(\s+)?:', span["text"]) or 
        re.match(r'Table(\s)?(\d+)?(\w+)?(\s+)?:', span["text"])):
        return True
    else:
        return None


In [101]:
# Function extracting and structuring the text from PDF
def text_extract(doc, sections):
    
    toc, pheight, pframe, main_size, main_font = info_extract(doc)
    toc_dict = tocL2tocD(toc)

    main_text = {}

    for section in sections:
        start_page = section[2]
        end_page = section[3]
        level = section[0]
        parent = get_parent(section, toc_dict)
        title_format = {'font': main_font, 'size': main_size}
        
        parents_siblings_sections = [sec for sec in toc[toc.index(section):] if (sec[0]<level or 
                                                                                 (sec[0]==level and get_parent(sec, toc_dict) == parent))]
        next_section = parents_siblings_sections[parents_siblings_sections.index(section)+1]

        if (parent in sections) and (parent is not None):
            continue

        key = ""
        value = ""
        pblocks = []

        for i in range(start_page, end_page+1):
            page = doc.load_page(i-1)
            pblocks.append(page.get_text("dict")["blocks"])

        split_title = False

        stop = False
        
        for blocks in pblocks:
            if stop: break
            bflag = False
            for block in blocks:
                if bflag or stop: break  
                # Removing header and footer blocks
                if block["bbox"][1] > pframe and block["bbox"][3] < pheight-pframe:
                    for line in block.get("lines", []):
                        span = get_span(line)
                        print(span)

                        if detect_captions(span): 
                            bflag = True
                            break
                        
                        # Detect titles
                        if (span["font"] != main_font or span["size"] > main_size
                            and block.get("lines", []).index(line) < 4):

                            if (re.search(rf"{re.escape(span['text'].lower().strip(',.:?!'))}", section[1].lower().strip(',.:?!')) or
                                re.search(rf"{re.escape(section[1].lower().strip(',.:?!'))}", span['text'].lower().strip(',.:?!')) and
                                key == ''
                               ):
                                key = section[1]
                                value = ""
                                continue
                            
                        # Check if the content if the line corresponds to the title
                        if (re.match(rf"((chapter)?(part)?(\d+\.\s?)+)?{section[1].lower().strip()}", f"{span["text"].lower().strip('.:')}") # Check if line text corresponds to section title
                              and key == ''  # Check if key is empty
                             ):
                            key = section[1]
                            value = ""
                            title_format = {'font': span["font"], 'size': round(span["size"])}
                            continue
                                 
                        # In case the title spans multiple lines in the text, check if the whole title can be found in first line
                        elif (re.search(rf"{re.escape(span['text'].lower())}", section[1].lower().strip()) # Check if section name begins with line text
                              #and len(span["text"].lower().split()) > 1  # Avoid single words like 'the' to be taken as false positive
                              #and len(section[1].split()) > 2  # Apply the above condition only if the section title is longer than 2 words
                              and key == '' # Check if key is empty
                             ):
                            key = section[1]
                            value = ""
                            split_title = True
                            title = span["text"].lower().strip()
                            continue
                        
                        # Avoid încluding the following title lines to be taken as main text
                        elif split_title:
                            title = title + ' ' + span["text"].lower().strip()
                            if title == section[1].lower().strip():
                                title_format = {'font': span["font"], 'size': round(span["size"])}
                                split_title = False
                                continue
                            else:
                                continue
                        
                        # Stop adding text to the dictionary when encountering the next same-level section
                        if ((next_section[1].lower().strip().startswith(span["text"].lower().strip('.:'))
                             and span["size"] > main_size
                             and span["font"] == title_format["font"]
                             and len(span["text"].lower().split()) >= 1
                             and block.get("lines", []).index(line) < 4) # Avoid single words like 'the' to be taken as false positive
                            or 
                            (span["text"].lower().strip('.:').startswith(next_section[1].lower().strip())
                                and ((span["font"] == title_format["font"] and span["size"] == title_format["size"]) or
                                     (round(span["size"]) > title_format["size"]))
                             and block.get("lines", []).index(line) < 4)
                           ):
                            stop = True
                            break
                        
                        # Get the block's text as dict value based on the font size
                        for span in line.get("spans", []):
                            if (span["text"].lower().strip() in ("references", "bibliography") and
                                (round(span["size"]) > main_size)):
                                stop = True
                                break
                                
                            # Introducing a tolerance of font size of 0.5 for small variations in the text body
                            if ((#round(span["size"]) == main_size and 
                                not any([sec[1].lower() == "materials and methods" for sec in get_ancestry(section, toc_dict)])  
                                and key != "")
                                or(round(span["size"]) >= main_size-1 and round(span["size"]) <= main_size
                                   and any([sec[1].lower() == "materials and methods" for sec in get_ancestry(section, toc_dict)]))
                               ):
                                text = span["text"]
                                # Repairing lines
                                if (len(value) > 1 and (value[-1] == "-" or value[-1] == "ﬁ" or value[-1] == 'ﬂ') 
                                    or (text == 'ﬁ' or text == 'ﬂ')):
                                    if value[-1] == '-': value = value[:-1] 
                                    value = value + text
                                else: 
                                    value = value + ' ' + text 
                            
                if (key != "" and value != ""):
                    value = re.sub(r'(\\u200[0-9a])+', ' ', re.sub(r'(\\xa[d0])+', '',value))
                    # Removing references
                    value = re.sub(r'(;\s)?\[\s(\d(\s,\s+)?)+\s\]', '', value.strip())
                    value = re.sub(r'(;\s)?\[\s\d+\s–\s\d+\s\]', '', value.strip())
                    # Formatting spaces surrounding commas, dots, and parentheses
                    value = re.sub(r'\s+([,.!?:])', r'\1', value.strip())
                    value = re.sub(r'\(\s', '(', value.strip())
                    value = re.sub(r'\s\)', ')', value.strip())
                    # Removing multiple spaces (strip method fails)
                    value = re.sub(r'\s+', ' ', value.strip())
                    # Replacing the 'ﬁ' and 'ﬂ' characters with correct "fi" string
                    value = re.sub(r'ﬁ', 'fi', value.strip())
                    value = re.sub(r'ﬂ', 'fl', value.strip())
                    value = value.replace('\xad ', '').replace('\xa0', '').replace('\u2003', ' ')
                
                    main_text[key] = value
        
    return main_text

In [80]:
doc = pymupdf.open("/home/diomir0/Documents/books/med_sci/Anne Harrington - Mind Fixers - Psychiatry’s Troubled Search for the Biology of Mental Illness (2019).epub")

In [7]:
toc = get_toc(doc)
[print(i, sec) for i, sec in enumerate(toc)]

0 (1, 'Cover', 1, 2)
1 (1, 'Title', 2, 4)
2 (1, 'Contents', 4, 6)
3 (1, 'Introduction: Our Biological Enthusiasms', 6, 14)
4 (1, 'Part I: Doctors’ Stories', 14, 148)
5 (2, 'Chapter 1: Betting On Anatomy', 15, 45)
6 (2, 'Chapter 2: Biology In Disarray', 45, 87)
7 (2, 'Chapter 3: A Fragile Freudian Triumph', 87, 119)
8 (2, 'Chapter 4: Crisis And Revolt', 119, 148)
9 (1, 'Part II: Disease Stories', 148, 252)
10 (2, 'Chapter 5: Schizophrenia', 149, 192)
11 (2, 'Chapter 6: Depression', 192, 225)
12 (2, 'Chapter 7: Manic-Depression', 225, 252)
13 (1, 'Part III: Unfinished Stories', 252, 276)
14 (2, 'Chapter 8: False Dawn', 253, 276)
15 (1, 'Afterthoughts', 276, 282)
16 (1, 'Notes', 282, 352)
17 (1, 'Guide to Further Reading', 352, 363)
18 (1, 'Index', 363, 409)
19 (1, 'Acknowledgments', 409, 411)
20 (1, 'Also by Anne Harrington', 411, 412)
21 (1, 'Copyright', 412, 412)


[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None]

In [102]:
sections = [toc[3]]
#print(info_extract(doc))
text_extract(doc, sections)

{'text': 'INTRODUCTION', 'font': 'Nimbus Sans Regular', 'size': 16.5}
{'text': 'Our Biological Enthusiasms', 'font': 'Nimbus Sans Bold', 'size': 22.0}
{'text': 'IN 1978 THE HISTORIAN AND SOCIAL CRITIC MARTIN GROSS APPEARED', 'font': 'Nimbus Sans Regular', 'size': 8.25}
{'text': 'television show ', 'font': 'Charis SIL Regular', 'size': 11.0}
{'text': 'Psychological Society', 'font': 'Charis SIL Italic', 'size': 11.0}
{'text': 'with psychoanalytic practices, he prophesied that “within the next', 'font': 'Charis SIL Regular', 'size': 11.0}
{'text': 'ten years...\xa0psychiatry will come out of the dark ages, will drop all', 'font': 'Charis SIL Regular', 'size': 11.0}
{'text': 'the nonsense and expertise about human behavior of which they', 'font': 'Charis SIL Regular', 'size': 11.0}
{'text': 'know absolutely nothing that my grandmother didn’t know, if they', 'font': 'Charis SIL Regular', 'size': 11.0}
{'text': 'know that much, and will turn to medicine.” His interlocutors were', 'font': 'C

{}

In [None]:
page = doc.load_page(5)
for i, block in enumerate(page.get_text("dict")["blocks"]):
    print(i)
    for line in block.get("lines", []):
        for spans in line.get("spans", []):
            if not re.match(r'[\s\t]+', spans["text"]): print(spans)

In [103]:
test = "Blue boo: here's it"
re.search(r'boo', test)

<re.Match object; span=(5, 8), match='boo'>