In [1]:
import os
import re
from collections import Counter
import json
import pymupdf


def get_toc(doc):
    toc = doc.get_toc()  # format: [level, title, page]
    toc_with_end = []

    for i, (level, title, start_page) in enumerate(toc):
        # Look ahead for the next section at same or higher level
        end_page = doc.page_count  # default: end of document

        for j in range(i + 1, len(toc)):
            next_level, _, next_start = toc[j]
            if next_level <= level:
                end_page = toc[j][2]
                break
        title = re.sub(r'(\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a)+', ' ', 
                       re.sub(r'(\xad\xa0])+', '',re.sub(r'\r', '', title)))
        toc_with_end.append(
            (level,
            title,
            start_page,
            end_page)
        )

    return toc_with_end
    

# Getting the info from PDF
def info_extract(doc):
    # Get Table of Contents
    toc = get_toc(doc)
    sec_names = [toc[i][1].lower() for i in range(len(toc))]
    
    # Extracting the sections 
    sections = []
    for sec in toc:
        if sec[0]==2:
            sections.append(sec[1].lower()) 

    # Defining page height from first page
    pheight = doc[0].rect.height
    
    # Defining frame height 
    pframe = 50
    
    # Computing dominant text size throughout the document
    main_size = get_main_size(doc)
    main_font = get_main_font(doc)

    return (toc, pheight, pframe, main_size, main_font)


# Function returning the main text's font of a document
def get_main_size(doc):
    font_sizes = []
    for page in doc:
        text_dict = page.get_text("dict")
        blocks = text_dict["blocks"]
        for block in blocks:
            for line in block.get("lines", []):
                for span in line.get("spans", []):
                    if not re.match(r'[\s\t]+', span["text"]): font_sizes.append(round(span["size"]))
    size_count = Counter(font_sizes)
    dominant_size = size_count.most_common(2)
    if (dominant_size[1][0] > dominant_size[0][0] and dominant_size[1][1] > dominant_size[0][0]/2):
        main_size = round(dominant_size[1][0]) 
    else:
        main_size = round(dominant_size[0][0]) 

    return main_size


def get_main_font(doc):
    fonts = []
    for page in doc:
        text_dict = page.get_text("dict")
        blocks = text_dict["blocks"]
        for block in blocks:
            for line in block.get("lines", []):
                for span in line.get("spans", []):
                    if not re.match(r'[\s\t]+', span["text"]): fonts.append(span["font"])
    font_count = Counter(fonts)
    dominant_font = font_count.most_common(1)[0]
    if type(dominant_font) == tuple:
        dominant_font = dominant_font[0]
    
    return dominant_font


def tocL2tocD(toc_list):
    root = {}
    stack = [(0, root)]  # stack of (level, current_dict)

    for level, title, startpage, endpage in toc_list:
        current_dict = {}
        while stack and level <= stack[-1][0]:
            stack.pop()
        stack[-1][1][(level, title, startpage, endpage)] = {"_page": (startpage, endpage), "_sub": current_dict}
        stack.append((level, current_dict))

    def cleanup(d):
        return {
            k: cleanup(v["_sub"]) if v["_sub"] else {"_page": v["_page"]}
            for k, v in d.items()
        }

    return cleanup(root)


def get_parent(secname, toc_dict, parent=None):
    for key, value in toc_dict.items():
        if key == secname:
            return parent
            
        if isinstance(value, dict):
            child_dict = value
            result = get_parent(secname, child_dict, key)
            if result:
                return result
    return None


def get_children(section, toc):
    subsections=[]
    level = section[0]
    for sec in toc[toc.index(section)+1:]:
        if sec[0]>level:
            subsections.append(sec)
        elif sec[0]==level:
            break
    return subsections


def get_next_section(section, toc):
    level = section[0]
    for sec in toc[toc.index(section)+1:]:
        if sec[0]<=level:
            return sec
    return None


def get_ancestry(target, d, path=None):
    if path is None:
        path = []

    for key, value in d.items():
        current_path = path + [key]

        if key == target:
            return current_path

        # Check if value is a dict and has further nested items
        if isinstance(value, dict):
            child_dict = value if "_page" not in value else {}
            result = get_ancestry(target, child_dict, current_path).key()
            if result:
                return result

    return None  # Target not found


def get_span(line):
    spans = []
    span = {}
    for s in line.get("spans", []):
        s["font"] = re.sub(r'\+.*', '', s["font"])
        if not re.match(r'[\s\t]+$', s["text"]):
            s["text"] = re.sub(r'.*(\\u200[0-9a])+.*', ' ', re.sub(r'.*(\\xa[d0])+.*', '',s["text"]))
            if span == {}: 
                span = {'text': s["text"], 
                        'font': s["font"], 
                        'size': s["size"]}
            else: 
                if re.match(r"(([A-Z]+\s)+)?[A-Z]$", span["text"].strip(' ,.:?!')): 
                    span["text"] = span["text"] + s["text"] 
                    span["size"] = s["size"]  
                    span["font"] = s["font"]
                    
                elif span["font"] != s["font"] or span["size"] != s["size"]: 
                    if re.match(r"[A-Z]+$", s["text"]):
                        span["text"] = span["text"] + ' ' + s["text"]
                    else:
                        spans.append(span.copy())
                        span = {'text': s["text"], 
                                'font': s["font"], 
                                'size': s["size"]}

                elif (re.match(r"(\s)?[ﬁﬂ—](\s)?$", s["text"]) 
                      or re.match(r"[ﬁﬂ—]$", span["text"][-1])
                     ):
                    span["text"] = span["text"] + s["text"]
                    
                elif (span["font"] == s["font"] and
                      span["size"] == s["size"]
                     ):
                    if (' ' in (s["text"][0], span["text"][-1]) 
                        or (len(span["text"]) > 1 and re.match(r"^\s[A-Z]$", span["text"][-2:]))
                       ):
                        span["text"] = span["text"] + s["text"] 
                    else:
                        span["text"] = span["text"] + ' ' + s["text"] 
                else:
                    continue
    if span != {}: spans.append(span)
    for span in spans: 
        span["text"] = span["text"].strip()
        span["text"] = re.sub(r'(\\u200[0-9a])+', ' ', span["text"])
        # Removing references
        span["text"] = re.sub(r'(;\s)?\[\s(\d(\s,\s+)?)+\s\]', '', span["text"])
        span["text"] = re.sub(r'(;\s)?\[\s\d+\s–\s\d+\s\]', '', span["text"])
        # Formatting spaces surrounding commas, dots, and parentheses
        span["text"] = re.sub(r'\(\s', '(', span["text"])
        span["text"] = re.sub(r'\s\)', ')', span["text"])
        # Removing multiple spaces (strip method fails)
        span["text"] = re.sub(r'\s+', ' ', span["text"])
        # Replacing the 'ﬁ' and 'ﬂ' characters with correct "fi" string
        span["text"] = re.sub(r'ﬁ', 'fi', span["text"])
        span["text"] = re.sub(r'ﬂ', 'fl', span["text"])
    return spans
                

In [5]:
# Function extracting and structuring the text from PDF
def text_extract(doc, sections, num_block=0):
    
    toc, pheight, pframe, main_size, main_font = info_extract(doc)
    toc_dict = tocL2tocD(toc)

    main_text = {}

    for section in sections:
        start_page = section[2]
        end_page = section[3]
        level = section[0]
        parent = get_parent(section, toc_dict)

        next_section = get_next_section(section, toc)
        subsections = get_children(section, toc)

        if (parent in sections) and (parent is not None):
            continue

        key = ""
        value = []
        
        pblocks = []
        for i in range(start_page, end_page+1):
            page = doc.load_page(i-1)
            pblocks.append(page.get_text("dict")["blocks"])

        skip = False
        stop = False
        flag = False
        for blocks in pblocks:
            if stop: break
            skip = False
            if pblocks.index(blocks) > 0:
                num_block = 0
            for block in blocks[num_block:]:
                if skip or stop: break  
                # Removing header and footer blocks
                if (("pdf" in doc.metadata["format"].lower() and block["bbox"][1] > pframe and block["bbox"][3] < pheight-pframe) or
                    "epub" in doc.metadata["format"].lower()):
                    for line in block.get("lines", []):
                        inline_title = False
                        spans = get_span(line)
                        # Check if line span is empty
                        if len(spans) > 0: 
                                                                                    
                            # Detect captions
                            if (blocks[blocks.index(block)-1]['type'] == 1
                                and (spans[0]["font"] != main_font or spans[0]["size"] < main_size)
                               ): 
                                break
                            
                            # Detect titles
                            if (spans[0]["font"] != main_font or spans[0]["size"] != main_size
                                and block.get("lines", []).index(line) < 4
                               ):
                                
                                # Detect current section's title
                                if (re.search(rf"{re.escape(re.sub(r'[ ,.:?!]', '', spans[0]['text'].lower().strip()))}", re.sub(r'[ ,.:?!]', '', section[1].lower())) 
                                    or re.search(rf"{re.escape(re.sub(r'[ ,.:?!]', '', section[1].lower()))}", re.sub(r'[ ,.:?!]', '', spans[0]['text'].lower().strip()))
                                   ):
                                    if key == '':
                                        key = section[1]
                                        value = ['']
                                        if len(spans) <= 1: continue
                                        else: inline_title = True
                                    else:
                                        if re.sub(r'[ ,.:?!]', '', spans[0]['text'].lower().strip()) == re.sub(r'[ ,.:?!]', '', section[1].lower()): 
                                            continue

                                # Skip subsections' text 
                                elif flag:
                                    if not (re.search(rf"{re.escape(re.sub(r'[ ,.:?!]', '', spans[0]['text'].lower().strip()))}", re.sub(r'[ ,.:?!]', '', next_section[1].lower())) 
                                            or re.search(rf"{re.escape(re.sub(r'[ ,.:?!]', '', next_section[1].lower()))}", re.sub(r'[ ,.:?!]', '', spans[0]['text'].lower().strip()))
                                           ):
                                        skip = True
                                    else:
                                        flag = False
                                        stop = True
                                    break
                                        
                                # Detect next section's title
                                elif (next_section is not None
                                      and (re.search(rf"{re.escape(re.sub(r'[ ,.:?!]', '', spans[0]['text'].lower().strip()))}", re.sub(r'[ ,.:?!]', '', next_section[1].lower())) 
                                           or re.search(rf"{re.escape(re.sub(r'[ ,.:?!]', '', next_section[1].lower()))}", re.sub(r'[ ,.:?!]', '', spans[0]['text'].lower().strip())))
                                      and key != ''
                                     ):
                                    stop = True
                                    break

                                # Detect first subsection 
                                elif (len(subsections)>0
                                      and (re.search(rf"{re.escape(re.sub(r'[ ,.:?!]', '', spans[0]['text'].lower().strip()))}", re.sub(r'[ ,.:?!]', '', subsections[0][1].lower())) 
                                           or re.search(rf"{re.escape(re.sub(r'[ ,.:?!]', '', subsections[0][1].lower()))}", re.sub(r'[ ,.:?!]', '', spans[0]['text'].lower().strip())))
                                      and key != ''
                                     ): 
                                    value.append(text_extract(doc, subsections, blocks.index(block))) 
                                    flag = True
                                    break
                                    
                                # Detect bibliography or references section (in case they are not in toc)
                                elif spans[0]["text"].lower().strip() in ("references", "bibliography"):
                                    stop = True
                                    break
                                
                                # Remove footer blocks from the text
                                elif (re.match(r'\s?\d+\s?$', spans[0]["text"]) 
                                      and [span["size"] < main_size for span in spans]):
                                    skip = True
                                    break
                                
                            #else:
                            if key != '': 
                                #if not inline_title and spans[0]["text"].lower().strip(',.:?!') in key.lower().strip(',.:?!'):
                                #    continue
                                if inline_title: spans = spans[1:]
                                for span in spans: 
                                    if re.match(r'\s?\d+\s?$', span["text"]) and span["size"] < main_size:
                                        continue
                                    elif (len(value[0]) > 1 and re.match(r'-', value[0][-1])
                                         or len(value[0])==0): 
                                        value[0] = value[0].strip() + span["text"]
                                    else: 
                                        value[0] = value[0].strip() + ' ' + span["text"] 
                                
                if key != "":
                    main_text[key] = value
                    value[0] = re.sub(r'[\xad\xa0]\s', '', value[0])
                    value[0] = re.sub(r'\s+([.,:?!]([A-Z]))', r'\1 \2', value[0])
                    if re.match(r'\. ', value[0]): 
                        value[0] = value[0][2:] 
                    if re.match(r'[a-z]+(\.)?\s?\w+', value[0]):
                        value[0] = re.sub(r'^[a-z]+(\.)?\s?(\w+)', r'\2', value[0])
                    
                        
        
    return main_text

In [3]:
doc = pymupdf.open("/home/diomir0/Documents/papers/Bedford, 2023 - LSD_Connectivity.pdf")

In [4]:
toc = get_toc(doc)
[print(i, sec) for i, sec in enumerate(toc)]

0 (1, 'The effect of lysergic acid diethylamide (LSD) on whole-brain functional and effective connectivity', 1, 9)
1 (2, 'Introduction', 1, 2)
2 (2, 'Materials and methods', 2, 2)
3 (3, 'Participants', 2, 2)
4 (3, 'Experimental procedure', 2, 2)
5 (3, 'Data analysis', 2, 2)
6 (4, 'Functional connectivity', 2, 2)
7 (4, 'Effective connectivity', 2, 2)
8 (4, 'Statistical analysis', 2, 2)
9 (2, 'Results', 2, 6)
10 (3, 'The effect of LSD on functional connectivity', 2, 3)
11 (4, 'Partial least squares correlation analysis', 2, 2)
12 (4, 'Machine learning analysis', 2, 3)
13 (3, 'The effect of LSD on effective connectivity', 3, 3)
14 (4, 'Partial least squares correlation analysis', 3, 3)
15 (4, 'Machine learning analysis', 3, 3)
16 (3, 'Comparing functional vs effective connectivity changes under LSD', 3, 6)
17 (4, 'The effect of LSD on inhibitory self-connections', 3, 4)
18 (4, 'Asymmetry in directed connectivity', 4, 5)
19 (4, 'Comparing functional and effective connectivity classifiers',

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None]

In [6]:
sections = [toc[9]]
text_extract(doc, sections)

{'Results': ['',
  {'The effect of LSD on functional connectivity': ['Mass-univariate tests suggested that about 23% (1993/8646) unique correlation coefficients significantly differed across LSD and placebo conditions ( p < 0.05). Among these connections, we observed mostly stronger FC under LSD (Fig. 1 A– C, Fig. S2).',
    {'Partial least squares correlation analysis': ['PLSC analysis of FC showed a significant condition effect on the first LV (LSD condition score: 4.442 [3.785, 5.076], placebo: − 4.442[ − 5.076, − 3.785], p < 0.001), but not on the second LV ( p = 1.000). LV loadings indicated that FC was stronger under LSD compared to placebo across a large number of regions. The most reliable effects were observed for the following regions connections: bilateral lingual gyrus and bilateral inferior frontal gurys (pars opercularis), right inferior frontal gyrus and right lingual gyrus, left cuneus and right middle frontal gyrus (MFG), and left temporo-occipital middle temporal gyru

In [11]:
page = doc.load_page(2)
for i, block in enumerate(page.get_text("dict")["blocks"]):
    print(i)
    for j, line in enumerate(block.get("lines", [])):
        spans = get_span(line)
        if len(spans)>0: print(j, spans)

0
0 [{'text': 'connections from occipital regions to prefrontal and temporal', 'font': 'AdvOT46dcae81', 'size': 8.966300010681152}]
1 [{'text': 'regions (Fig. S2D).', 'font': 'AdvOT46dcae81', 'size': 8.966300010681152}]
1
0 [{'text': 'The effect of LSD on effective connectivity', 'font': 'AdvOT3b30f6db.B', 'size': 8.966300010681152}]
1 [{'text': 'Mass-univariate tests suggested that about 13% (2184/17424)', 'font': 'AdvOT46dcae81', 'size': 8.966300010681152}]
2 [{'text': 'effective connections coefficients significantly differed across', 'font': 'AdvOT46dcae81', 'size': 8.966300010681152}]
3 [{'text': 'conditions (', 'font': 'AdvOT46dcae81', 'size': 8.966300010681152}, {'text': 'p', 'font': 'AdvOT65f8a23b.I', 'size': 8.966300010681152}, {'text': '< 0.05). As with LSD-induced changes in FC, we', 'font': 'AdvOT46dcae81', 'size': 8.966300010681152}]
4 [{'text': 'observed mostly stronger EC under LSD (Fig. 1 E– G, Fig. S2).', 'font': 'AdvOT46dcae81', 'size': 8.966300010681152}]
5 [{'text':