# Test: fetching PDF content with Python

In [1]:
import fitz  # PyMuPDF
import json
from pprint import pprint
from collections import Counter
import pandas as pd


In [14]:
# Recursive function to remove empty "children" lists
def remove_empty_children(node):
    """Strip empty ``"children"`` lists from *node* and all its descendants.

    The dict is modified in place: every descendant is processed first,
    then the ``"children"`` key is deleted wherever its list is empty.
    A node without a ``"children"`` key is left untouched.

    Returns the same *node* object, so the call can be used in expressions.
    """
    if "children" not in node:
        return node

    # Clean the subtree first, then decide whether the key itself is needed.
    cleaned = list(map(remove_empty_children, node["children"]))
    if cleaned:
        node["children"] = cleaned
    else:
        del node["children"]
    return node

# Function to extract PDF content and transform it into structured JSON
def _collect_spans(pdf):
    """Return ``(spans, character_count)`` gathered from every page of *pdf*.

    Each span is a dict with the stripped ``"text"`` and the font ``"size"``
    rounded to two decimals. Spans whose text is empty after stripping are
    dropped: they carry no content, would otherwise become empty nodes or
    inject extra spaces when merged, and could claim a heading level of
    their own if they use a unique font size.
    """
    spans = []
    character_count = 0
    for page in pdf:
        # The character count is measured on the page's plain-text extraction.
        character_count += len(page.get_text())
        for block in page.get_text("dict")["blocks"]:
            # Blocks without a "lines" entry have no text spans to read.
            if "lines" not in block:
                continue
            for line in block["lines"]:
                for span in line["spans"]:
                    text = span["text"].strip()
                    if text:
                        spans.append({"text": text, "size": round(span["size"], 2)})
    return spans, character_count


def _assign_levels(spans):
    """Add a ``"level"`` to each span: 1 for the largest font size, 2 for the next, etc."""
    ranked_sizes = sorted({span["size"] for span in spans}, reverse=True)
    level_of = {size: rank for rank, size in enumerate(ranked_sizes, start=1)}
    for span in spans:
        span["level"] = level_of[span["size"]]


def _build_outline(spans):
    """Nest levelled spans into a tree and return the list of root nodes.

    A span with the same level as the most recently opened node is appended
    to that node's text (separated by one space). A deeper level becomes a
    child of that node. A shallower level closes open nodes until a strictly
    shallower ancestor (or the root list) is found.
    """
    roots = []
    open_nodes = []  # chain of currently open nodes, outermost first
    for span in spans:
        if open_nodes and span["level"] == open_nodes[-1]["level"]:
            open_nodes[-1]["text"] += " " + span["text"]
            continue

        node = {"text": span["text"], "level": span["level"]}
        while open_nodes and node["level"] <= open_nodes[-1]["level"]:
            open_nodes.pop()

        # "children" is created only when a first child arrives, so leaf
        # nodes never carry an empty list that would need pruning afterwards.
        siblings = open_nodes[-1].setdefault("children", []) if open_nodes else roots
        siblings.append(node)
        open_nodes.append(node)
    return roots


def fetch_and_structre_json(pdf):
    """Extract the text of *pdf* and return it as a font-size based outline.

    Args:
        pdf: An opened document. It must support ``len()`` (page count) and
            iteration over pages, and each page must provide ``get_text()``
            and ``get_text("dict")`` (the notebook passes the result of
            ``fitz.open(...)``).

    Returns:
        A dict with three keys:

        * ``'nbr_page'``: number of pages.
        * ``'nbr_character'``: total length of the plain text of all pages.
        * ``'content'``: list of root nodes. Each node has ``"text"``,
          ``"level"`` (1 = largest font size found in the document) and,
          only when it has descendants, ``"children"``.

    Whitespace-only spans are ignored, so they neither create empty nodes
    nor influence the level numbering.
    """
    page_count = len(pdf)
    spans, character_count = _collect_spans(pdf)
    _assign_levels(spans)
    return {
        'nbr_page': page_count,
        'nbr_character': character_count,
        'content': _build_outline(spans),
    }


In [20]:
# Open the template_3 PDF with PyMuPDF. The bare name on the last line is the
# cell's final expression, so the notebook displays the Document object.
doc_template3 = fitz.open("pdfs/template_3.pdf")
doc_template3

Document('pdfs/template_3.pdf')

In [21]:
# Open the template_3 PDF and show its structured outline as the cell result.
# The handle stays open because later cells reuse pdf_sample.
pdf_sample = fitz.open("pdfs/template_3.pdf")
fetch_and_structre_json(pdf_sample)

{'nbr_page': 2,
 'nbr_character': 3749,
 'content': [{'text': 'Titre du cours',
   'level': 1,
   'children': [{'text': 'Chapitre 1',
     'level': 2,
     'children': [{'text': 'Lorem ipsum dolor sit amet, consectetur adipiscing elit. Quisque consequat leo id ornare suscipit. Curabitur hendrerit libero non elit sodales cursus. Nunc viverra accumsan mi, a dictum nulla posuere eu. Fusce ullamcorper vitae enim eget placerat. Phasellus blandit sit amet mi a dictum. Quisque lacinia massa est, ut placerat eros egestas non. Donec dui turpis, tincidunt eu sagittis non, porta non est. Phasellus rutrum vel metus nec efficitur. Duis maximus purus et nisl egestas, ut pulvinar lorem venenatis. Cras condimentum tellus et magna ornare dictum. Phasellus id condimentum diam. Nam fermentum ultricies aliquam. Sed congue aliquet eros, vel rutrum risus lobortis a. Nunc congue blandit urna ac tincidunt. Etiam faucibus vel ligula nec egestas. Donec rutrum finibus ultrices. Etiam consequat finibus accumsan. 

In [22]:
# Open the template_2 PDF and show its structured outline as the cell result.
# The handle stays open because later cells reuse pdf_sample2.
pdf_sample2 = fitz.open("pdfs/template_2.pdf")
fetch_and_structre_json(pdf_sample2)

{'nbr_page': 4,
 'nbr_character': 4326,
 'content': [{'text': 'Document histoire CM1',
   'level': 1,
   'children': [{'text': 'Lien à fetcher',
     'level': 2,
     'children': [{'text': 'https://www.kartable.fr/cm1/histoire/specifique',
       'level': 5}]},
    {'text': 'PDF Template',
     'level': 2,
     'children': [{'text': 'Titre du cours',
       'level': 3,
       'children': [{'text': 'Chapitre 1',
         'level': 4,
         'children': [{'text': 'Lorem ipsum dolor sit amet, consectetur adipiscing elit. Quisque consequat leo id ornare suscipit. Curabitur hendrerit libero non elit sodales cursus. Nunc viverra accumsan mi, a dictum nulla posuere eu. Fusce ullamcorper vitae enim eget placerat. Phasellus blandit sit amet mi a dictum. Quisque lacinia massa est, ut placerat eros egestas non. Donec dui turpis, tincidunt eu sagittis non, porta non est. Phasellus rutrum vel metus nec efficitur. Duis maximus purus et nisl egestas, ut pulvinar lorem venenatis. Cras condimentum tel

In [19]:
# Combine the outlines of several already-opened PDFs into one summary object.
# pdf_sample and pdf_sample2 are the documents opened in the cells above.
pdf_list = [pdf_sample, pdf_sample2]
final_json = {
    "nbr_pdf": len(pdf_list),
    # One structured outline per document, in the same order as pdf_list.
    'pdfs': [fetch_and_structre_json(pdf) for pdf in pdf_list],
}

# Final expression of the cell: displayed by the notebook.
final_json

{'nbr_pdf': 2,
 'pdfs': [{'nbr_page': 2,
   'nbr_character': 3749,
   'content': [{'text': 'Titre du cours',
     'level': 1,
     'children': [{'text': 'Chapitre 1',
       'level': 2,
       'children': [{'text': 'Lorem ipsum dolor sit amet, consectetur adipiscing elit. Quisque consequat leo id ornare suscipit. Curabitur hendrerit libero non elit sodales cursus. Nunc viverra accumsan mi, a dictum nulla posuere eu. Fusce ullamcorper vitae enim eget placerat. Phasellus blandit sit amet mi a dictum. Quisque lacinia massa est, ut placerat eros egestas non. Donec dui turpis, tincidunt eu sagittis non, porta non est. Phasellus rutrum vel metus nec efficitur. Duis maximus purus et nisl egestas, ut pulvinar lorem venenatis. Cras condimentum tellus et magna ornare dictum. Phasellus id condimentum diam. Nam fermentum ultricies aliquam. Sed congue aliquet eros, vel rutrum risus lobortis a. Nunc congue blandit urna ac tincidunt. Etiam faucibus vel ligula nec egestas. Donec rutrum finibus ultrice