In [None]:
import asyncio
from pathlib import Path
from playwright.async_api import async_playwright
from pypdf import PdfReader, PdfWriter
import fitz  # pip install pymupdf
import os
from PyPDF2 import PdfMerger
import shutil

In [None]:
# Document id used in the reader URL, and the inclusive page range passed to capture_range.
DOCID = 88971656
START = 1
END = 559

# File in which the authenticated browser session is saved, then reloaded for the captures.
STATE_FILE = "auth_state.json"

TITLE = "Livre" # Default output name; overwritten by capture_range with the reader page title
CROP = None  # Crop rectangle; computed by capture_range from the first captured page

In [None]:
async def generate_state_file(path: str):
    """Open a visible browser, wait for a manual login and save the session.

    A non-headless Chromium window is opened on the ScholarVox home page. The
    function then waits (up to 3 minutes) for the text "Se déconnecter" to
    appear, which is used as the signal that the user is logged in, and writes
    the browser context's storage state to ``path``.

    Args:
        path: Destination file for the storage state. Its parent directory is
            created when missing.

    Raises:
        Whatever Playwright raises when the logout text does not appear before
        the timeout; the browser is closed in that case as well.
    """
    # Make sure the directory that will receive the state file exists.
    Path(path).parent.mkdir(parents=True, exist_ok=True)

    async with async_playwright() as p:
        # Visible window so the user can log in by hand.
        browser = await p.chromium.launch(headless=False)
        try:
            ctx = await browser.new_context()
            page = await ctx.new_page()

            await page.goto("https://scholarvox.library.omneseducation.com/", wait_until="domcontentloaded")

            # The logout link only shows up once the user is authenticated.
            timeout = 3 * 60 * 1000  # 3 minutes, in milliseconds
            await page.wait_for_selector(selector='text=Se déconnecter', timeout=timeout)

            # Persist the session so later headless runs can reuse it.
            await ctx.storage_state(path=path)
            print("Jeton de connexion sauvegardé.")
        finally:
            # Close the window even when the login wait failed.
            await browser.close()

# In Jupyter / an async context the coroutine can be awaited directly:
await generate_state_file(path=STATE_FILE)

In [None]:
def size_page(pdf_path: str):
    """Find the full-height vector drawing on the first page of a PDF.

    The first page's vector drawings (lines, rectangles, ...) are scanned and
    the bounding rectangle of the first one spanning the whole page height
    (top at 0 and bottom at the page height, compared exactly) is returned.

    Args:
        pdf_path: Path of the PDF to inspect.

    Returns:
        The drawing's bounding rectangle, unpackable as ``(x0, y0, x1, y1)``,
        or ``None`` when no drawing spans the full page height.
    """
    # The context manager releases the file handle on every exit path.
    with fitz.open(pdf_path) as doc:
        page = doc[0]  # first page
        page_height = page.rect.height

        for drawing in page.get_drawings():
            rect = drawing["rect"]  # bounding box of the shape
            _, y0, _, y1 = rect

            if y0 == 0.0 and y1 == page_height:
                return rect

    return None


def crop_pdf(path_in: str, path_out: str, rect):
    """Rewrite a PDF with every page's media box narrowed to ``rect``.

    The horizontal extent of the new media box is taken from ``rect``
    (``x0`` to ``x1``). Vertically the box always starts at 0 and is
    ``y1 - y0`` high, i.e. ``y0`` is used for the height only, not as an offset.

    Args:
        path_in: PDF to read.
        path_out: PDF to write (the caller in this notebook passes the same path).
        rect: Rectangle unpackable as ``(x0, y0, x1, y1)``.
    """
    source = PdfReader(path_in)
    output = PdfWriter()

    left, top, right, bottom = rect
    box_width = right - left
    box_height = bottom - top

    # Both corners are identical for every page, so build them once.
    bottom_left = (left, 0)
    top_right = (left + box_width, box_height)

    for pdf_page in source.pages:
        pdf_page.mediabox.lower_left = bottom_left
        pdf_page.mediabox.upper_right = top_right
        output.add_page(pdf_page)

    with open(path_out, "wb") as out_file:
        output.write(out_file)

def is_close_to(value: float, target: float, tolerance: float = 1.0) -> bool:
    """Tell whether ``value`` lies within ``tolerance`` of ``target`` (bounds included)."""
    gap = value - target
    return abs(gap) <= tolerance

def test_croped(pdf_path:str):
    doc = fitz.open(pdf_path)
    page = doc[0]  # première page
    page_width = page.rect.width

    # Si une boite est proche du bord gauche, mesure aproximativement la largeur de la page,
    # et que sa partie haute est au dessus de la page, alors la page est mal cadrée.
    for _, drawing in enumerate(page.get_drawings()):
        # bounding box de la forme
        rect = drawing["rect"] 
        x0, y0, x1, _ = rect

        if is_close_to(x0, 0.0) and is_close_to(x1, page_width, 2.0) and y0 < -1.0:
            return False
        
    return True

In [None]:
async def capture_range(docid: int, start: int, end: int, max_retry: int = 5):
    """Print reader pages ``start``..``end`` (inclusive) to cropped one-page PDFs.

    Each page is loaded in headless Chromium using the session stored in
    ``STATE_FILE``, printed to ``captures/page-NNN.pdf``, cropped in place with
    the shared ``CROP`` rectangle and checked with ``test_croped``. A page that
    fails the check is reloaded and printed again, up to ``max_retry`` attempts.

    Side effects on module globals:
        CROP:  set from the first printed PDF (via ``size_page``) while it is
               still ``None``; reused unchanged afterwards.
        TITLE: set to the title of the last loaded page, when non-empty.

    Args:
        docid: Document id used in the reader URL.
        start: First page number to capture.
        end: Last page number to capture (inclusive).
        max_retry: Maximum number of attempts per page.

    Returns:
        list[int]: Page numbers that still failed after ``max_retry`` attempts
        (empty when every page was captured and cropped successfully).
    """
    global TITLE
    global CROP
    Path("captures").mkdir(exist_ok=True)
    failed_pages = []

    def url_for(n: int) -> str:
        return f"https://scholarvox.library.omneseducation.com/reader/docid/{docid}/page/{n}"

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        try:
            ctx = await browser.new_context(storage_state=STATE_FILE,
                                            device_scale_factor=1,
                                            viewport={'width': 3200, 'height': 2000})
            page = await ctx.new_page()

            for n in range(start, end + 1):
                path_page = f"captures/page-{n:03}.pdf"
                url = url_for(n)

                for attempt in range(1, max_retry + 1):
                    print(f"→ Page {n}/{end} : {url} (tentative {attempt}/{max_retry})", end="\r", flush=True)
                    await page.emulate_media(media="screen")
                    await page.goto(url, wait_until="networkidle")
                    await page.wait_for_timeout(1000)  # let the JS rendering finish

                    await page.pdf(
                        path=path_page,
                        width='3200px',
                        height='2000px',
                        print_background=False,
                        page_ranges="1",
                    )

                    # The crop rectangle is detected ONCE and reused for every page.
                    if CROP is None:
                        CROP = size_page(path_page)
                    if CROP is None:
                        # No full-height frame found in this print: cropping is
                        # impossible, so treat the attempt as failed and retry.
                        continue

                    crop_pdf(path_in=path_page, path_out=path_page, rect=CROP)

                    if test_croped(path_page):
                        break  # page is correctly framed
                else:
                    # Every attempt failed: remember the page so it can be re-captured.
                    failed_pages.append(n)
                    print(f"\nPage {n} : capture incorrecte après {max_retry} tentatives")

            # Keep the previous TITLE when the page reports no title.
            title = await page.title()
            if title:
                TITLE = title
        finally:
            # Close the browser even when a capture raised.
            await browser.close()

    return failed_pages

In [None]:
# Example call: capture the whole configured page range.
await capture_range(DOCID, START, END)

In [None]:
# --- Once the problematic pages have been identified, they can be re-captured one by one ---
pages_fails = [560]

# Each page is captured as its own single-page range (start == end).
for page in pages_fails:
    await capture_range(DOCID, page, page)

In [None]:
# Deliberate stop: raising here halts execution at this cell.
# NOTE(review): presumably meant to keep a "Run All" from reaching the merge/cleanup cell below
# before the captures have been checked — confirm.
raise Exception("Arrêt du notebook ici")

In [None]:
# --- Merge the captured pages into a single PDF named after the book ---
captures_dir = Path("captures")

# Order by the numeric page index, not by file name: a plain lexicographic sort
# would put "page-1000.pdf" before "page-999.pdf".
page_files = sorted(
    (f for f in captures_dir.glob("page-*.pdf") if f.stem.partition("-")[2].isdecimal()),
    key=lambda f: int(f.stem.partition("-")[2]),
)
if not page_files:
    # Nothing to merge: stop before writing an empty PDF and deleting the folder.
    raise FileNotFoundError(f"Aucune page capturée dans '{captures_dir}'")

# The page title may contain characters that are not valid in a file name.
safe_title = "".join(
    "_" if (c in '<>:"/\\|?*' or ord(c) < 32) else c for c in TITLE
).strip().rstrip(".") or "Livre"
sortie = f"{safe_title}.pdf"

# NOTE(review): PyPDF2's PdfMerger appears to be deprecated upstream in favour of
# pypdf's PdfWriter.append — consider migrating once the installed versions are confirmed.
merger = PdfMerger()
try:
    for fichier in page_files:
        merger.append(str(fichier))
    merger.write(sortie)
finally:
    # Release the source files even when appending or writing failed.
    merger.close()

# Only reached when the merged PDF was written successfully.
shutil.rmtree(captures_dir)

print("PDF final créé :", sortie)