In [1]:
import json
import logging
import time
from pathlib import Path

In [2]:
from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import (
    PdfPipelineOptions,
)
from docling.document_converter import DocumentConverter, PdfFormatOption

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
_log = logging.getLogger(__name__)

In [9]:

logging.basicConfig(level=logging.INFO)

input_doc_path = "doc/data/CGRER_PUB.pdf"

pipeline_options = PdfPipelineOptions()
pipeline_options.do_ocr = True
pipeline_options.do_table_structure = True
pipeline_options.table_structure_options.do_cell_matching = True
pipeline_options.ocr_options.lang = ["fr"]
pipeline_options.accelerator_options = AcceleratorOptions(
    num_threads=4, device=AcceleratorDevice.AUTO
)

doc_converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
    }
)

start_time = time.time()
conv_result = doc_converter.convert(input_doc_path)
end_time = time.time() - start_time

_log.info(f"Document converted in {end_time:.2f} seconds.")

## Export results
output_dir = Path("scratch")
output_dir.mkdir(parents=True, exist_ok=True)
doc_filename = conv_result.input.file.stem

# Export Deep Search document JSON format:
with (output_dir / f"{doc_filename}.json").open("w", encoding="utf-8") as fp:
    fp.write(json.dumps(conv_result.document.export_to_dict()))

# Export Text format:
with (output_dir / f"{doc_filename}.txt").open("w", encoding="utf-8") as fp:
    fp.write(conv_result.document.export_to_text())

# Export Markdown format:
with (output_dir / f"{doc_filename}.md").open("w", encoding="utf-8") as fp:
    fp.write(conv_result.document.export_to_markdown())

# Export Document Tags format:
with (output_dir / f"{doc_filename}.doctags").open("w", encoding="utf-8") as fp:
    fp.write(conv_result.document.export_to_doctags())

INFO:docling.datamodel.document:detected formats: [<InputFormat.PDF: 'pdf'>]
INFO:docling.document_converter:Going to convert document batch...
INFO:docling.document_converter:Initializing pipeline for StandardPdfPipeline with options hash 70a24977f60b56c1e2845a166e5a76f6
INFO:docling.models.factories.base_factory:Loading plugin 'docling_defaults'
INFO:docling.models.factories:Registered ocr engines: ['easyocr', 'ocrmac', 'rapidocr', 'tesserocr', 'tesseract']
INFO:docling.utils.accelerator_utils:Accelerator device: 'mps'
INFO:docling.utils.accelerator_utils:Accelerator device: 'mps'
INFO:docling.utils.accelerator_utils:Accelerator device: 'mps'
INFO:docling.models.factories.base_factory:Loading plugin 'docling_defaults'
INFO:docling.models.factories:Registered picture descriptions: ['vlm', 'api']
INFO:docling.pipeline.base_pipeline:Processing document CGRER_PUB.pdf
INFO:docling.document_converter:Finished converting document CGRER_PUB.pdf in 60.56 sec.
INFO:__main__:Document converted 

In [10]:
import re

def parse_and_clean(markdown: str, source: str = "unknown") -> dict:
    """
    Parse et nettoie du Markdown avec préservation des tableaux.
    Retourne un dict { "cleaned": texte, "source": source }.
    """

    def strip_frontmatter(md: str) -> str:
        return re.sub(r"^---[\s\S]*?---\n+", "", md, flags=re.MULTILINE)

    def strip_code_fences(md: str) -> str:
        # Supprimer blocs ```...``` et inline `code`
        md = re.sub(r"```[\s\S]*?```", "", md)
        md = re.sub(r"`([^`]+)`", r"\1", md)
        return md

    def strip_html(md: str) -> str:
        return re.sub(r"<[^>]+>", " ", md)

    def normalize_links(md: str) -> str:
        # [texte](url) -> texte
        return re.sub(r"\[([^\]]+)\]\(([^)]+)\)", r"\1", md)

    def collapse_spaces(md: str) -> str:
        md = md.replace("\u00A0", " ")
        md = re.sub(r"[ \t]+", " ", md)
        md = re.sub(r"\n{3,}", "\n\n", md)
        return md.strip()

    def convert_tables(md: str) -> str:
        """Convertit les tableaux Markdown en TSV lisible"""
        lines = md.splitlines()
        out = []
        i = 0

        def is_table_row(s: str) -> bool:
            return "|" in s and s.count("|") >= 2

        while i < len(lines):
            line = lines[i]
            if (is_table_row(line) and i + 1 < len(lines)
                and re.match(r"\|?\s*:?-{3,}:?\s*(\|\s*:?-{3,}:?\s*)+\|?", lines[i+1])):
                # Début d'un tableau
                table_lines = [line]
                i += 1  # skip separator
                table_lines.append(lines[i])
                i += 1
                while i < len(lines) and is_table_row(lines[i]):
                    table_lines.append(lines[i])
                    i += 1
                # Conversion en TSV
                tsv = []
                for L in table_lines:
                    row = L.strip().lstrip("|").rstrip("|")
                    row = re.split(r"\s*\|\s*", row)
                    tsv.append("\t".join(row))
                out.append("\nTableau:\n" + "\n".join(tsv) + "\n")
                continue  # ne pas avancer i ici car déjà fait
            out.append(line)
            i += 1
        return "\n".join(out)

    md = markdown or ""
    md = strip_frontmatter(md)
    md = strip_code_fences(md)
    md = strip_html(md)
    md = normalize_links(md)
    md = convert_tables(md)
    md = collapse_spaces(md)

    return {"cleaned": md, "source": source}


In [11]:
result = parse_and_clean(conv_result.document.export_to_markdown(), source="CGRER.pdf")
# Export Markdown format:
with (output_dir / f"{doc_filename}_cleaned.md").open("w", encoding="utf-8") as fp:
    fp.write(result["cleaned"])

In [None]:
from typing import List, Dict

def chunk_with_headings(
    text: str,
    source: str = "unknown",
    chunk_size: int = 900,
    overlap: int = 150,
    max_heading_level: int = 3,  # on segmente sur #, ##, ### par défaut
) -> List[Dict]:
    """
    Découpe `text` par titres Markdown (# à ###), puis en chunks avec overlap.
    Retourne une liste de dicts: {"text": str, "meta": {...}}.
    """

    header_re = re.compile(r"^(#{1,6})\s+(.*)$")

    def split_by_headings(md: str):
        lines = md.splitlines()
        sections = []
        current_title = "root"
        current_lines: List[str] = []

        def flush():
            txt = "\n".join(current_lines).strip()
            if txt:  # n'ajouter la section que si elle contient du texte
                sections.append({"title": current_title, "text": txt})

        for line in lines:
            m = header_re.match(line.strip())
            if m and len(m.group(1)) <= max_heading_level:
                # on ferme la section en cours
                flush()
                # nouvelle section
                current_title = m.group(2).strip()
                current_lines = []
            else:
                current_lines.append(line)

        # dernière section
        flush()
        return sections

    def chunk_text(t: str, size: int, ov: int) -> List[str]:
        chunks: List[str] = []
        start = 0
        n = len(t)

        # garde-fous
        if size <= 0:
            size = 900
        if ov < 0:
            ov = 0
        if ov >= size:
            ov = max(0, size // 5)  # éviter le loop infini

        while start < n:
            end = min(start + size, n)
            slice_ = t[start:end]

            # essayer de couper à une fin de phrase si possible
            last_dot = slice_.rfind(". ")
            if last_dot > int(size * 0.6) and end < n:
                end = start + last_dot + 1
                slice_ = t[start:end]

            slice_ = slice_.strip()
            if slice_:
                chunks.append(slice_)

            if end >= n:
                break
            start = max(0, end - ov)

        return chunks

    sections = split_by_headings(text or "")
    out: List[Dict] = []
    global_index = 0

    for sec in sections:
        parts = chunk_text(sec["text"], chunk_size, overlap)
        for i, chunk in enumerate(parts):
            out.append({
                "text": chunk,
                "meta": {
                    "section": sec["title"],
                    "chunk_index": i,
                    "global_index": global_index,
                    "source": source,
                },
            })
            global_index += 1

    return out




In [19]:
chunks = chunk_with_headings(result["cleaned"], source="rapport.md", chunk_size=80, overlap=20)

for c in chunks:
    print(f"[{c['meta']}] → {c['text'][:60]}...")

[{'section': "CONDITIONS GENERALES DE RECRUTEMENT, D'EMPLOI ET DE REMUNERATION", 'chunk_index': 0, 'global_index': 0, 'source': 'rapport.md'}] → Page 1...
[{'section': 'SOMMAIRE', 'chunk_index': 0, 'global_index': 1, 'source': 'rapport.md'}] → Tableau:
 PREAMBULE............................................
[{'section': 'SOMMAIRE', 'chunk_index': 1, 'global_index': 2, 'source': 'rapport.md'}] → ...............................................................
[{'section': 'SOMMAIRE', 'chunk_index': 2, 'global_index': 3, 'source': 'rapport.md'}] → ..................................... 5 
-------------------...
[{'section': 'SOMMAIRE', 'chunk_index': 3, 'global_index': 4, 'source': 'rapport.md'}] → ------------------------------------------------------------...
[{'section': 'SOMMAIRE', 'chunk_index': 4, 'global_index': 5, 'source': 'rapport.md'}] → ------------------------------------------------------------...
[{'section': 'SOMMAIRE', 'chunk_index': 5, 'global_index': 6, 'source': 'rapport

In [None]:
from typing import List, Dict

class MarkdownPreprocessor:
    def __init__(self, chunk_size: int = 900, overlap: int = 150, max_heading_level: int = 3):
        self.chunk_size = chunk_size
        self.overlap = overlap
        self.max_heading_level = max_heading_level

    # -------------------------
    #  Parsing & nettoyage
    # -------------------------
    def parse_and_clean(self, markdown: str, source: str = "unknown") -> Dict:
        def strip_frontmatter(md: str) -> str:
            return re.sub(r"^---[\s\S]*?---\n+", "", md, flags=re.MULTILINE)

        def strip_code_fences(md: str) -> str:
            md = re.sub(r"```[\s\S]*?```", "", md)  # blocs de code
            md = re.sub(r"`([^`]+)`", r"\1", md)   # inline code
            return md

        def strip_html(md: str) -> str:
            return re.sub(r"<[^>]+>", " ", md)

        def normalize_links(md: str) -> str:
            return re.sub(r"\[([^\]]+)\]\(([^)]+)\)", r"\1", md)

        def collapse_spaces(md: str) -> str:
            md = md.replace("\u00A0", " ")
            md = re.sub(r"[ \t]+", " ", md)
            md = re.sub(r"\n{3,}", "\n\n", md)
            return md.strip()

        def convert_tables(md: str) -> str:
            lines = md.splitlines()
            out = []
            i = 0

            def is_table_row(s: str) -> bool:
                return "|" in s and s.count("|") >= 2

            while i < len(lines):
                line = lines[i]
                if (is_table_row(line) and i + 1 < len(lines)
                    and re.match(r"\|?\s*:?-{3,}:?\s*(\|\s*:?-{3,}:?\s*)+\|?", lines[i+1])):
                    # Début d'un tableau
                    table_lines = [line]
                    i += 1  # skip separator
                    table_lines.append(lines[i])
                    i += 1
                    while i < len(lines) and is_table_row(lines[i]):
                        table_lines.append(lines[i])
                        i += 1
                    # Conversion en TSV
                    tsv = []
                    for L in table_lines:
                        row = L.strip().lstrip("|").rstrip("|")
                        row = re.split(r"\s*\|\s*", row)
                        tsv.append("\t".join(row))
                    out.append("\nTableau:\n" + "\n".join(tsv) + "\n")
                    continue
                out.append(line)
                i += 1
            return "\n".join(out)

        md = markdown or ""
        md = strip_frontmatter(md)
        md = strip_code_fences(md)
        md = strip_html(md)
        md = normalize_links(md)
        md = convert_tables(md)
        md = collapse_spaces(md)

        return {"cleaned": md, "source": source}

    # -------------------------
    #  Chunking
    # -------------------------
    def chunk_with_headings(self, text: str, source: str = "unknown") -> List[Dict]:
        header_re = re.compile(r"^(#{1,6})\s+(.*)$")

        def split_by_headings(md: str):
            lines = md.splitlines()
            sections = []
            current_title = "root"
            current_lines: List[str] = []

            def flush():
                txt = "\n".join(current_lines).strip()
                if txt:
                    sections.append({"title": current_title, "text": txt})

            for line in lines:
                m = header_re.match(line.strip())
                if m and len(m.group(1)) <= self.max_heading_level:
                    flush()
                    current_title = m.group(2).strip()
                    current_lines = []
                else:
                    current_lines.append(line)
            flush()
            return sections

        def chunk_text(t: str) -> List[str]:
            chunks: List[str] = []
            start = 0
            n = len(t)

            size, ov = self.chunk_size, self.overlap
            if size <= 0:
                size = 900
            if ov < 0:
                ov = 0
            if ov >= size:
                ov = max(0, size // 5)

            while start < n:
                end = min(start + size, n)
                slice_ = t[start:end]

                # couper à la fin d'une phrase si possible
                last_dot = slice_.rfind(". ")
                if last_dot > int(size * 0.6) and end < n:
                    end = start + last_dot + 1
                    slice_ = t[start:end]

                slice_ = slice_.strip()
                if slice_:
                    chunks.append(slice_)

                if end >= n:
                    break
                start = max(0, end - ov)
            return chunks

        sections = split_by_headings(text or "")
        out: List[Dict] = []
        global_index = 0

        for sec in sections:
            parts = chunk_text(sec["text"])
            for i, chunk in enumerate(parts):
                out.append({
                    "text": chunk,
                    "meta": {
                        "section": sec["title"],
                        "chunk_index": i,
                        "global_index": global_index,
                        "source": source,
                    },
                })
                global_index += 1
        return out

    # -------------------------
    #  Pipeline complet
    # -------------------------
    def process(self, markdown: str, source: str = "unknown") -> List[Dict]:
        """Parse + clean + chunk en un seul appel"""
        cleaned = self.parse_and_clean(markdown, source)
        return self.chunk_with_headings(cleaned["cleaned"], source)



In [22]:
pre = MarkdownPreprocessor(chunk_size=80, overlap=20)
chunks = pre.process(result["cleaned"], source="cgrer.pdf")

for c in chunks:
    print(c["meta"], "→", c["text"][:60] + "…")

{'section': "CONDITIONS GENERALES DE RECRUTEMENT, D'EMPLOI ET DE REMUNERATION", 'chunk_index': 0, 'global_index': 0, 'source': 'cgrer.pdf'} → Page 1…
{'section': 'SOMMAIRE', 'chunk_index': 0, 'global_index': 1, 'source': 'cgrer.pdf'} → Tableau:
 PREAMBULE.........................................…
{'section': 'SOMMAIRE', 'chunk_index': 1, 'global_index': 2, 'source': 'cgrer.pdf'} → ............................................................…
{'section': 'SOMMAIRE', 'chunk_index': 2, 'global_index': 3, 'source': 'cgrer.pdf'} → ..................................... 5 
 TITRE 1 DOMAINE D'…
{'section': 'SOMMAIRE', 'chunk_index': 3, 'global_index': 4, 'source': 'cgrer.pdf'} → APPLICATION.................................................…
{'section': 'SOMMAIRE', 'chunk_index': 4, 'global_index': 5, 'source': 'cgrer.pdf'} → ..................................................…
{'section': 'SOMMAIRE', 'chunk_index': 5, 'global_index': 6, 'source': 'cgrer.pdf'} → .................... 6 
 ARTICLE 1

In [23]:
import requests

def get_embedding_ollama(text: str, model: str = "nomic-embed-text") -> list:
    url = "http://localhost:11434/api/embeddings"
    payload = {
        "model": model,
        "prompt": text
    }
    resp = requests.post(url, json=payload)
    resp.raise_for_status()
    data = resp.json()
    return data.get("embedding", [])


In [40]:
pre = MarkdownPreprocessor(chunk_size=500, overlap=100)

# 1) Nettoyage + chunking
chunks = pre.process(result["cleaned"], source="cgrer.pdf")

# 2) Embeddings pour chaque chunk
points = []
for c in chunks:
    vector = get_embedding_ollama(c["text"])  # liste de floats
    points.append({
        "id": c["meta"]["global_index"],
        "vector": vector,
        "payload": {
            "content": c["text"],
            "metadata": {
                **c["meta"]
            }
        }
    })


In [35]:
def create_collection_qdrant(collection="markdown_embeddings",
                             host="http://localhost:6333",
                             vector_size=768,
                             distance="Cosine"):
    url = f"{host}/collections/{collection}"
    payload = {
        "vectors": {
            "size": vector_size,
            "distance": distance
        }
    }
    resp = requests.put(url, json=payload)
    resp.raise_for_status()
    return resp.json()

In [36]:
def collection_exists(collection="markdown_embeddings", host="http://localhost:6333") -> bool:
    url = f"{host}/collections/{collection}"
    resp = requests.get(url)
    return resp.status_code == 200

In [37]:
def upsert_qdrant(points, collection="markdown_embeddings", host="http://localhost:6333"):
    url = f"{host}/collections/{collection}/points"
    payload = {"points": points}
    resp = requests.put(url, json=payload)
    resp.raise_for_status()
    return resp.json()




In [41]:
def delete_collection_qdrant(collection="markdown_embeddings", host="http://localhost:6333"):
    url = f"{host}/collections/{collection}"
    resp = requests.delete(url)
    resp.raise_for_status()
    return resp.json()

In [None]:
# Exemple
host = "http://192.168.1.70:6333/"
delete_collection_qdrant("cgrer", host=host)
if not collection_exists("cgrer", host=host):
    create_collection_qdrant("cgrer", host=host)
resp = upsert_qdrant(points, collection="cgrer", host=host)
print(resp)

{'result': {'operation_id': 0, 'status': 'acknowledged'}, 'status': 'ok', 'time': 0.049951581}
