# Academic Research Helper avec overlap dans le chunking

Objectif: ingérer des abstracts académiques synthétiques, les découper en morceaux avec overlap, les encoder (via EURI si une clé est fournie, fallback local sinon), les stocker dans Qdrant et permettre une recherche sémantique pour des requêtes comme « transformers in computer vision ». Le chunking inclut désormais un overlap entre chunks.

In [25]:
import os
import re
import requests
from typing import List, Dict, Any, Tuple

from qdrant_client import QdrantClient
from qdrant_client.models import VectorParams, Distance

# Configuration
VECTOR_SIZE = 128
CHUNK_MAX_CHARS = 600
CHUNK_OVERLAP_CHARS = 100  # overlap between consecutive chunks (in characters)
COLLECTION_NAME = "dou_academic_papers"

# Qdrant host/port (override via environment if needed)




In [26]:
def generate_dataset(n: int = 6) -> List[Dict[str, Any]]:
    """Génère un petit jeu de données synthétiques de papiers (id, title, abstract)."""
    base_papers = [
        {"id": "P1", "title": "Transformers in Computer Vision: A Survey",
         "abstract": "The transformer architecture has emerged as a powerful model for sequence modeling. This paper surveys transformer-based models in computer vision, including ViT, DeiT, and data-efficient variants. We discuss architectures, training regimes, and evaluation benchmarks."},
        {"id": "P2", "title": "Vision Transformers for Image Recognition",
         "abstract": "We examine Vision Transformers (ViT) architectures, patch embeddings, and how self-attention captures long-range dependencies in images. We compare with CNN-based baselines and discuss efficiency and scalability."},
        {"id": "P3", "title": "Self-Attention Mechanisms in Vision Tasks",
         "abstract": "Self-attention modules and their variants are applied to object detection, segmentation, and action recognition. We analyze computational trade-offs and show improvements on common benchmarks."},
        {"id": "P4", "title": "Transformers in Object Detection",
         "abstract": "Transformers extend detection pipelines with query-based decoding and cross-attention. This paper surveys DETR-like models and improvements such as Deformable DETR and query-based attention."},
        {"id": "P5", "title": "Efficient Transformers for Vision",
         "abstract": "We discuss efficiency techniques in transformers for vision, including sparse attention, kernel-based methods, and distillation strategies to reduce compute and memory footprints."},
        {"id": "P6", "title": "Multimodal Transformers for Vision-Language",
         "abstract": "Extending transformers to vision-language tasks, we review approaches like CLIP and ALIGN, focusing on alignment between text and image representations and zero-shot capabilities."},
    ]
    out = []
    for i in range(n):
        p = base_papers[i % len(base_papers)].copy()
        p["id"] = f"{p['id']}_{i}"
        out.append(p)
    return out


In [18]:
def clean_text(text: str) -> str:
    text = text.strip()
    text = re.sub(r"\s+", " ", text)
    return text


In [27]:
def chunk_text(text: str, max_chars: int = CHUNK_MAX_CHARS, overlap_chars: int = CHUNK_OVERLAP_CHARS) -> List[str]:
    """Découpe le texte en chunks avec overlap entre chunks.

    Paramètres:
      max_chars: longueur maximale d'un chunk en caractères.
      overlap_chars: nombre de caractères qui se chevauchent entre chunks consécutifs.
    """
    if max_chars <= 0:
        return [text]
    chunks: List[str] = []
    if not text:
        return chunks
    i = 0
    n = len(text)
    while i < n:
        end = min(i + max_chars, n)
        chunk = text[i:end].strip()
        if chunk:
            chunks.append(chunk)
        if end >= n:
            break
        # Avancer en laissant un overlap de chars entre les chunks
        i = max(0, end - overlap_chars)
    return chunks


In [29]:
def generate_embeddings(text: str) -> List[float]:
    """
    Embedding function: utilise l’API EURI si clé présente; sinon, fallback local déterministe.
    """
    api_key = os.getenv("EURI_API_KEY")
    endpoint = os.getenv("EURI_EMBEDDING_ENDPOINT", "https://api.euri.ai/v1/embed")

    if api_key:
        try:
            headers = {"Authorization": f"Bearer {api_key}", "Content-Type": "application/json"}
            payload = {"text": text}
            resp = requests.post(endpoint, json=payload, headers=headers, timeout=60)
            resp.raise_for_status()
            data = resp.json()
            embedding = None
            if isinstance(data, dict):
                embedding = data.get("embedding") or data.get("vector") or data.get("embeddings")
            if isinstance(embedding, list):
                return embedding
        except Exception as e:
            print(f"[EURI] Embedding API failed: {e}")

    # Fallback deterministe (128-dim)
    dim = VECTOR_SIZE
    vec = [0.0] * dim
    for idx, ch in enumerate(text):
        vec[idx % dim] += (ord(ch) / 255.0)
    norm = sum(v * v for v in vec) ** 0.5
    if norm > 0:
        vec = [v / norm for v in vec]
    return vec


In [30]:
def ensure_collection(client: QdrantClient, name: str) -> None:
    try:
        client.recreate_collection(
            collection_name=name,
            vectors_config=VectorParams(size=VECTOR_SIZE, distance=Distance.COSINE)
        )
        print(f"Collection '{name}' recreated.")
    except Exception:
        print(f"Collection '{name}' already exists or could not be recreated.")


In [37]:
def build_and_store_dataset(papers, client, collection_name):
    points = []
    point_ids = []

    for paper in papers:
        # 👇 fallback: essaie "text", sinon "abstract", sinon "content"
        text = paper.get("text") or paper.get("abstract") or paper.get("content")
        if not text:
            continue  # skip si aucun texte

        chunks = chunk_text(text)
        embeddings = generate_embeddings(chunks)

        for chunk, vector in zip(chunks, embeddings):
            pid = str(uuid.uuid4())
            points.append(
                PointStruct(
                    id=pid,
                    vector=vector,
                    payload={
                        "title": paper.get("title", ""),
                        "text_snippet": chunk,
                        "lang": paper.get("lang", "en")
                    }
                )
            )
            point_ids.append(pid)

    if points:
        client.upsert(collection_name=collection_name, points=points)
    return point_ids


In [38]:
def _extract_payload_score(res: Any) -> Tuple[Dict[str, Any], float]:
    payload = getattr(res, "payload", None)
    score = getattr(res, "score", None)
    if payload is None and isinstance(res, dict):
        payload = res.get("payload", {})
        score = res.get("score", 0.0)
    return payload if payload is not None else {}, float(score if score is not None else 0.0)

def retrieve_papers(query: str, client: QdrantClient, collection_name: str = COLLECTION_NAME, top_k: int = 5) -> List[Dict[str, Any]]:
    vec = generate_embeddings(query)
    results = client.search(collection_name=collection_name, query_vector=vec, top=top_k, with_payload=True)
    hits: List[Dict[str, Any]] = []
    for r in results:
        payload, score = _extract_payload_score(r)
        title = payload.get("title")
        paper_id = payload.get("paper_id")
        chunk_id = payload.get("chunk_id")
        text = payload.get("text", "")
        hits.append({
            "score": score,
            "paper_id": paper_id,
            "title": title,
            "chunk_id": chunk_id,
            "text_snippet": text[:200] + ("..." if len(text) > 200 else "")
        })
    return hits


In [None]:
def run_demo():
    # Init Qdrant client
    
    qdrant_client = QdrantClient(
                                url="{QDRANT_URL}", 
                                api_key="{QDRANT_API_KEY}"
                                )

    # Ensure collection exists/established
    ensure_collection(qdrant_client, COLLECTION_NAME)

    # 1) Generate dataset
    papers = generate_dataset(n=6)

    # 2) Build and store (chunk + embed + store)
    print("Ingesting papers into Qdrant...")
    stored_ids = build_and_store_dataset(papers, qdrant_client, COLLECTION_NAME)
    print(f"Stored {len(stored_ids)} chunks across {len(papers)} papers.")

    # 3) Semantic query example
    query = "transformers in computer vision"
    print(f"Query: {query}")
    results = retrieve_papers(query, qdrant_client, COLLECTION_NAME, top_k=5)
    print("Top results:")
    for r in results:
        print(f"- Paper ID: {r['paper_id']}, Title: {r['title']}, Score: {r['score']:.4f}")
        print(f"  Snippet: {r['text_snippet']}")

# Execute the demo when running this notebook cell
run_demo()


  client.recreate_collection(


Collection 'dou_academic_papers' recreated.
Ingesting papers into Qdrant...
[EURI] Embedding API failed: HTTPSConnectionPool(host='api.euri.ai', port=443): Max retries exceeded with url: /v1/embed (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x000001D90EC090C0>: Failed to resolve 'api.euri.ai' (Name or service not known: api.euri.ai using 1 resolver(s))"))


TypeError: ord() expected a character, but string of length 268 found