# Cell 0 — Notebook Header / Info
"""
10_build_index.ipynb
Ziel: Mini-RAG-Index über Regel-Markdowns bauen (Clean Architecture / Layering).
Funktionen:
- Regeln einlesen -> chunken -> Embeddings -> FAISS-Index -> Queries + Zitate
Abhängigkeiten: faiss-cpu, sentence-transformers, numpy, pandas, tqdm, rank-bm25 (optional)
"""

In [None]:
# Cell 1 — Install (nur beim ersten Lauf nötig)
import sys, subprocess, pkgutil

def pip_install(pkgs):
    """Install every requirement in *pkgs* whose distribution is not installed yet.

    The check uses the distribution name (what pip installs, e.g. "faiss-cpu"),
    not the import name (e.g. "faiss"); the two differ for several packages,
    so a module lookup cannot be used to detect an existing installation.

    Args:
        pkgs: Iterable of pip requirement strings, optionally carrying a
            version specifier, extras or marker ("name==1.2", "name>=1", ...).

    Returns:
        None. Installation stays best-effort: a failing pip call is reported
        on stdout but does not raise.
    """
    import re
    from importlib import metadata

    for requirement in pkgs:
        # Cut the requirement at the first specifier/extras/marker character
        # to obtain the bare distribution name.
        dist_name = re.split(r"[\s<>=!~;\[]", requirement.strip(), maxsplit=1)[0]
        try:
            metadata.distribution(dist_name)
        except metadata.PackageNotFoundError:
            pass  # not installed -> fall through to pip
        else:
            continue  # already installed, nothing to do

        result = subprocess.run(
            [sys.executable, "-m", "pip", "install", "-q", requirement],
            check=False,
        )
        if result.returncode != 0:
            print(f"pip install fehlgeschlagen für {requirement!r} (Exit-Code {result.returncode})")

# Requirement names as passed to pip (distribution names, e.g. "faiss-cpu"
# for the module imported as `faiss`).
pip_install([
    "faiss-cpu",
    "sentence-transformers",
    "rank-bm25",
    "numpy",
    "pandas",
    "tqdm",
    "pyyaml"
])


In [None]:
# Cell 2 — Imports
import os, json, uuid, math, re, textwrap, pathlib
from pathlib import Path
from typing import List, Dict, Any
import numpy as np
import pandas as pd
from tqdm import tqdm

import faiss
from sentence_transformers import SentenceTransformer
# rank-bm25 is optional: HAS_BM25 records whether the import succeeded.
try:
    from rank_bm25 import BM25Okapi
except ImportError:
    # Only a missing package disables BM25; any other error should surface
    # instead of being swallowed by a bare except.
    BM25Okapi = None
    HAS_BM25 = False
else:
    HAS_BM25 = True


In [None]:
# Cell 3 — Konfiguration
# Directory layout, rooted at the current working directory.
BASE_DIR = Path.cwd()
RULES_DIR = BASE_DIR.joinpath("10_rag", "rules")  # expects *.md rule files
ARTIFACTS_DIR = BASE_DIR.joinpath("10_rag", "artifacts")
ARTIFACTS_DIR.mkdir(parents=True, exist_ok=True)

# Files written below the artifacts directory.
INDEX_PATH = ARTIFACTS_DIR.joinpath("faiss.index")
META_PATH = ARTIFACTS_DIR.joinpath("chunks_meta.jsonl")
BM25_PATH = ARTIFACTS_DIR.joinpath("bm25_corpus.jsonl")  # optional

# Embedding model and chunking/search parameters.
MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"  # small, fast
CHUNK_TOKENS = 500  # target chunk size (rough, no real tokenizer counting)
OVERLAP_TOKENS = 60  # overlap between consecutive chunks
TOP_K = 5  # default top-k for search results


In [None]:
# Cell 4 — Regeldateien prüfen & ggf. Demo-Regeln anlegen
# Demo rule set: maps a file name to its Markdown content. The entries are
# written into RULES_DIR only when that directory holds no *.md file yet.
SAMPLE_RULES = {
"clean_architecture.md": """
# Clean Architecture (Kurz)
- Domain kennt keine Infrastruktur-Details.
- Use Cases/Services enthalten Business-Logik.
- Adapter/Controller nur Ein-/Ausgabe, kein Fachwissen.
""",
"layered_architecture.md": """
# Layered Architecture (Kurz)
- Controller: HTTP, Validierung, Delegation an Service.
- Service: Geschäftsregeln, Policies, Transaktionen.
- Repository: Persistenz, keine Business-Filter/Mutationen.
""",
"repo_anti_patterns.md": """
# Repository Anti-Patterns
- Default-Methoden mit .filter/.map: schiebt Logik ins Repository.
- Seiteneffekte an Entities im Repo (set*): vermeiden.
- Queries mit impliziten Business-Regeln: in Service verschieben.
""",
"spring_layering.md": """
# Spring Layering Do/Do not
- Controller schlank halten; keine Entscheidungen wie Rabatte/Freigaben.
- Entities: keine "magischen" Getter mit Berechnung/Seiteneffekt.
- Service kapselt Policies (z. B. Rabattberechtigung).
"""
}

# Ensure the rules directory exists, then seed it with the demo rules when it
# does not contain any Markdown file yet.
RULES_DIR.mkdir(parents=True, exist_ok=True)
existing = list(RULES_DIR.glob("*.md"))

if len(existing) == 0:
    for filename in SAMPLE_RULES:
        body = textwrap.dedent(SAMPLE_RULES[filename]).strip()
        RULES_DIR.joinpath(filename).write_text(body, encoding="utf-8")

print("Regeln liegen in:", RULES_DIR)
found_names = [md_path.name for md_path in RULES_DIR.glob('*.md')]
print("Gefundene Dateien:", found_names)


In [None]:
# Cell 5 — Utility: grobe Token-Schätzung & Chunking (heading-aware light)
def rough_tokens(s: str) -> int:
    """Return a rough token estimate for *s*: its whitespace-separated word count.

    No real tokenizer is involved; a plain word count is good enough as a
    heuristic here. The result is never below 1, even for empty input.
    """
    word_count = sum(1 for _ in re.finditer(r"\S+", s))
    return word_count if word_count > 1 else 1

def split_by_headings(text: str) -> List[str]:
    """Split Markdown into sections, each starting at an ATX heading line.

    The split happens in front of every line that begins with 1-6 '#'
    characters followed by a space or tab. The heading line stays part of its
    section, so each chunk keeps the topic it belongs to. Requiring a space or
    tab (not any whitespace) after the hashes prevents a bare '#' line from
    being joined with the next line and treated as one heading.

    Args:
        text: Markdown source.

    Returns:
        The stripped, non-empty sections in document order; text in front of
        the first heading forms its own section. When nothing non-empty
        remains, a one-element list holding the unmodified *text*.
    """
    # Zero-width lookahead: split *before* the heading instead of consuming it.
    raw_sections = re.split(r"(?m)^(?=#{1,6}[ \t])", text)
    sections = [sec.strip() for sec in raw_sections if sec.strip()]
    return sections if sections else [text]

def sliding_window_chunks(text: str, max_tokens=CHUNK_TOKENS, overlap=OVERLAP_TOKENS) -> List[str]:
    """Cut *text* into overlapping windows of whitespace-separated words.

    Args:
        text: Text to split; words are joined with single spaces in the output.
        max_tokens: Maximum number of words per chunk (must be >= 1).
        overlap: Number of words shared by consecutive chunks. Negative values
            are treated as 0; values >= max_tokens degrade to a step of 1.

    Returns:
        The chunks in order; an empty list when *text* contains no words.

    Raises:
        ValueError: If *max_tokens* is smaller than 1.
    """
    if max_tokens < 1:
        raise ValueError("max_tokens must be >= 1")

    words = re.findall(r"\S+", text)
    # Clamp the overlap so the step can never exceed the window (no skipped words).
    step = max(1, max_tokens - max(0, overlap))

    chunks = []
    for start in range(0, len(words), step):
        chunks.append(" ".join(words[start:start + max_tokens]))
        # Once a window reaches the last word, any further window would only
        # repeat words already contained in this chunk.
        if start + max_tokens >= len(words):
            break
    return chunks

def chunk_markdown(md_text: str) -> List[str]:
    """Chunk a Markdown document section by section.

    Sections come from split_by_headings. A section whose rough token count is
    at most CHUNK_TOKENS becomes one chunk; a larger section is cut with
    sliding_window_chunks using CHUNK_TOKENS / OVERLAP_TOKENS.
    """
    pieces: List[str] = []
    for section in split_by_headings(md_text):
        fits_in_one_chunk = rough_tokens(section) <= CHUNK_TOKENS
        if fits_in_one_chunk:
            pieces.append(section)
            continue
        pieces += sliding_window_chunks(section, CHUNK_TOKENS, OVERLAP_TOKENS)
    return pieces


In [None]:
# Cell 6 — Regeln einlesen & Chunks mit Metadaten erzeugen
def load_rules_to_chunks(rules_dir: Path) -> List[Dict[str, Any]]:
    """Read every *.md file in *rules_dir* and return its chunks with metadata.

    Files are processed in sorted order. Each record holds the file name as
    "doc_id", the chunk's position within that file as "chunk_id", and the
    chunk text as "text".
    """
    records: List[Dict[str, Any]] = []
    for rule_path in sorted(rules_dir.glob("*.md")):
        pieces = chunk_markdown(rule_path.read_text(encoding="utf-8"))
        records.extend(
            {"doc_id": rule_path.name, "chunk_id": position, "text": piece}
            for position, piece in enumerate(pieces)
        )
    return records

# Build the chunk list from the rules directory.
chunks = load_rules_to_chunks(RULES_DIR)
# Cell output: (number of chunks, first chunk or None when there are none).
len(chunks), chunks[0] if chunks else None


In [None]:
# Cell 7 — Embedding-Modell laden und Embeddings berechnen
# Instantiate the embedding model once at module level; embed_texts reads this
# global on every call.
model = SentenceTransformer(MODEL_NAME)

def embed_texts(texts: List[str], batch_size=64) -> np.ndarray:
    """Encode *texts* with the module-level model and return a float32 array.

    Normalization is requested from the encoder (normalize_embeddings=True),
    and a progress bar is shown while encoding.
    """
    encoded = model.encode(
        texts,
        batch_size=batch_size,
        show_progress_bar=True,
        normalize_embeddings=True,
    )
    return np.asarray(encoded, dtype=np.float32)

# Embed the text of every chunk, in chunk order.
texts = [c["text"] for c in chunks]
embeddings = embed_texts(texts)
print("Embeddings shape:", embeddings.shape)


In [None]:
# Cell 8 — FAISS-Index bauen (cosine sim via dot product mit normalisierten Vektoren)
# The second axis of the embedding matrix is used as the index dimension.
dim = embeddings.shape[1]
index = faiss.IndexFlatIP(dim)  # inner product; equals cosine similarity for normalized vectors
index.add(embeddings)
print("Vektoren im Index:", index.ntotal)


In [None]:
# Cell 9 — Metadaten & Index persistieren
# Persist the FAISS index and the chunk metadata (one JSON object per line).
faiss.write_index(index, str(INDEX_PATH))
with open(META_PATH, "w", encoding="utf-8") as meta_file:
    meta_file.writelines(json.dumps(record, ensure_ascii=False) + "\n" for record in chunks)

# (Optional) persist the BM25 corpus as well (used for hybrid search).
if HAS_BM25:
    with open(BM25_PATH, "w", encoding="utf-8") as corpus_file:
        for c in chunks:
            corpus_entry = {field: c[field] for field in ("doc_id", "chunk_id", "text")}
            corpus_file.write(json.dumps(corpus_entry, ensure_ascii=False) + "\n")

bm25_note = "+ " + str(BM25_PATH) if HAS_BM25 else ""
print("Gespeichert:", INDEX_PATH, META_PATH, bm25_note)


In [None]:
# Cell 10 — Ladefunktionen (Index + Metadaten + optional BM25)
def load_index_and_meta():
    """Load the persisted FAISS index, the chunk metadata and, if possible, BM25.

    Returns:
        A 4-tuple (index, meta, corpus, bm25). `meta` is the list of records
        read from META_PATH. `corpus` and `bm25` are None unless rank-bm25 was
        importable and BM25_PATH exists; in that case `corpus` is the list of
        records read from BM25_PATH and `bm25` a BM25Okapi model built from the
        whitespace-split text of each record.
    """
    idx = faiss.read_index(str(INDEX_PATH))
    with open(META_PATH, "r", encoding="utf-8") as meta_file:
        meta = [json.loads(row) for row in meta_file]

    # Guard clause: without BM25 support or corpus file only index + metadata are returned.
    if not (HAS_BM25 and BM25_PATH.exists()):
        return idx, meta, None, None

    with open(BM25_PATH, "r", encoding="utf-8") as corpus_file:
        corpus = [json.loads(row) for row in corpus_file]
    tokenized_corpus = [entry["text"].split() for entry in corpus]
    return idx, meta, corpus, BM25Okapi(tokenized_corpus)

# Reload everything from disk; this rebinds `index` to the persisted copy.
index, meta, bm25_corpus, bm25 = load_index_and_meta()
# Cell output: (number of metadata records, number of BM25 corpus records or 0).
len(meta), (len(bm25_corpus) if bm25_corpus else 0)


In [None]:
# Cell 11 — Suche: Vektor-only und (optional) Hybrid (BM25 + Vektor mit einfacher Fusion)
def vector_search(query: str, k=TOP_K):
    """Return up to *k* nearest chunks for *query* from the FAISS index.

    Each hit is a dict with "score", "doc_id", "chunk_id", "text" and a
    1-based "rank". Index slots reported as -1 (no result) are skipped.
    """
    query_vec = embed_texts([query])
    scores, ids = index.search(query_vec, k)

    hits = []
    for score, position in zip(scores[0], ids[0]):
        if position != -1:
            entry = meta[position]
            hits.append({
                "score": float(score),
                "doc_id": entry["doc_id"],
                "chunk_id": entry["chunk_id"],
                "text": entry["text"],
                "rank": len(hits) + 1,
            })
    return hits

def hybrid_search(query: str, k=TOP_K, alpha=0.6):
    """Hybrid retrieval: late fusion of vector similarity and BM25 scores.

    Both score sets are min-max normalized to 0..1 and combined as
    alpha * vector + (1 - alpha) * bm25. A candidate missing from one of the
    two score tables contributes 0 for that part.

    Args:
        query: Free-text question; split on whitespace for BM25.
        k: Number of results to return.
        alpha: Weight in [0, 1] of the vector score; BM25 gets (1 - alpha).

    Returns:
        Up to *k* dicts with "score", "doc_id", "chunk_id", "text" and a
        1-based "rank", best first. Plain vector results when no BM25 model
        is loaded.

    Raises:
        KeyError: If a candidate has no entry in the chunk metadata.
    """
    vec = vector_search(query, k=max(k, 20))  # fetch a wider pool than k for the fusion
    if not bm25:
        return vec[:k]

    bm25_scores = bm25.get_scores(query.split())

    # Candidate keys in insertion order (vector hits first, then BM25 top-N).
    # A dict rather than a set keeps the order of equally scored results
    # stable between runs.
    candidates = {(r["doc_id"], r["chunk_id"]): None for r in vec}
    if bm25_corpus:
        for i in np.argsort(bm25_scores)[::-1][:max(k, 50)]:
            d = bm25_corpus[i]
            candidates.setdefault((d["doc_id"], d["chunk_id"]), None)

    def _min_max(scores):
        """Rescale the values of *scores* to 0..1 (all-equal values map to 0)."""
        if not scores:
            return scores
        lo, hi = min(scores.values()), max(scores.values())
        span = (hi - lo) or 1.0
        return {key: (val - lo) / span for key, val in scores.items()}

    v_scores = _min_max({(r["doc_id"], r["chunk_id"]): r["score"] for r in vec})

    b_scores = {}
    if bm25_corpus:
        b_scores = _min_max({
            (d["doc_id"], d["chunk_id"]): float(s)
            for d, s in zip(bm25_corpus, bm25_scores)
        })

    # One pass over the metadata instead of a linear scan per candidate;
    # setdefault keeps the first record for a key.
    meta_by_key = {}
    for m in meta:
        meta_by_key.setdefault((m["doc_id"], m["chunk_id"]), m)

    fused = []
    for key in candidates:
        if key not in meta_by_key:
            raise KeyError(f"no chunk metadata for candidate {key!r}")
        d = meta_by_key[key]
        fs = alpha * v_scores.get(key, 0.0) + (1 - alpha) * b_scores.get(key, 0.0)
        fused.append({
            "score": float(fs),
            "doc_id": d["doc_id"],
            "chunk_id": d["chunk_id"],
            "text": d["text"],
        })

    fused.sort(key=lambda x: x["score"], reverse=True)
    for i, r in enumerate(fused, start=1):
        r["rank"] = i
    return fused[:k]


In [None]:
# Cell 12 — Pretty-Print mit „Zitaten“-Feeling
def pretty(results: List[Dict[str, Any]], show_text=True):
    """Print search results as citation-style entries.

    Every result gets a header line "[rank] doc_id#chunk-chunk_id  score=x.xxx".
    With *show_text* enabled, the chunk text (wrapped at 100 columns) and an
    80-character dash line follow each header.
    """
    separator = "-" * 80
    for hit in results:
        print(f"[{hit['rank']}] {hit['doc_id']}#chunk-{hit['chunk_id']}  score={hit['score']:.3f}")
        if not show_text:
            continue
        print(textwrap.fill(hit["text"], width=100))
        print(separator)

# Example questions used by the demo cells.
query_examples = [
    "Darf ein Repository Geschäftslogik (Filter/Mutation) enthalten?",
    "Wo gehört Rabattberechnung laut Layered Architecture hin?",
    "Welche Aufgaben hat ein Spring Controller und was soll er nicht tun?"
]
# Unpack the list so that every query is printed as its own "- " bullet;
# passing the list object itself prints its repr as a single item.
print("Beispiel-Queries:", *query_examples, sep="\n- ")


In [None]:
# Cell 13 — Demo: Vektor-Suche
# Vector-only search for the first example query, printed with chunk text.
res = vector_search(query_examples[0], k=TOP_K)
pretty(res)


In [None]:
# Cell 14 — Demo: Hybrid-Suche (falls BM25 verfügbar)
# Run the hybrid demo only when a BM25 model was loaded; otherwise just report
# that it is unavailable.
if bm25:
    res_h = hybrid_search(query_examples[1], k=TOP_K, alpha=0.6)
    pretty(res_h)
else:
    print("BM25 nicht installiert oder Korpus fehlt — Vektor-Suche genügt für den Workshop.")


In [None]:
# Cell 15 — (Optional) Einfache "API" für den späteren Workflow/Agent
def retrieve_rules_for_explanation(question: str, k=TOP_K) -> List[Dict[str, Any]]:
    """
    Return the top-k passages (doc_id, chunk_id, text, score) for *question*.

    Uses hybrid search (alpha=0.6) when a BM25 model is loaded and plain
    vector search otherwise.
    """
    if not bm25:
        return vector_search(question, k=k)
    return hybrid_search(question, k=k, alpha=0.6)

# Quick test: fetch and print the top 3 passages for one sample question.
test = retrieve_rules_for_explanation("Warum ist Business-Logik im Controller problematisch?", k=3)
pretty(test)


In [None]:
# Cell 16 — Mini-Checks (asserts) für den Kurs
# Smoke checks: index, chunk list and embedding matrix must hold the same,
# non-zero number of entries ...
assert index.ntotal == len(chunks) == embeddings.shape[0] > 0
# ... and both persisted artifacts must exist on disk.
assert INDEX_PATH.exists() and META_PATH.exists()
print("Smoke-Tests OK ✅")
