# BUNDESDATA RAG 


In [52]:
import json, requests
from typing import Any, List 
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_ollama import OllamaEmbeddings, ChatOllama
from langchain_community.vectorstores import Chroma
from ollama._types import ResponseError

## Daten laden und vorbereiten 

In [53]:
# Base URL of the API queried by build_documents(): it is requested directly to
# obtain the list of roads, and "/{road_id}/services/{service}" is appended to
# it for the per-road service data.
BASEURL = "https://verkehr.autobahn.de/o/autobahn"
# Service endpoint names fetched for every road. get_items() also uses the
# service name as the key under which it looks for the item list in a response.
SERVICES = [
    "roadworks",
    "parking_lorry",
    "warning",
    "closure",
    "electric_charging_station",
]

**stringify_all_fields:**
Converts a Python object (e.g. a dict) into a JSON string, which is required for document indexing and processing with LangChain.

In [54]:
def stringify_all_fields(obj: Any) -> str:
    """Serialize ``obj`` to a pretty-printed JSON string with sorted keys.

    Args:
        obj: Any object; typically a dict decoded from an API response.

    Returns:
        The JSON text, indented by two spaces, with keys sorted and non-ASCII
        characters (e.g. German umlauts) kept as-is instead of being escaped.
    """
    return json.dumps(
        obj,
        ensure_ascii=False,  # keep special characters readable
        indent=2,
        sort_keys=True,
        # Values json cannot encode natively (dates, sets, ...) are written as
        # their str() form instead of raising TypeError.
        default=str,
    )

In [55]:
def get_items(payload: Any, service: str) -> List[Any]:
    """Extract the list of items from a decoded service response.

    Args:
        payload: The decoded JSON response.
        service: Name of the service; used as the key to look up in ``payload``.

    Returns:
        ``payload`` itself when it is a list; ``payload[service]`` when
        ``payload`` is a dict holding a list under that key; otherwise a
        one-element list wrapping ``payload``.
    """
    # A bare list is already the item list.
    if isinstance(payload, list):
        return payload

    # A dict is expected to carry the items under the service name.
    if isinstance(payload, dict):
        candidate = payload.get(service)
        if isinstance(candidate, list):
            return candidate

    # Anything else is treated as a single item.
    return [payload]

**to_documents:**
Helper that converts the items of a JSON response into LangChain documents, using stringify_all_fields() to get a text representation of all fields.
Additionally, the road_id, service, title and description are placed at the top of each document's text, and road_id, service and title are stored as metadata, so that every document can be attributed to its road and service.

In [56]:
def _field_to_text(value: Any) -> str:
    """Render a title/description field as plain text (lists are joined line by line)."""
    if value is None:
        return ""
    if isinstance(value, (list, tuple)):
        return "\n".join(str(part) for part in value if part is not None)
    return str(value)


def to_documents(road_id: str, service: str, items: List[Any], source_url: str) -> List[Document]:
    """Convert the items of one road/service response into LangChain documents.

    Each item becomes one ``Document``. Its text starts with a short header
    (road, service and, when present, title and description), followed by the
    complete item serialized with ``stringify_all_fields``.

    Args:
        road_id: Identifier of the road the items belong to.
        service: Name of the service the items were fetched from.
        items: The items to convert; dict items are searched for
            ``title``/``name`` and ``description``/``text``.
        source_url: URL the items were fetched from (stored as metadata).

    Returns:
        One document per item, in input order, with ``road_id``, ``service``,
        ``source_url``, ``item_index`` and ``title`` as metadata.
    """
    docs: List[Document] = []
    for i, item in enumerate(items):
        title = ""
        description = ""
        if isinstance(item, dict):
            # Fall back to "name"/"text" when "title"/"description" is missing or empty.
            # Fields may be lists of lines; flatten them instead of embedding a
            # Python list repr, and make sure the metadata title is a string.
            title = _field_to_text(item.get("title") or item.get("name"))
            description = _field_to_text(item.get("description") or item.get("text"))

        # Structured header; lines for absent fields are left out entirely.
        header = [f"road: {road_id}", f"service: {service}"]
        if title:
            header.append(f"title: {title}")
        if description:
            header.append(f"description: {description}")

        page_content = "\n".join(
            header
            + [
                "",
                "ALL_FIELDS_JSON:",
                # full item as text so that no field is lost for retrieval
                stringify_all_fields(item),
            ]
        ).strip()

        docs.append(
            Document(
                page_content=page_content,
                metadata={
                    "road_id": road_id,
                    "service": service,
                    "source_url": source_url,
                    "item_index": i,
                    "title": title,
                },
            )
        )
    return docs

**build_documents():**
This function does the API calls. We go through all road ids and services (endpoints) available and convert them into a list of langchain documents.

In [57]:
def build_documents() -> List[Document]:
    """Download every configured service for every road and return the data as documents.

    The road list is read from ``BASEURL``; afterwards each road is combined
    with each entry of ``SERVICES`` and the corresponding endpoint is fetched.

    Returns:
        All documents produced by ``to_documents`` for all road/service pairs.

    Raises:
        requests.HTTPError: As soon as any request returns an error status
            (via ``raise_for_status``); nothing is returned in that case.
    """
    collected: List[Document] = []
    with requests.Session() as session:
        # The overview endpoint lists all road ids.
        overview = session.get(BASEURL, timeout=20)
        overview.raise_for_status()
        road_ids = overview.json()["roads"]

        # Every (road, service) combination together with its endpoint URL.
        endpoints = [
            (road_id, service, f"{BASEURL}/{road_id}/services/{service}")
            for road_id in road_ids
            for service in SERVICES
        ]

        for road_id, service, url in endpoints:
            response = session.get(url, timeout=20)
            response.raise_for_status()
            items = get_items(response.json(), service)
            collected += to_documents(road_id, service, items, url)
    return collected

## Embedding Model initialisieren (mit dem Uni-Ollama-Server)

In [58]:
# Base URL of the Ollama server; passed as base_url to both OllamaEmbeddings and ChatOllama.
LLM_URL = "http://132.199.138.16:11434"
# Name of the chat model passed to ChatOllama.
LLM_MODEL = "gpt-oss:20b"
# Name of the embedding model passed to OllamaEmbeddings.
EMBEDDING_MODEL = "nomic-embed-text"

***Split text using RecursiveCharacterTextSplitter***

In [59]:
# Splitter that cuts the documents into overlapping chunks before embedding.
# NOTE(review): chunk_size/chunk_overlap are presumably measured in characters
# (the splitter's default length function) — confirm against the library docs.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=512,
    chunk_overlap=50,
    separators=["\n", ",", " ", ""]
)

# Fetches all roads and services over the network (see build_documents()).
docs = build_documents()

# Chunked documents; these are what gets added to the vector store later on.
splitted_docs = text_splitter.split_documents(docs)

# Number of chunks produced.
print(len(splitted_docs))

25201


***add_in_batches():***

We encountered an "EOF (status code: 500)" error during the embedding request. To avoid this error, we add the documents in smaller batches (the default is batch_size = 64; the call below uses batch_size = 32) and retry a failed batch.

In [60]:
def add_in_batches(
    vectorstore: Chroma,
    docs: List[Document],
    batch_size: int = 64,
    max_retries: int = 3,
    retry_delay: float = 1.0,
) -> None:
    """Add ``docs`` to ``vectorstore`` in batches, retrying batches that fail.

    Args:
        vectorstore: Target vector store; ``add_documents`` is called per batch.
        docs: Documents to add.
        batch_size: Maximum number of documents per ``add_documents`` call.
        max_retries: Maximum number of attempts per batch (at least 1).
        retry_delay: Base pause in seconds before the next attempt; the pause
            grows linearly with the attempt number. Use 0 to retry immediately.

    Raises:
        ValueError: If ``batch_size`` or ``max_retries`` is smaller than 1.
        ResponseError: If a batch still fails on its last attempt.
    """
    import time  # local import so this cell does not depend on an edit to the import cell

    # Without these checks a negative batch_size or max_retries < 1 would
    # silently add nothing at all.
    if batch_size < 1:
        raise ValueError("batch_size must be >= 1")
    if max_retries < 1:
        raise ValueError("max_retries must be >= 1")

    for start in range(0, len(docs), batch_size):
        batch = docs[start : start + batch_size]

        for attempt in range(1, max_retries + 1):
            try:
                vectorstore.add_documents(batch)
                break
            except ResponseError:
                if attempt == max_retries:
                    raise
                # Give the server a moment before trying the same batch again.
                time.sleep(retry_delay * attempt)

***Initialize Ollama-Embedding-Model***

In [61]:
# Embedding client that talks to the Ollama server configured above.
ollama_embeddings = OllamaEmbeddings(
    model=EMBEDDING_MODEL,
    base_url=LLM_URL,
)

# Best-effort cleanup of the collection left over from a previous run of this
# cell (presumably so that a re-run does not add the same chunks a second time).
try:
    vectorstore.delete_collection()
except NameError:
    # Fresh kernel: `vectorstore` does not exist yet, so there is nothing to delete.
    pass
except Exception as exc:
    # Keep going, but do not hide the problem.
    print("Could not delete previous collection:", exc)

vectorstore = Chroma(collection_name="autobahn-rag", embedding_function=ollama_embeddings)

# Embed and store all chunks in small batches (see add_in_batches()).
add_in_batches(
    vectorstore,
    splitted_docs,
    batch_size=32,
    max_retries=3,
)

# NOTE(review): `_collection` is a private attribute of the Chroma wrapper.
print("Docs in Chroma:", vectorstore._collection.count())

# Retriever returning the 5 best-matching chunks per query.
vector_retriever = vectorstore.as_retriever(search_kwargs={"k": 5})

Docs in Chroma: 25201


In [62]:
# Chat model used to generate the answers; served by the same Ollama server
# (LLM_URL) as the embedding model.
# NOTE(review): temperature=1 looks high for answers that should stick to the
# retrieved context — confirm this is intended.
llm = ChatOllama(
    model=LLM_MODEL,
    base_url=LLM_URL,
    temperature=1
)

In [63]:
# Example query: retrieve the matching chunks and merge them into one context string.
question = "Parkplatz auf der A93"
context_docs = vector_retriever.invoke(question)
context = "\n\n---\n\n".join(d.page_content for d in context_docs)

# Instructions, then the question, then the retrieved context. The blank line
# after the term list keeps it from running straight into "Frage:".
prompt = (
    "Beantworte die Frage mit Kontext. "
    "Wenn Informationen fehlen, sage, was im Kontext nicht enthalten ist, aber antworte trotzdem so weit wie möglich.\n\n"
    "Falls die Frage deutsche Begriffe enthält, suche im Kontext nach folgenden englischen Begriffen: parking lorries - Parkplätze, closures - Sperrungen, electric charging stations - Ladestationen, warnings - Warnungen, roadworks - Baustellen\n\n"
    f"Frage: {question}\n\n"
    f"Kontext:\n{context}"
)

answer = llm.invoke(prompt)
print("\nQuestion:", question)
print("\n[ANSWER]:", answer.content)


Question: Parkplatz auf der A93

[ANSWER]: **Parkplatz auf der A93 – Zusammenfassung aus dem Kontext**

| Segment | Lage des Parkplatzes | Relevante Informationen |
|---------|-----------------------|-------------------------|
| **A93 – Inntal → Nicklheim** | „A93: Rosenheim → Kiefersfelden, zwischen 0,9 km hinter AD Inntal und 1,3 km vor Nicklheim“ | • **Parking‑Area**: Der Parkplatz liegt unmittelbar vor dem Ort „Nicklheim“. <br>• **Verkehrssituation**: Baustelle ist für die Zeiträume **11. Februar 2026 von 08:00 bis 14:00 Uhr** gültig. <br>• **Baustelleneigenschaften**: Länge 1 km, maximale Fahrgeschwindigkeit 80 km/h, maximale Durchfahrtsbreite 4 m. <br>• **Zweck**: „A93 von Inntal (AD) nach Nicklheim (Parkplatz) Unterhaltungsarbeiten“ – d.h. die Arbeiten betreffen den Abschnitt bis zum Parkplatz. |

**Was im Kontext nicht enthalten ist**

* **Andere Parkplätze**: Der Text nennt nur den Parkplatz vor Nicklheim; weitere Parkplätze auf der A93 werden nicht erwähnt.  
* **Ladestation

## Gradio UI 

In [64]:
import gradio as gr

def rag_answer(question: str) -> str:
    """Answer ``question`` with the LLM, using retrieved chunks as context.

    Args:
        question: The user's question as entered in the UI.

    Returns:
        The text of the LLM's answer, or a short hint when the question is
        empty (no retrieval or LLM call happens in that case).
    """
    # Nothing to retrieve or answer for an empty input.
    if not question or not question.strip():
        return "Bitte gib eine Frage ein."

    # Fetch the best-matching chunks for the question.
    context_docs = vector_retriever.invoke(question)

    # Merge all chunks into one string, separated by a visible divider.
    context = "\n\n---\n\n".join(d.page_content for d in context_docs)

    # Instructions, then the question, then the retrieved context. The blank
    # line after the term list keeps it from running straight into "Frage:".
    prompt = (
        "Beantworte die Frage mit Kontext. "
        "Wenn Informationen fehlen, sage, was im Kontext nicht enthalten ist, aber antworte trotzdem so weit wie möglich.\n\n"
        "Falls die Frage deutsche Begriffe enthält, suche im Kontext nach folgenden englischen Begriffen: parking lorries - Parkplätze, closures - Sperrungen, electric charging stations - Ladestationen, warnings - Warnungen, roadworks - Baustellen\n\n"
        f"Frage: {question}\n\n"
        f"Kontext:\n{context}"
    )

    # Ask the LLM and hand back only the answer text.
    answer = llm.invoke(prompt)

    return answer.content


# Minimal Gradio UI: a question box, a button and a read-only answer box.
with gr.Blocks(title="Bundesdata RAG") as demo:
    gr.Markdown("Autobahn Bundesdata RAG")
    gr.Markdown(
        "Stelle eine Frage zum Thema deutsche Autobahnen. "
        "Es wird Kontext via Retriever geholt und das LLM antwortet soweit möglich aus dem Kontext. "
        "Im Kontext vorhandene Themenbereiche sind roadworks, parkinglorries, warnings, closures und electric charging stations."
    )

    # Input field for the user's question.
    question_in = gr.Textbox(
        label="Stelle deine Frage",
        placeholder="z.B. gib mir alle Warnings auf der A93",
        lines=2,
    )

    ask_btn = gr.Button("Antwort generieren")

    # Output field; not editable by the user.
    answer_out = gr.Textbox(
        label="Antwort",
        lines=10,
        interactive=False
    )

   

    # Clicking the button passes the question to rag_answer() and shows its
    # return value in the answer box.
    ask_btn.click(
        fn=rag_answer,
        inputs=question_in,
        outputs=[answer_out],
    )

# Start the local web server for the UI.
demo.launch()


* Running on local URL:  http://127.0.0.1:7868
* To create a public link, set `share=True` in `launch()`.


