In [1]:
!pip install  --upgrade langchain langchain-community langchain_chroma langchain_openai langchain_unstructured  unstructured[pdf] chromadb

Collecting langchain
  Downloading langchain-0.3.13-py3-none-any.whl.metadata (7.1 kB)
Collecting langchain-community
  Using cached langchain_community-0.3.12-py3-none-any.whl.metadata (2.9 kB)
Collecting langchain_chroma
  Using cached langchain_chroma-0.1.4-py3-none-any.whl.metadata (1.6 kB)
Collecting langchain_openai
  Downloading langchain_openai-0.2.13-py3-none-any.whl.metadata (2.7 kB)
Collecting langchain_unstructured
  Using cached langchain_unstructured-0.1.6-py3-none-any.whl.metadata (3.3 kB)
Collecting chromadb
  Using cached chromadb-0.5.23-py3-none-any.whl.metadata (6.8 kB)
Collecting unstructured[pdf]
  Using cached unstructured-0.16.11-py3-none-any.whl.metadata (24 kB)
Collecting langchain-core<0.4.0,>=0.3.26 (from langchain)
  Downloading langchain_core-0.3.27-py3-none-any.whl.metadata (6.3 kB)
Collecting langchain-text-splitters<0.4.0,>=0.3.3 (from langchain)
  Downloading langchain_text_splitters-0.3.4-py3-none-any.whl.metadata (2.3 kB)
Collecting langsmith<0.3,>=0.

In [1]:
import chromadb
import os
from collections import defaultdict
from tqdm import tqdm
from pathlib import Path
from dotenv import load_dotenv, find_dotenv
from langchain_core.documents import Document
from langchain_core.prompts import PromptTemplate
from langchain_chroma import Chroma
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader, Docx2txtLoader
from langchain_unstructured.document_loaders import UnstructuredLoader
from langchain.retrievers.multi_query import MultiQueryRetriever
from langchain.retrievers import MergerRetriever
from langchain.retrievers.document_compressors.flashrank_rerank import FlashrankRerank
from langchain.retrievers.contextual_compression import ContextualCompressionRetriever
from langchain.prompts import ChatPromptTemplate
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from unstructured.partition.pdf import partition_pdf
from unstructured.chunking.title import chunk_by_title
from unstructured.chunking.basic import chunk_elements
from unstructured.documents.elements import Image

load_dotenv(find_dotenv())

OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
DATABASE_PATH = "./chroma/"
EMBEDDING_MODEL = "text-embedding-ada-002"

def pretty_output(chunks, mode: str):
    if mode == "elements":
        for i, chunk in enumerate(chunks, 1):
            print(f"Chunk {i}:")
            print(chunk.text)
            print("-" * 120)
            
    elif mode == "documents":
        for i, chunk in enumerate(chunks, 1):
            print(f"Chunk {i}:")
            print(chunk.page_content)
            print("-" * 120)

In [2]:
docs = {
    "BSW": "BSW_Parteiprogramm.pdf",
    "Grüne": "Grüne_BTW2025.pdf",
    "CDU": "CDU_BTW2025.pdf",
    "AfD": "Programm_AfD_Online_.pdf",
    "Linke": "DIE_LINKE_Wahlprogramm_zur_Bundestagswahl_2021.pdf",
    "SPD": "SPD-Zukunftsprogramm.pdf",
    "FDP": "FDP_Programm_Bundestagswahl2021_1.pdf"
}

In [3]:
from os import path

# Chunker 2
max_characters = 5000
new_after_n_chars = 1500
overlap = 1000
combine_text_under_n_chars_multiplier=int(new_after_n_chars*(2/3))

DOCS = []

for (party, fpath) in docs.items():
    chunks = UnstructuredLoader(
        file_path=path.join("data", fpath),
        languages=["deu"],
        chunking_strategy="by_title",
        max_characters=max_characters,
        overlap=overlap,
        overlap_all=True,
        combine_text_under_n_chars=combine_text_under_n_chars_multiplier,
        new_after_n_chars=new_after_n_chars,
    ).load()
    for chunk in chunks:
        chunk.metadata["party"] = party
    #print(len(chunks), chunks[0])
    DOCS += chunks


INFO: pikepdf C++ to Python logger bridge initialized


In [4]:
len(DOCS), DOCS[-1]

(3292,
 Document(metadata={'source': 'data/FDP_Programm_Bundestagswahl2021_1.pdf', 'file_directory': 'data', 'filename': 'FDP_Programm_Bundestagswahl2021_1.pdf', 'languages': ['deu'], 'last_modified': '2021-08-05T10:44:54', 'page_number': 67, 'orig_elements': 'eJzdVE1v3DYQ/SuETi2wFEiJEsU9NanjHIIWRm0gB9dYUOJQIixRC4obxw7y3zPUrg033WsKNBeBbz7ImXlPc/slgxEm8HHnTLYlWa2ACdZ1tGC6o6ICTluuORWyUa1tS8ONyDYkmyBqo6PGnC9ZN8/BOK8jLCse9eN8iLsBXD9EtDSC543CrJPjwZk4oL1SVV7IGh372fmYkm9vS47BG1LXKpd3G/KCZXPEFa9ydgav8WjIlsclwpSauXKfYbze6w6yr+iwboSdcQG6OIfHFLB2cPJ4PUGyXV5c7a7C3Ac9Tbu3B29gibpfHvQwFqzgO57vjc3WZnx/0P3a821m4JDdrdYl7qbZOOtgnWhKoqyhrLrhbCvEtloHuMfMnT9MLQSMqmUqMMLnNK7swi33wU3OOwiABZA3Y4Sw9AiewBPdLt2grQWfLoqP+7XuGxfHtc/vKa0aBUZySaVkhgrVMqq7uqBFwxUopptaqf+GUiVy9opSWZdHXAl1Dh/jfw5KP7pALgM4IBcwzfcBB+uJxRlD8MSg+c2J1IPvEb/wT8mqgPBPCeTkPSwQn4BgM+RaxydMwwsnF0kLeFH4+8AYCE/JfUr33ylomccxCelgiRtCSkzxtu5H1w3gXwqg5FmCnvw+6KDv8RYyOE/W+A473Yfj0ca1FnAeeznYAdrUybyW7vd6WRJM7xKvu4Es+KFrxghuiehcenypG9JcPgSH7zg8fZoD9jVPKGf

In [5]:
client = chromadb.PersistentClient(
    path=os.path.join(DATABASE_PATH, f"{EMBEDDING_MODEL}"),
)

INFO: Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.


In [6]:
for chunk in DOCS:
    for md in chunk.metadata:
        if isinstance(chunk.metadata[md], list):
            chunk.metadata[md] = str(chunk.metadata[md])

In [9]:
Chroma.from_documents(
    documents=DOCS,
    embedding=OpenAIEmbeddings(api_key=OPENAI_API_KEY, model=EMBEDDING_MODEL),
    client=client,
    collection_name=f"BTW2025",
)

INFO: HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO: HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO: HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO: HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


<langchain_chroma.vectorstores.Chroma at 0x7fe47451abd0>

In [10]:
LLM = ChatOpenAI(
    api_key=OPENAI_API_KEY,
    model="gpt-4o-mini",
    temperature=0.0,
)

PROMPT = ChatPromptTemplate([
    ("system", """Du bist ein Experte für politische Fragen zur Bundestagswahl und beantwortest die Fragen der Benutzer auf Basis des bereitgestellten Kontext. 
Der Kontext besteht aus eine Aufstellung der Aussagen einzelner Parteien zu der Fragestellung des Benutzers.

- Wenn die Frage anhand des Kontext beantwortet werden kann, gib in Deiner Antwort jeweils an, zu welcher Partei eine Aussage gehört.
- Wenn es Aussagen mehrerer Parteien gibt, stelle die Aussagen der Parteien gegenüber und verdeutliche die Unterschieder der Parteien.
- Wenn die Frage im Kontext nicht eindeutig beantwortet werden kann oder keine ausreichenden Informationen vorliegen, gib an, dass du die Frage nicht beantworten kannst.
- Achte besonders darauf, dass du keine Informationen hinzufügst, die nicht im Kontext enthalten sind.
- Gib am Ende Zitate aus den Aussagen der Parteien an, die Deine Zusammenfassung nachvollziehbar machen.

Wenn in der Frage nach der Position einer bestimmten Partei gefragt wird, gehe in der Antwort auf diese Partei ein.
Wenn in der Frage keine Partei explizit erwähnt wird, erstelle eine Übersicht der Positionen der folgenden Parteien:
- CDU
- SPD
- Grüne
- AfD
- FDP
- BSW
- Linke

Am Ende deiner Antwort weise bitte darauf hin, dass du ein ChatBot bist und die Antwort unbedingt von einer qualifizierten Person überprüft werden sollte.

<kontext>
{context}
</kontext>"""),
    ("human", "Frage: {input}")
])


In [11]:
vectorstore = Chroma(
        collection_name=f"BTW2025",
        client=client,
        embedding_function=OpenAIEmbeddings(model=EMBEDDING_MODEL, api_key=OPENAI_API_KEY),
        create_collection_if_not_exists=False
    )

PARTY_RETRIEVERS = [ 
    vectorstore.as_retriever(
        search_type='similarity',
        search_kwargs={
            'k': 3,
            'filter': {'party': party}
        }
    ) for party in docs.keys() ]

lotr = MergerRetriever(retrievers=PARTY_RETRIEVERS)


In [12]:
retrieval_chain = create_retrieval_chain(
    retriever=lotr,
    combine_docs_chain=create_stuff_documents_chain(
        llm=LLM,
        prompt=PROMPT,
        document_prompt=PromptTemplate.from_template("{party}: {page_content}")
    )
)

In [16]:
retrieval_chain.invoke({"input": "Was ist die Haltung der Parteien zur Schuldenbremse und zu Investitionen?"})

INFO: HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO: HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO: HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO: HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO: HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO: HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO: HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO: HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


{'input': 'Was ist die Haltung der Parteien zur Schuldenbremse und zu Investitionen?',
 'context': [Document(metadata={'category': 'CompositeElement', 'element_id': 'fbf329a6687bf9c3e47bbfac0b77376e', 'file_directory': 'data', 'filename': 'BSW_Parteiprogramm.pdf', 'filetype': 'application/pdf', 'languages': "['deu']", 'last_modified': '2024-12-03T13:09:57', 'orig_elements': 'eJzVVE1v4zYU/CsPOluqPmzZym2DTVtgN0XQpA2QNDBo8UkiLFEGSSUbL/a/75DSLtJtrz30YMAkH9/MvBnq8XPEPQ+s3V7J6IKiqsqrujw08aEu8nidFnUs0oLjzbrcldlWsMg4WlE0sBNSOIE7n6N6HI1UWji2Yd2L13Fy+45V2zns7NZZsqtwazl4UdJ12N9UmyTfljg4jUo7f/nxMdslKN1laZI9rejbMi+TMizLKkn/ZR3KsRHZV+t48FJu1Cfub0+i5ugLDhrV814qw7UbzasvCPyXEy0G9nuXt/f7G2Ecq5MZWyOGITnJJgrcdTuJNkh8jCRP0VPYtW4/jFI1isMA8zRfx1kep8VdVlyk1cVm62+fcHOvp+HABlW5J+T4k1sg6a8pT7OC/mSjJ904mrSkXxhcO6faIytH9U8jXSstrReE+jQ9bOmDEZNlbZ1JqIqz9Nt+lmbZli7Z9Ep7dPd6Cup+HkcHAkD/0fdsK3dpnudxc8iqeH3Y7GKxq6uY11m5yWtOudj9t74XsHSzoizPZuOX9TrLksKvN8UuKf+5nuv/n86/V+wt96bxWks2k25JsqV77t2xV4OwIQkSdQ9srPOVTRnKJm2RD0OYft

INFO: HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO: HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO: HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO: HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO: HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO: HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO: HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO: HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


In [15]:
import gradio as gr
import urllib

def pretty_output(answer, context):
    return_str = f"<span>{answer}</span><br><br><span>Kontext:</span><br><ul>"
    for doc in context:
        file_path = doc.metadata["source"]
        formatted_path = file_path.replace("\\", "/").replace("Data", "Source")
        encoded_path = urllib.parse.quote(formatted_path)
        file_url = f"file:///{encoded_path}"
        return_str += f"<li><a href='{file_url}' target='_blank'>{os.path.basename(file_path)}</a><span> - </span><span>Seite {doc.metadata['page_number']}</span></li>"
    
    return return_str + "</ul>"

def generate(query, history):   
    response = retrieval_chain.invoke({"input": query, "history": history })
    answer = pretty_output(response['answer'], response['context'])
    return answer

chat = gr.ChatInterface(
    fn=generate,
    type="messages",

    examples=[
        [
            "Was ist die Haltung der Parteien zum Klimawandel?",
        ],
        [
            "Was ist die Haltung der Parteien zum Thema Migration?",
        ]
    ]
)

chat.launch(share=True)



INFO: HTTP Request: GET http://127.0.0.1:7861/gradio_api/startup-events "HTTP/1.1 200 OK"
INFO: HTTP Request: HEAD http://127.0.0.1:7861/ "HTTP/1.1 200 OK"


* Running on local URL:  http://127.0.0.1:7861


INFO: HTTP Request: GET https://api.gradio.app/pkg-version "HTTP/1.1 200 OK"
INFO: HTTP Request: GET https://api.gradio.app/v3/tunnel-request "HTTP/1.1 200 OK"


* Running on public URL: https://9faa2bc6cd97f2eb29.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


INFO: HTTP Request: HEAD https://9faa2bc6cd97f2eb29.gradio.live "HTTP/1.1 200 OK"




INFO: HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO: HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO: HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO: HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO: HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO: HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO: HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO: HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO: HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO: HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO: HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO: HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO: HTTP Request: POST https://a