In [2]:
import chromadb
import os
from collections import defaultdict
from tqdm import tqdm
from pathlib import Path
from dotenv import load_dotenv, find_dotenv
from langchain_core.documents import Document
from langchain_core.prompts import PromptTemplate
from langchain_chroma import Chroma
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader, Docx2txtLoader
from langchain_unstructured.document_loaders import UnstructuredLoader
from langchain.retrievers.multi_query import MultiQueryRetriever
from langchain.retrievers import MergerRetriever
from langchain.retrievers.document_compressors.flashrank_rerank import FlashrankRerank
from langchain.retrievers.contextual_compression import ContextualCompressionRetriever
from langchain.prompts import ChatPromptTemplate
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from unstructured.partition.pdf import partition_pdf
from unstructured.chunking.title import chunk_by_title
from unstructured.chunking.basic import chunk_elements
from unstructured.documents.elements import Image

# Load variables from the .env file located by find_dotenv() before the
# environment is read below.
load_dotenv(find_dotenv())

# Settings shared by the cells that follow.
EMBEDDING_MODEL = "text-embedding-ada-002"
DATABASE_PATH = "./chroma/"
# None when the variable is not set in the environment.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

def pretty_output(chunks, mode: str):
    """Print every chunk under a numbered header, followed by a dashed rule.

    Args:
        chunks: Iterable of chunk objects to print.
        mode: ``"elements"`` prints each chunk's ``text`` attribute,
            ``"documents"`` prints each chunk's ``page_content`` attribute.
            Any other value prints nothing.
    """
    # Pick the attribute that carries the text for the requested mode.
    if mode == "elements":
        text_attr = "text"
    elif mode == "documents":
        text_attr = "page_content"
    else:
        return

    rule = "-" * 120
    for number, item in enumerate(chunks, start=1):
        print(f"Chunk {number}:")
        print(getattr(item, text_attr))
        print(rule)

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Party label -> file name of that party's election programme PDF.
# NOTE(review): the "Linke" entry names a 2021 programme while every other
# file name refers to 2025 — confirm this is intended.
docs = dict(
    [
        ("BSW", "BSW_Wahlprogramm_2025__Entwurf_.pdf"),
        ("Grüne", "Grüne_BTW2025.pdf"),
        ("CDU", "CDU_BTW2025.pdf"),
        ("AfD", "AfD_Leitantrag-Bundestagswahlprogramm-2025.pdf"),
        ("Linke", "DIE_LINKE_Wahlprogramm_zur_Bundestagswahl_2021.pdf"),
        ("SPD", "BTW_2025_Wahlprogramm_SPD_Entwurf.pdf"),
        ("FDP", "fdp-wahlprogramm_2025.pdf"),
    ]
)

In [4]:
from os import path

# Chunker 2: settings forwarded to UnstructuredLoader for by-title chunking.
max_characters = 5000
new_after_n_chars = 1500
overlap = 1000
# Two thirds of new_after_n_chars, truncated to an int.
combine_text_under_n_chars_multiplier = int(new_after_n_chars * (2 / 3))


def _load_chunks(fpath):
    """Load one file from the ``files`` directory and return its chunks."""
    loader = UnstructuredLoader(
        file_path=path.join("files", fpath),
        languages=["deu"],
        chunking_strategy="by_title",
        max_characters=max_characters,
        new_after_n_chars=new_after_n_chars,
        combine_text_under_n_chars=combine_text_under_n_chars_multiplier,
        overlap=overlap,
        overlap_all=True,
    )
    return loader.load()


# Reset on every run so re-executing this cell does not accumulate chunks.
DOCS = []

for party, fpath in docs.items():
    chunks = _load_chunks(fpath)
    # Tag each chunk with the party whose file it was loaded from.
    for chunk in chunks:
        chunk.metadata["party"] = party
    DOCS.extend(chunks)


INFO: pikepdf C++ to Python logger bridge initialized


In [5]:
# Sanity check: total number of loaded chunks and the last chunk in DOCS.
len(DOCS), DOCS[-1]

(3903,
 Document(metadata={'source': 'files/fdp-wahlprogramm_2025.pdf', 'file_directory': 'files', 'filename': 'fdp-wahlprogramm_2025.pdf', 'languages': ['deu'], 'last_modified': '2024-12-29T12:28:54', 'page_number': 52, 'orig_elements': 'eJxlkMFuwyAQRH/F4hwsQyGFfEDPldqbFVnUrG0kY5C9VmJF+fcCinrpCc3bGTS77YPADB4W7Jwll4oI8970595SqdmZCsU11T0z1EqrhFQDwLkhp4p4QGMNmpR5kD6E1brFIGxFz+YIO3YTuHHCRJRgtUqhF785i1PCUstaJByDWzAn25ZrXrNT9a5Yra+n6k+np+i3Ruaf/uniT4Bsx4bg8yaf7g7zVzQ9kGcaDG6GzroVegzrkQ2ZbOQ1WoyHAm2kNzPNcQ3jarzveMNlHe1ASv9l3M1YtmyJhZ1cC92w88G6wUG5YYoIyjjl+pvxC1cXKXI6pmS37P4H1rw8z60Q7vlARLLswCOWEh8hYDI9r784THjj', 'filetype': 'application/pdf', 'category': 'CompositeElement', 'element_id': '65b9f80f666735472352629dfc924f31', 'party': 'FDP'}, page_content='nd Investitionspartnerschaften sowie Partnerschaften im Bereich Rohstoffe oder grüne Technologien insbesondere mit mittleren Mächten ein. Für uns Freie Demokraten ist der Ausbau der transatlantischen Handelsbeziehungen ein Schwerpunkt, mindes

In [6]:
# Persistent Chroma client rooted at <DATABASE_PATH>/<EMBEDDING_MODEL>, so the
# storage directory is named after the embedding model.
client = chromadb.PersistentClient(path=os.path.join(DATABASE_PATH, EMBEDDING_MODEL))

INFO: Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.


In [7]:
# Replace every list-valued metadata entry with its str() form, in place.
# Presumably needed because the vector store only accepts scalar metadata
# values — confirm against the Chroma documentation.
for chunk in DOCS:
    chunk.metadata.update(
        {
            key: str(value)
            for key, value in chunk.metadata.items()
            if isinstance(value, list)
        }
    )

In [8]:
# Index DOCS into the persistent "BTW2025" collection, but only when that
# collection is still empty. No ids are passed to Chroma.from_documents, so
# every unguarded re-run of this cell would add all chunks to the on-disk
# collection again and pay for the embeddings a second time.
# To rebuild the index (e.g. after changing the chunking settings), delete the
# collection first: client.delete_collection(COLLECTION_NAME).
COLLECTION_NAME = "BTW2025"
embeddings = OpenAIEmbeddings(api_key=OPENAI_API_KEY, model=EMBEDDING_MODEL)

# embedding_function=None: only the stored item count is needed here; the
# embeddings themselves are supplied through the LangChain wrapper below.
existing_count = client.get_or_create_collection(
    name=COLLECTION_NAME,
    embedding_function=None,
).count()

if existing_count == 0:
    vectorstore = Chroma.from_documents(
        documents=DOCS,
        embedding=embeddings,
        client=client,
        collection_name=COLLECTION_NAME,
    )
else:
    # Collection already populated by an earlier run: just reopen it.
    vectorstore = Chroma(
        client=client,
        collection_name=COLLECTION_NAME,
        embedding_function=embeddings,
    )

vectorstore

INFO: HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO: HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO: HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO: HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


<langchain_chroma.vectorstores.Chroma at 0x7f4dd4b0c510>