In [1]:
# Rebuild switches: when a flag is True the corresponding database is rebuilt
# via build_database(); when it is False load_database() is called instead.
build_manifesto_database: bool = True
build_debate_database: bool = True
# Once the flags are set, run all cells.

In [2]:
import sys

# Put the parent directory on the import path (presumably where the
# project-local `database` and `models` packages live -- confirm against the
# repository layout). The membership guard keeps this cell idempotent:
# re-running it does not append a duplicate ".." entry to sys.path.
if ".." not in sys.path:
    sys.path.append("..")

In [3]:
# Here we import our modules

from database.vector_database import VectorDatabase
from models.embedding import (
    ManifestoBertaEmbeddings,
    SentenceTransformerEmbedding,
    JinaAIEmbedding,
)
from langchain_openai import OpenAIEmbeddings

# Define embedding model

In [4]:
# Select the embedding model shared by every database in this notebook.
# `embedding_name` is interpolated into the database directory paths, so it
# must be changed together with `embedding_model`.
#
# Alternative model/name pairs (swap in exactly one pair):
#
# embedding_model = ManifestoBertaEmbeddings()
# embedding_name = "manifestoberta"
#
# embedding_model = OpenAIEmbeddings(model="text-embedding-3-large")
# embedding_name = "openai"
#
# embedding_model = SentenceTransformerEmbedding("multi-qa-mpnet-base-dot-v1")
# embedding_name = "multi-qa-mpnet-base"

# Active choice
embedding_name = "jina-embeddings-v2-base-de"
embedding_model = JinaAIEmbedding()

# (Re)Build Debate Database

In [5]:
# Paths for the debate database: the Chroma directory is keyed by the
# embedding name, and the raw data is a CSV file.
# NOTE(review): pandas is not referenced in this cell; the import is kept
# because later cells may rely on `pd`. Consider moving it to the imports cell.
import pandas as pd

DATABASE_DIR = f"../../data/debates/chroma/{embedding_name}/"
DATA_PATH = "../../data/debates/europarl_speeches.csv"

# Instantiate the vector database wrapper for the debates, read with the
# "csv" loader.
database_debates = VectorDatabase(
    embedding_model=embedding_model,
    source_type="debates",
    data_path=DATA_PATH,
    database_directory=DATABASE_DIR,
    loader="csv",
    reload=False,
)

# Build or load database, depending on flag
if build_debate_database:
    database_debates.build_database()
    # Sanity check: the build must have produced a Chroma vector store.
    # Walking the MRO by class name (rather than comparing the full
    # str(type(...)) repr) keeps the check valid for subclasses and for a
    # Chroma class exposed under a different module path, and needs no
    # additional import. The message reports the actual type on failure.
    assert any(
        cls.__name__ == "Chroma"
        for cls in type(database_debates.database).__mro__
    ), (
        "build_database() did not produce a Chroma vector store; got "
        f"{type(database_debates.database)!r}"
    )
else:
    database_debates.load_database()

Loading data and creating splits...
Creating database...


# (Re)Build Manifesto Database

In [6]:
# Paths for the manifesto database: the Chroma directory is keyed by the
# embedding name, and the raw data path is read with the "pdf" loader.
# NOTE: DATABASE_DIR and DATA_PATH are rebound here, replacing the values
# assigned for the debate database.
DATABASE_DIR = f"../../data/manifestos/chroma/{embedding_name}/"
DATA_PATH = "../../data/manifestos/01_pdf_originals"

# Instantiate the vector database wrapper for the manifestos.
database_manifestos = VectorDatabase(
    embedding_model=embedding_model,
    source_type="manifestos",
    database_directory=DATABASE_DIR,
    data_path=DATA_PATH,
    loader="pdf",
    reload=False,
)

# Build or load database, depending on flag
if build_manifesto_database:
    database_manifestos.build_database()
    # Sanity check: the build must have produced a Chroma vector store.
    # Walking the MRO by class name (rather than comparing the full
    # str(type(...)) repr) keeps the check valid for subclasses and for a
    # Chroma class exposed under a different module path, and needs no
    # additional import. The message reports the actual type on failure.
    assert any(
        cls.__name__ == "Chroma"
        for cls in type(database_manifestos.database).__mro__
    ), (
        "build_database() did not produce a Chroma vector store; got "
        f"{type(database_manifestos.database)!r}"
    )
else:
    database_manifestos.load_database()

Loading data and creating splits...
Creating database...
