# Build and upload index

## Init config

In [7]:
from evidence_seeker.retrieval import build_index, RetrievalConfig
import pathlib

document_input_dir = "../TMP/APUZ_PART/corpus"
# Collect every PDF below the corpus directory (the YAML metadata files that
# live next to them must not be indexed). The list is sorted because `rglob`
# yields entries in filesystem order, which differs between machines/runs and
# would otherwise make the indexing order non-reproducible.
pdf_files = sorted(
    str(path)
    for path in pathlib.Path(document_input_dir).rglob("*")
    if path.is_file() and path.name.endswith((".pdf", ".PDF"))
)


config = RetrievalConfig(
    # --- Corpus and index locations -------------------------------------
    document_input_dir=document_input_dir,
    document_input_files=pdf_files,
    index_persist_path="../TMP/APUZ_PART/storage",
    # Uncomment the following lines to upload the index to the HF hub:
    # index_hub_path="DebateLabKIT/apuz-index-es",
    # hub_key_name="hf_evse_data",
    # --- Embedding model: local model via the Hugging Face backend ------
    embed_model_name="sentence-transformers/paraphrase-multilingual-mpnet-base-v2",
    embed_backend_type="huggingface",
    # --- Alternative: Hugging Face inference API ------------------------
    # (replace the two `embed_*` arguments above with the block below)
    # embed_backend_type="huggingface_inference_api",
    # embed_model_name="sentence-transformers/paraphrase-multilingual-mpnet-base-v2",
    # embed_base_url="https://router.huggingface.co/hf-inference/models/sentence-transformers/paraphrase-multilingual-mpnet-base-v2",
    # api_key_name="hf_debatelab_inference_provider",
    # bill_to="DebateLabKIT",
    # --- End of model configuration -------------------------------------
)

## Load metadata from YAML files

In [8]:
from loguru import logger
import pathlib
import yaml
from llama_index.core import SimpleDirectoryReader
from pprint import pprint
from typing import Dict
import os
#import chardet 

# Maps a PDF file name (the "file" entry of its YAML side-car) to the
# bibliographic metadata that should be attached to the indexed document.
metadata_dict = {}
metadata_dir = config.document_input_dir or os.path.dirname(config.document_input_files[0])
logger.info(f"Metadata directory: {metadata_dir}")

# Keys every YAML metadata file must provide; "year" and "month" are optional.
required_keys = ("file", "author", "url", "title")

# Load and parse all YAML files below metadata_dir.
for filepath in pathlib.Path(metadata_dir).rglob("*.yaml"):
    logger.info(f"Loading metadata from {filepath}")

    # Skip metadata whose PDF is missing. The PDF is looked up next to the YAML
    # file (not in the root of metadata_dir) because the search is recursive,
    # and both suffix spellings that the PDF listing accepts are tried.
    pdf_candidates = [filepath.with_suffix(suffix) for suffix in (".pdf", ".PDF")]
    if not any(candidate.is_file() for candidate in pdf_candidates):
        logger.warning(f"PDF file {pdf_candidates[0]} does not exist, skipping metadata file {filepath}.")
        continue

    with open(filepath, "r", encoding="utf-8") as f:
        data = yaml.safe_load(f)

    # safe_load returns None for an empty file and may return a list/scalar for
    # malformed ones; treat anything that is not a complete mapping as invalid.
    if not isinstance(data, dict) or not all(key in data for key in required_keys):
        logger.warning(f"Invalid metadata in {filepath}.")
        continue

    metadata_dict[data["file"]] = {
        "author": data["author"],
        "title": data["title"],
        "url": data["url"],
        "year": data.get("year", None),  # optional field
        "month": data.get("month", None),  # optional field
        # added by hand (not correctly in the yaml files/bibtex file)
        "journal": "Aus Politik und Zeitgeschichte (APuZ)",
    }
        

def document_file_metadata(filename: str) -> Dict:
    """Return the metadata registered for a document file.

    The lookup in ``metadata_dict`` uses only the base name of ``filename``,
    so any leading directories are ignored.

    Args:
        filename: Path (or bare name) of the document file.

    Returns:
        The metadata dictionary for the file, or an empty dictionary (after
        logging a warning) when nothing is registered under its base name.
    """
    basename = pathlib.Path(filename).name
    file_meta = metadata_dict.get(basename, {})
    if file_meta:
        return file_meta
    logger.warning(f"No metadata found for file: {filename}")
    return file_meta


# Sanity check: read the PDFs with the metadata callback attached and report
# how many document objects were produced.
reader = SimpleDirectoryReader(
    file_metadata=document_file_metadata,
    input_files=pdf_files,
)
docs = reader.load_data()
print(f"Loaded {len(docs)} documents with metadata from {metadata_dir}")

# To inspect the attached metadata while debugging:
# pprint(docs[0].metadata)


[32m2025-06-17 11:32:37.135[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m12[0m - [1mMetadata directory: ../TMP/APUZ_PART/corpus[0m
[32m2025-06-17 11:32:37.136[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m15[0m - [1mLoading metadata from ../TMP/APUZ_PART/corpus/ahrens_pragmatischer_2024.yaml[0m
[32m2025-06-17 11:32:37.138[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m15[0m - [1mLoading metadata from ../TMP/APUZ_PART/corpus/alexander_stabil_2024.yaml[0m


Loaded 11 documents with metadata from ../TMP/APUZ_PART/corpus


## Build index

In [None]:
from evidence_seeker.retrieval.base import IndexBuilder
from os import path

# Build the index from `config`; `env_file` is the path to your .env file.
# NOTE(review): only `config` and `env_file` are passed here, so the `docs`
# and metadata loaded in the previous cell are not handed to the builder
# explicitly -- confirm that the builder attaches the metadata on its own.
index_builder = IndexBuilder(env_file="../.env", config=config)

index_builder.build_index()

[32m2025-06-17 11:35:54.502[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m11[0m - [1mBuilding index in '/home/basti/Nextcloud/Documents/mindmaps/mind/projects/kideku/code/evidence-seeker/TMP/APUZ_PART/storage/index' ...[0m
[32m2025-06-17 11:35:54.503[0m | [34m[1mDEBUG   [0m | [36mevidence_seeker.retrieval.base[0m:[36mbuild_index[0m:[36m423[0m - [34m[1mReading documents from ['../TMP/APUZ_PART/corpus/alexander_stabil_2024.pdf', '../TMP/APUZ_PART/corpus/ahrens_pragmatischer_2024.pdf'][0m
[32m2025-06-17 11:35:54.503[0m | [1mINFO    [0m | [36mevidence_seeker.retrieval.base[0m:[36mbuild_index[0m:[36m428[0m - [1mBuilding document index...[0m
[32m2025-06-17 11:35:54.985[0m | [34m[1mDEBUG   [0m | [36mevidence_seeker.retrieval.base[0m:[36mbuild_index[0m:[36m436[0m - [34m[1mParsing nodes...[0m
[32m2025-06-17 11:35:55.104[0m | [34m[1mDEBUG   [0m | [36mevidence_seeker.retrieval.base[0m:[36mbuild_index[0m:[36m443[0m - [34m

Generating embeddings:   0%|          | 0/465 [00:00<?, ?it/s]

[32m2025-06-17 11:36:24.082[0m | [34m[1mDEBUG   [0m | [36mevidence_seeker.retrieval.base[0m:[36mbuild_index[0m:[36m453[0m - [34m[1mPersisting index to /home/basti/Nextcloud/Documents/mindmaps/mind/projects/kideku/code/evidence-seeker/TMP/APUZ_PART/storage/index[0m


In [12]:
# Explicitly upload the persisted index to the Hugging Face hub
# (can also be done via the `build_index` function).
import os

import huggingface_hub
from evidence_seeker.retrieval.base import INDEX_PATH_IN_REPO

# Read the token from the environment instead of relying on a variable left
# over from an earlier kernel session. When it is unset, `token=None` lets
# huggingface_hub fall back to its own token resolution (e.g. a cached login).
# NOTE(review): the project may keep the token under a different key in
# ../.env (cf. the commented-out `hub_key_name` in the config cell) -- adjust
# the variable name if so.
hub_token = os.environ.get("HF_TOKEN")

# The variable keeps its original name `HfApi` in case later cells refer to it.
HfApi = huggingface_hub.HfApi(token=hub_token)
HfApi.upload_folder(
    repo_id="DebateLabKIT/apuz-index-es",
    folder_path=config.index_persist_path,
    #path_in_repo=INDEX_PATH_IN_REPO,
    repo_type="dataset",
)

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

default__vector_store.json:   0%|          | 0.00/650M [00:00<?, ?B/s]

docstore.json:   0%|          | 0.00/208M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/DebateLabKIT/apuz-index-es/commit/799dee54e009b532410c4765368ee1bcbdade16b', commit_message='Upload folder using huggingface_hub', commit_description='', oid='799dee54e009b532410c4765368ee1bcbdade16b', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/DebateLabKIT/apuz-index-es', endpoint='https://huggingface.co', repo_type='dataset', repo_id='DebateLabKIT/apuz-index-es'), pr_revision=None, pr_num=None)