# Build and upload index

In [1]:
import dotenv
import os
# Load environment variables; per the reminder below they are expected in a .env file.
dotenv.load_dotenv()
# The cell evaluates to True when HF_TOKEN is set; otherwise the reminder is printed.
True if "HF_TOKEN" in os.environ else print("Please set the HF_TOKEN environment variable in .env file")

True

In [2]:
from evidence_seeker.retrieval import build_index, RetrievalConfig

None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


In [3]:
# Retrieval settings: where the source documents live, where the finished
# index is written, and which embedding model/endpoint is used.
config = RetrievalConfig(
    document_input_dir="../TMP/APUZ/corpus",
    index_persist_path="../TMP/APUZ/storage/index",
    embed_model_name="sentence-transformers/paraphrase-multilingual-mpnet-base-v2",
    embed_base_url="https://ibpp4xgm0kspxkjb.us-east-1.aws.endpoints.huggingface.cloud",
)

In [4]:
from loguru import logger
import pathlib
import yaml

# Maps a document file name (the "file" entry of a YAML file) to its metadata.
metadata_dict = {}
# Without a configured input directory, fall back to the directory of the
# first explicitly listed input file.
metadata_dir = config.document_input_dir or os.path.dirname(config.document_input_files[0])
# Load and parse all YAML files below metadata_dir.
for filepath in pathlib.Path(metadata_dir).rglob("*.yaml"):
    # Explicit UTF-8 so parsing does not depend on the platform's default encoding.
    with open(filepath, "r", encoding="utf-8") as f:
        data = yaml.safe_load(f)
    # safe_load returns None for an empty file and may return a list or a
    # scalar; only a mapping can carry the required "file"/"author" keys.
    if isinstance(data, dict) and "file" in data and "author" in data:
        metadata_dict[data["file"]] = {"author": data["author"]}
    else:
        logger.warning(f"Invalid metadata in {filepath}.")


def document_file_metadata(filename):
    """Return the metadata recorded for ``filename``, or an empty dict if there is none."""
    return metadata_dict.get(filename, {})



In [6]:

# Build the index from the corpus directory and persist it to disk.
# In the recorded run below, embedding generation took roughly 23 minutes.
build_index(
    token=os.environ["HF_TOKEN"],
    embed_model_name=config.embed_model_name,
    embed_base_url=config.embed_base_url,
    index_persist_path=config.index_persist_path,
    document_input_dir=config.document_input_dir,
    # Callable that maps a file name to its metadata dict.
    document_file_metadata=document_file_metadata,
)

[32m2024-12-14 12:30:58.597[0m | [34m[1mDEBUG   [0m | [36mevidence_seeker.retrieval.base[0m:[36mbuild_index[0m:[36m196[0m - [34m[1mReading documents from ../TMP/APUZ/corpus[0m
[32m2024-12-14 12:30:58.598[0m | [1mINFO    [0m | [36mevidence_seeker.retrieval.base[0m:[36mbuild_index[0m:[36m204[0m - [1mBuilding document index...[0m
[32m2024-12-14 12:31:30.491[0m | [34m[1mDEBUG   [0m | [36mevidence_seeker.retrieval.base[0m:[36mbuild_index[0m:[36m212[0m - [34m[1mParsing nodes...[0m
[32m2024-12-14 12:31:32.285[0m | [34m[1mDEBUG   [0m | [36mevidence_seeker.retrieval.base[0m:[36mbuild_index[0m:[36m219[0m - [34m[1mCreating VectorStoreIndex with embeddings...[0m


Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/1829 [00:00<?, ?it/s]

[32m2024-12-14 12:54:12.264[0m | [34m[1mDEBUG   [0m | [36mevidence_seeker.retrieval.base[0m:[36mbuild_index[0m:[36m225[0m - [34m[1mPersisting index to ../TMP/APUZ/storage/index[0m


In [7]:
import huggingface_hub

# Hub client authenticated with the HF_TOKEN from the environment.
# NOTE(review): this name is an *instance*, not the huggingface_hub.HfApi class;
# a lowercase name would be clearer, but the upload cell refers to `HfApi`.
HfApi = huggingface_hub.HfApi(token=os.environ["HF_TOKEN"])

In [8]:
# Upload the persisted index folder to the "index" directory of the dataset repo.
# The call's return value is the cell output.
HfApi.upload_folder(
    repo_type="dataset",
    repo_id="DebateLabKIT/apuz-index-es",
    path_in_repo="index",
    folder_path=config.index_persist_path,
)

default__vector_store.json:   0%|          | 0.00/196M [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

docstore.json:   0%|          | 0.00/88.5M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/DebateLabKIT/apuz-index-es/commit/6dbdaf9bb91058ed8419731cb8439766c941c16f', commit_message='Upload folder using huggingface_hub', commit_description='', oid='6dbdaf9bb91058ed8419731cb8439766c941c16f', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/DebateLabKIT/apuz-index-es', endpoint='https://huggingface.co', repo_type='dataset', repo_id='DebateLabKIT/apuz-index-es'), pr_revision=None, pr_num=None)