In [None]:
import sys

# Add ../app/ to the module search path so code living there can be imported
# from this notebook. The path is relative, so it is resolved against the
# kernel's current working directory.
sys.path.append("../app/")

In [None]:
from utils import init_azure_openai_models

In [None]:
# Set embedding model
# NOTE(review): init_azure_openai_models comes from the project's utils module;
# presumably it configures the Azure OpenAI models used for indexing below —
# confirm against utils.

init_azure_openai_models()

# Load documents

In [None]:
from llama_index.core import SimpleDirectoryReader


# Get metadata for leaflets
def get_leaflets_metadata(file_path):
    """Derive leaflet metadata from where a file sits in the directory tree.

    The name of the directory containing the file is used as the brand, and
    the file name up to its first dot is used as the model name. For example
    ``leaflets/acme/x100.pdf`` gives ``{"brand": "acme", "model_name": "x100"}``.

    Both backslash and forward slash are accepted as path separators, so the
    same notebook works on Windows and on POSIX systems.

    Args:
        file_path: Path of the leaflet file (string or path-like object).

    Returns:
        A dict with the keys ``"brand"`` and ``"model_name"``. When the path
        has no parent directory component, the path is printed for diagnosis
        and ``"brand"`` falls back to ``"unknown"``.
    """
    # Normalise separators so Windows and POSIX paths split the same way;
    # empty components (leading or doubled separators) are dropped.
    parts = [part for part in str(file_path).replace("\\", "/").split("/") if part]

    file_name = parts[-1] if parts else ""
    # Keep everything before the first dot, e.g. "x100.v2.pdf" -> "x100".
    model_name = file_name.split(".")[0]

    if len(parts) >= 2:
        brand = parts[-2]
    else:
        # No parent directory to read the brand from: report the offending
        # path but still return a complete dict instead of crashing.
        print(f"file path : {file_path}")
        brand = "unknown"

    return {"brand": brand, "model_name": model_name}


# Load every file found under leaflets/ (sub-directories included) and attach
# the metadata computed by get_leaflets_metadata to each loaded document.
reader = SimpleDirectoryReader(
    input_dir="leaflets/",
    recursive=True,
    file_metadata=get_leaflets_metadata,
)
documents = reader.load_data()

In [None]:
# Quick sanity check: how many documents the reader returned.
len(documents)

# Split documents into nodes

In [None]:
# Test text splitters (https://docs.llamaindex.ai/en/stable/module_guides/loading/node_parsers/modules/?h=sentencespli#sentencesplitter)
# Try auto-merging (https://docs.llamaindex.ai/en/stable/examples/retrievers/auto_merging_retriever/)

from llama_index.core.ingestion import IngestionPipeline
from llama_index.core.node_parser import SentenceSplitter


# Split the loaded documents into nodes with a single SentenceSplitter step
# (chunk_size=512, chunk_overlap=20).
sentence_splitter = SentenceSplitter(chunk_size=512, chunk_overlap=20)
pipeline = IngestionPipeline(transformations=[sentence_splitter])

nodes = pipeline.run(documents=documents)

In [None]:
# Quick sanity check: how many nodes the ingestion pipeline produced.
len(nodes)

# Create index

In [None]:
from llama_index.core import VectorStoreIndex

# Build a vector index over the nodes produced by the ingestion pipeline.
# show_progress=True asks LlamaIndex to display progress while it builds.
# NOTE(review): presumably this embeds every node with the model configured
# earlier, so it may be slow/costly on a large corpus — confirm.
chunked_index = VectorStoreIndex(nodes=nodes, show_progress=True)

# Store Index

In [None]:
from llama_index.core import StorageContext, load_index_from_storage    

In [None]:
# Save the index to disk.
# Give the index a fixed id before persisting.
# NOTE(review): presumably this id is what a later load_index_from_storage
# call uses to select the index — confirm against the loading code.
chunked_index.set_index_id("vector_chunked_index")
# Write the index's storage context to ./storage (relative to the working directory).
chunked_index.storage_context.persist("./storage")