# Imports

In [None]:
# nest_asyncio is applied before anything else is imported.
# NOTE(review): presumably needed because the notebook kernel already runs an
# event loop and a later call drives asyncio internally — confirm.
import nest_asyncio
nest_asyncio.apply()

# Load variables from a .env file into the environment up front
# (presumably credentials consumed by the parser — confirm).
from dotenv import load_dotenv
load_dotenv()

import os
from llama_index.core import Settings, VectorStoreIndex
from llama_index.llms.ollama import Ollama
from llama_index.embeddings.ollama import OllamaEmbedding
from llama_index.core.node_parser import MarkdownElementNodeParser
from llama_index.postprocessor.flag_embedding_reranker import FlagEmbeddingReranker

# Project-local module; provides parse_docs(), used in the data-ingestion step.
import src.file_parser as parse

# Default models

In [None]:
# Model identifiers. The same Ollama model name is used for both text
# generation and embeddings; the reranker has its own model id.
model = 'llama3'
reranker_model = 'BAAI/bge-reranker-large'

# Generation model, created with temperature 0.0.
llm = Ollama(temperature=0.0, model=model)

# Embedding model, backed by the same Ollama model name.
embed_model = OllamaEmbedding(model_name=model)

# Reranker configured with top_n=3.
reranker = FlagEmbeddingReranker(model=reranker_model, top_n=3)

# Register both models on the global LlamaIndex Settings object.
Settings.embed_model = embed_model
Settings.llm = llm

# Data Ingestion

In [None]:
# Input location is <document_directory>/<document_type>.
document_directory = 'data'
document_type = 'purchase_order'
document_path = '/'.join((document_directory, document_type))

# Output format requested from the parser.
parsed_format = 'markdown'

# NOTE: despite the singular name, `document` is used as a collection further
# down (indexed with [0] and passed as `documents=`).
document = parse.parse_docs(document_dir=document_path, result_format=parsed_format)

In [None]:
# Sanity check: show the text of the first parsed document.
first_document = document[0]
print(first_document.text)

# Split document into chunks

In [None]:
# Split the parsed markdown into nodes.
# The LLM is passed explicitly so the parser uses the Ollama model configured
# above instead of whatever the library falls back to when `llm` is omitted
# (NOTE(review): some llama-index releases are believed to default to an
# OpenAI LLM here — confirm against the installed version).
node_parser = MarkdownElementNodeParser(llm=llm)

# First pass: turn the documents into raw nodes.
nodes = node_parser.get_nodes_from_documents(documents=document)

# Second pass: separate the nodes into base nodes and objects; both lists are
# combined again when the recursive index is built.
base_nodes, objects = node_parser.get_nodes_and_objects(nodes=nodes)

# Build index

In [None]:
# Index over the element-parsed output: base nodes followed by the objects.
recursive_index = VectorStoreIndex(nodes=[*base_nodes, *objects])

# Second index built directly from the parsed documents, without the
# element-parsing step.
raw_index = VectorStoreIndex.from_documents(documents=document)

# Build query engine

In [None]:
# Query engine on the recursive index: similarity_top_k is 5 and the
# reranker is attached as the only node postprocessor.
recursive_query_engine = recursive_index.as_query_engine(
    node_postprocessors=[reranker],
    similarity_top_k=5,
    verbose=True,
)

# Query

In [None]:
# Ask for a short summary of the ingested document.
query = (
    'What is this document about? '
    'Summarise in 30 words.'
)

# Run the query through the recursive engine and show the answer.
response = recursive_query_engine.query(query)
print(response)