In [1]:
import os
import json
import uuid

from llama_index.readers.json import JSONReader
from llama_index.core import Document

# Directory with the flattened scripture JSON files, plus the one file in it
# that is loaded on its own below.
# NOTE(review): these are absolute, user-specific paths; the notebook will not
# run on another machine without editing them.
dir_path = "/Users/damontingey/personal/lds-rag/data/scriptures/flat"
file_path = f"{dir_path}/book-of-mormon-flat.json"

class CustomJSONReader(JSONReader):
    """Reader that turns a flattened scripture JSON file into ``Document`` objects.

    The file must contain a JSON object. Its optional ``headings`` and
    ``verses`` lists are read; each entry must provide ``text`` and
    ``reference`` keys. One ``Document`` is created per entry, with the
    reference stored in the document metadata.
    """

    def load_data(self, input_file, extra_info=None):
        """Load one JSON file and return its headings and verses as documents.

        Args:
            input_file: Path of the JSON file to read.
            extra_info: Optional dict of additional metadata copied onto every
                document. Accepted so this override stays call-compatible with
                the base reader's ``load_data(input_file, extra_info=...)``.
                The ``reference`` key always comes from the entry itself.

        Returns:
            A list of ``Document`` objects: all headings first, then all
            verses, each in file order. Every document gets a fresh random
            UUID as its id.

        Raises:
            KeyError: If an entry lacks ``text`` or ``reference``.
        """
        # JSON is UTF-8 by specification; do not rely on the platform default.
        with open(input_file, 'r', encoding='utf-8') as file:
            data = json.load(file)

        base_metadata = dict(extra_info or {})
        documents = []
        # Both sections share the same entry layout, so handle them uniformly.
        for section in ('headings', 'verses'):
            for item in data.get(section, []):
                documents.append(
                    Document(
                        id_=str(uuid.uuid4()),
                        text=item['text'],
                        metadata={**base_metadata, 'reference': item['reference']},
                    )
                )

        return documents

    def load_dir(self, input_dir: str):
        """Load every ``*.json`` file directly inside ``input_dir``.

        Args:
            input_dir: Path of the directory to scan (not recursive).

        Returns:
            The concatenated documents of all JSON files, processed in sorted
            file-name order so the result order is reproducible.
        """
        documents = []
        # os.listdir returns entries in arbitrary order; sort for determinism.
        for file_name in sorted(os.listdir(input_dir)):
            if file_name.endswith(".json"):
                documents.extend(self.load_data(os.path.join(input_dir, file_name)))
        return documents

# Documents from the single Book of Mormon file.
documents = CustomJSONReader().load_data(input_file=file_path)
# Documents from every JSON file in the directory.
more_documents = CustomJSONReader().load_dir(input_dir=dir_path)

In [3]:
# Spot-check the second loaded document (text plus its `reference` metadata).
documents[1]

Document(id_='3cd522f3-3f45-4ee4-a9c2-2c542e148b4e', embedding=None, metadata={'reference': '2 Nephi'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, text="An account of the death of Lehi. Nephi's brethren rebel against him. The Lord warns Nephi to depart into the wilderness. His journeyings in the wilderness, and so forth.", mimetype='text/plain', start_char_idx=None, end_char_idx=None, text_template='{metadata_str}\n\n{content}', metadata_template='{key}: {value}', metadata_seperator='\n')

In [4]:
# Number of documents loaded from the single file.
len(documents)

6626

In [5]:
import os
import logging
import qdrant_client
from llama_index.llms.ollama import Ollama
from llama_index.core import (
    StorageContext,
    Settings,
    VectorStoreIndex,
    SimpleDirectoryReader,
)
from llama_index.vector_stores.qdrant import QdrantVectorStore
from llama_index.embeddings.fastembed import FastEmbedEmbedding
from llama_index.postprocessor.colbert_rerank import ColbertRerank

# --- Settings a reader may want to tune for this experiment ---
EMBED_MODEL_NAME = "BAAI/bge-small-en-v1.5"
QDRANT_HOST = "localhost"
QDRANT_PORT = 6333
COLLECTION_NAME = "test"
MAX_DOCUMENTS = 1000  # only this many documents are indexed (presumably to keep runs fast)
SIMILARITY_TOP_K = 100
COLBERT_MODEL_NAME = "colbert-ir/colbertv2.0"
RERANK_TOP_N = 5

# Embedding model used globally by llama_index for indexing and querying.
Settings.embed_model = FastEmbedEmbedding(model_name=EMBED_MODEL_NAME)
print(len(documents))
# Index only the first MAX_DOCUMENTS documents, not the full corpus.
documents_ = documents[:MAX_DOCUMENTS]

# Requires a Qdrant server reachable at QDRANT_HOST:QDRANT_PORT.
client = qdrant_client.QdrantClient(host=QDRANT_HOST, port=QDRANT_PORT)

vector_store = QdrantVectorStore(client=client, collection_name=COLLECTION_NAME)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

# Embeds the documents and writes them to the Qdrant collection.
# NOTE(review): re-running this cell presumably inserts the same documents
# again into the existing collection - confirm before re-running.
index = VectorStoreIndex.from_documents(documents_, storage_context=storage_context)
retriever = index.as_retriever(similarity_top_k=SIMILARITY_TOP_K)

# ColBERT reranker; it is applied explicitly to retrieved nodes later on.
colbert_reranker = ColbertRerank(
    top_n=RERANK_TOP_N,
    model=COLBERT_MODEL_NAME,
    tokenizer=COLBERT_MODEL_NAME,
    keep_retrieval_score=True,
)


  from .autonotebook import tqdm as notebook_tqdm
Fetching 5 files: 100%|██████████| 5/5 [00:00<00:00, 101311.69it/s]


6626


In [123]:
# Plain vector retrieval (no reranking): request up to 100 nearest matches
# from the index and run one sample query.
retriever = index.as_retriever(similarity_top_k=100)
response = retriever.retrieve("What is the Spirit?")

In [8]:
# Number of nodes the retriever actually returned.
len(response)

28

In [14]:
# Inspect the first returned node.
response[0].node

TextNode(id_='a7c0eeaf-a99b-4ad3-b608-9885a63d6c93', embedding=None, metadata={'reference': '1 Nephi 1:12'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='f8ce99b2-9242-4677-95c7-75db279928f1', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'reference': '1 Nephi 1:12'}, hash='3211186358df3f85fd26ccd597ef96afe3e99ca62810a96b1d6a88440d8a9811')}, text='And it came to pass that as he read, he was filled with the Spirit of the Lord.', mimetype='text/plain', start_char_idx=0, end_char_idx=79, text_template='{metadata_str}\n\n{content}', metadata_template='{key}: {value}', metadata_seperator='\n')

In [9]:
# One line per retrieved node: "<score> <reference>: <text>".
for item in response:
    print("{} {}: {}".format(item.score, item.metadata['reference'], item.text))

0.66517806 1 Nephi 1:12: And it came to pass that as he read, he was filled with the Spirit of the Lord.
0.6564901 1 Nephi 11:2: And the Spirit said unto me: Behold, what desirest thou?
0.64928293 2 Nephi 9:10: O how great the goodness of our God, who prepareth a way for our escape from the grasp of this awful monster; yea, that monster, death and hell, which I call the death of the body, and also the death of the spirit.
0.64354265 1 Nephi 11:6: And when I had spoken these words, the Spirit cried with a loud voice, saying: Hosanna to the Lord, the most high God; for he is God over all the earth, yea, even above all. And blessed art thou, Nephi, because thou believest in the Son of the most high God; wherefore, thou shalt behold the things which thou hast desired.
0.6418716 1 Nephi 11:4: And the Spirit said unto me: Believest thou that thy father saw the tree of which he hath spoken?
0.63292325 1 Nephi 17:47: Behold, my soul is rent with anguish because of you, and my heart is pained; 

In [6]:
from llama_index.postprocessor.colbert_rerank import ColbertRerank

# ColBERT reranker used to re-score retrieved nodes.
# - top_n=5: presumably the number of nodes kept after reranking (confirm
#   against the ColbertRerank docs).
# - keep_retrieval_score=True: the reranked nodes shown later in this notebook
#   carry a 'retrieval_score' entry in their metadata alongside the new score.
colbert_reranker = ColbertRerank(
    top_n=5,
    model="colbert-ir/colbertv2.0",
    tokenizer="colbert-ir/colbertv2.0",
    keep_retrieval_score=True,
)



In [7]:
# Retrievers do not run node postprocessors, so the former
# `node_postprocessor=colbert_reranker` keyword had no effect and is dropped.
# Reranking is done explicitly on `response` with
# `colbert_reranker.postprocess_nodes(...)`.
retriever = index.as_retriever(similarity_top_k=100)
response = retriever.retrieve("What is the Spirit?")

In [16]:
# Rerank the retrieved nodes with ColBERT for the same query string.
# The returned nodes carry the original similarity under
# metadata['retrieval_score'] and the reranker's score in `score`.
colbert_reranker.postprocess_nodes(response, query_str="What is the Spirit?")

[NodeWithScore(node=TextNode(id_='73d332da-37ea-4913-a177-9e55fe8a7e8b', embedding=None, metadata={'reference': '1 Nephi 11:2', 'retrieval_score': 0.6564901}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='09c1f675-7ea3-40f5-9487-2975160cf448', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'reference': '1 Nephi 11:2'}, hash='f984198ed6e479ad2f12c0fbe0b75bd91c9a7785d3c9d70bf880671592bbb6ac')}, text='And the Spirit said unto me: Behold, what desirest thou?', mimetype='text/plain', start_char_idx=0, end_char_idx=56, text_template='{metadata_str}\n\n{content}', metadata_template='{key}: {value}', metadata_seperator='\n'), score=0.55892413854599),
 NodeWithScore(node=TextNode(id_='9019ec5d-a53e-4fdd-bce4-a41f5c481d5c', embedding=None, metadata={'reference': '1 Nephi 1:8', 'retrieval_score': 0.6023814}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.SOU