In [1]:
# Project-level notebook helpers.
# NOTE(review): time_it is imported here but not used in the visible cells.
from codes.utils import set_ipynb_config, time_it
# Presumably applies notebook-wide display/runtime settings; see codes.utils to confirm.
set_ipynb_config()

In [2]:
# Folder that holds the source documents (path is relative to the working directory).
path_data = "data/multi_docs/"

In [3]:
import os  # imported in this cell so the cell stays self-contained

# Directory that holds the persisted vector databases.
# Set the PATH_PERSIST_DB environment variable to run the notebook on another machine;
# when it is unset or empty, the original developer-local location is used.
path_persist_db = (
    os.environ.get('PATH_PERSIST_DB')
    or '/Users/prb000j/Downloads/downloaded_models/vector_dbs/'
)

## C. Retrieval Process
**Expected Input:** Query from user

```python
get_query_topic(query) -> topics  
get_filter_criterion(topics) -> filters
retrieve_chunks(query, filters) -> chunks  # based on filter criterion
rerank_chunks(query, chunks) -> chunks
```
**Expected Output:** List of chunks (or docs)

In [4]:
from codes.file_to_docs import Docs2VectorDb

In [5]:
from langchain_core.documents.base import Document  # required to add additional metadata
from langchain_community.vectorstores.chroma import Chroma

In [6]:
class RetrieveDocs:
    '''
    Helpers to retrieve documents from a Chroma vector store and rerank them.

    Every method is a staticmethod, so it can be called on the class
    (RetrieveDocs.main(...)) as well as on an instance.
    '''

    @staticmethod
    def main(query:str, vector_store:Chroma, method_search:str='mmr', method_rerank:str='simple', **kwargs)->list[Document]:
        '''
        Retrieve documents for the given query, print them, then rerank them
        Arguments:
            query <str>: user query
            vector_store <Chroma>: chroma vector database,
            method_search <str>: choose one out of ['mmr', 'siml', 'siml_w_relvscore', 'siml_w_score'],
            method_rerank <str>: choose one out of ['pass', 'simple', 'multi-model'],
            kwargs <dict>: optional 'k' (number of docs, defaults to 4) and
                           'metadata' (filter based on metadata)
        Returns:
            reranked list of the retrieved items (see retrieve_docs for the item type)
        '''
        docs_retrieved = RetrieveDocs.retrieve_docs(query, vector_store, method_search, **kwargs)
        print("docs_retrieved:\n")
        RetrieveDocs.pprint_docs(docs_retrieved)
        return RetrieveDocs.rerank_docs(query, docs_retrieved, method_rerank)

    @staticmethod
    def retrieve_docs(query:str, vector_store:Chroma, method:str='mmr', **kwargs)->list[Document]:
        '''
        Retrieve documents from vector store based on given query
        Arguments:
            query <str>: user query
            vector_store <Chroma>: chroma vector database,
            method <str>: choose one out of ['mmr', 'siml', 'siml_w_relvscore', 'siml_w_score'],
            kwargs <dict>: optional 'k' (number of docs, defaults to 4) and
                           'metadata' (filter based on metadata)
        Returns:
            list of documents for 'mmr' and 'siml';
            list of (document, score) tuples for 'siml_w_relvscore' and 'siml_w_score'
        Raises:
            NotImplementedError: if method is not one of the supported methods
        '''
        # search method name -> name of the vector store method implementing it
        search_names = {
            'mmr': 'max_marginal_relevance_search',
            'siml': 'similarity_search',
            'siml_w_relvscore': 'similarity_search_with_relevance_scores',
            'siml_w_score': 'similarity_search_with_score',
        }
        if method not in search_names:
            methods = list(search_names)
            raise NotImplementedError(f'method is incorrect. method needs to be out of {methods}')

        # parameters (a missing or falsy k falls back to 4)
        k = kwargs.get('k') or 4
        # An absent or empty filter is passed as None, which is the vector store's
        # "no filter" default. NOTE(review): some chromadb versions appear to reject
        # an empty dict as a where-clause; confirm for the installed version.
        metadata_filter = kwargs.get('metadata') or None

        search = getattr(vector_store, search_names[method])
        return search(query, k=k, filter=metadata_filter)

    @staticmethod
    def rerank_docs(query:str, docs:list[Document], method:str='simple')->list[Document]:
        '''
        Rerank documents based on given query and chosen strategy
        Arguments:
            query <str>: user query (not used by the strategies implemented so far),
            docs <list>: retrieved items, best match first,
            method <str>: choose one out of ['pass', 'simple', 'multi-model'],
                'pass': keep the retrieval order,
                'simple': items at even positions in order, then items at odd positions
                          reversed, so the best-ranked items sit at both ends of the list,
                'multi-model': not implemented yet
        Returns:
            new list with the reranked items; the input list is never modified.
            An unknown method prints a message and keeps the retrieval order.
        Raises:
            NotImplementedError: if method is 'multi-model'
        '''
        methods = ['pass', 'simple', 'multi-model']
        docs_in_order = list(docs)  # always work on (and return) a fresh list

        if method=='simple':  # Example: [1,2,3,4,5,6] --> [1,3,5] + [6,4,2]
            docs_even = docs_in_order[::2]
            docs_odd_reversed = docs_in_order[1::2][::-1]
            return docs_even + docs_odd_reversed

        if method=='multi-model':
            # advertised but never implemented; fail explicitly instead of with an
            # UnboundLocalError on the return value
            raise NotImplementedError("rerank method 'multi-model' is not implemented yet")

        if method not in methods:
            print(f'method is incorrect. method needs to be out of {methods}')
        # 'pass' and unknown methods keep the retrieval order
        return docs_in_order

    @staticmethod
    def pprint_docs(docs:list[Document])->None:
        '''
        print docs one by one
        Arguments:
            docs <list>: documents, or (document, score) tuples as returned by the
                         'siml_w_relvscore' / 'siml_w_score' search methods
        For every item the page content and the 'data_type' / 'topic' metadata are
        printed; the score is printed as well when the item carries one.
        '''
        keys_to_print = ['data_type', 'topic']
        print("-"*30)
        for item in docs:
            # scored searches wrap each document in a (document, score) tuple
            doc, score = item if isinstance(item, tuple) else (item, None)
            print(doc.page_content)
            metadata_to_be_printed = {k:v for k,v in doc.metadata.items() if k in keys_to_print}
            print(metadata_to_be_printed)
            if score is not None:
                print(f'score: {score}')
            print('\n')

In [7]:
# Load the persisted vector store from the directory in path_persist_db (set in an earlier cell).
vector_store_multi = Docs2VectorDb.load_vector_store(path_persist_db)

  from .autonotebook import tqdm as notebook_tqdm


In [8]:
# Collect the document sources recorded in the vector store.
sources = Docs2VectorDb.sources_from_vdb(vector_store_multi)
# Bare last expression so the notebook displays the result.
sources

{'source': {'data/multi_docs/Luminate Report Builder.docx',
  'data/multi_docs/Luminate Report Builder.pdf',
  'data/multi_docs/dataframe.csv',
  'data/multi_docs/marketing.txt'}}

In [12]:
# Query to run against the vector store.
query = 'aspires'

# Metadata filter, forwarded to RetrieveDocs under the 'metadata' key.
# Other filters that can be swapped in:
#   {'topic': 'introduction and background to RB, Luminate'}
#   {'data_type': 'dataframe'}
#   {'data_type': {'$in': ['txt']}}
#   {'data_type': {'$in': ['dataframe', 'word document']}}
metadata_filt = {'metadata': {'title': 'dataframe.csv'}}

# Retrieval parameters (k = number of documents to retrieve).
dict_parameters = {'k': 4}

# Merge into a new dict so dict_parameters is not mutated and the cell can be re-run safely.
dict_all_params = {**dict_parameters, **metadata_filt}
print(dict_all_params)

# main() prints the retrieved docs; the second call prints them after reranking.
docs_retrvd_w_reranking = RetrieveDocs.main(query, vector_store_multi, **dict_all_params)
RetrieveDocs.pprint_docs(docs_retrvd_w_reranking)

{'k': 4, 'metadata': {'title': 'dataframe.csv'}}
docs_retrieved:

------------------------------
to follow along the
{'data_type': 'dataframe', 'topic': 'qna on topics like RB, luminate'}


you look at reports from
{'data_type': 'dataframe', 'topic': 'qna on topics like RB, luminate'}


how do I log in ,you can
{'data_type': 'dataframe', 'topic': 'qna on topics like RB, luminate'}


domains,testsource1
{'data_type': 'dataframe', 'topic': 'qna on topics like RB, luminate'}


------------------------------
to follow along the
{'data_type': 'dataframe', 'topic': 'qna on topics like RB, luminate'}


how do I log in ,you can
{'data_type': 'dataframe', 'topic': 'qna on topics like RB, luminate'}


domains,testsource1
{'data_type': 'dataframe', 'topic': 'qna on topics like RB, luminate'}


you look at reports from
{'data_type': 'dataframe', 'topic': 'qna on topics like RB, luminate'}




## To check

- If the chunk overlap is larger than a word, can a word get split across two chunks?
- MMR: does it already incorporate reranking? If it does, handle that case in the code.