# Load Vector Store

In [1]:
from codes.utils import set_ipynb_config, time_it
# Apply the project's notebook configuration helper before anything else runs
# (what it configures is defined in codes.utils, not visible here).
set_ipynb_config()

In [2]:
from dotenv import load_dotenv
# Load environment variables from a .env file; PATH_DATA and PATH_PERSIST_DB
# are read from the environment in the cells below.
load_dotenv()

True

In [3]:
import os

In [4]:
# Location of the input data, taken from the PATH_DATA environment variable
# (None when the variable is not set).
path_data = os.environ.get('PATH_DATA')
path_data

'data/'

In [5]:
# Location of the persisted vector store, taken from the PATH_PERSIST_DB
# environment variable (None when the variable is not set).
path_persist_db = os.environ.get('PATH_PERSIST_DB')

## C. Retrieval Process
**Expected Input:** Query from user

```python
get_query_topic(query) -> topics  
get_filter_criterion(topics) -> filters
retrieve_chunks(query, filters)  # based on filter criterion
rerank_chunks(query, chunks) -> chunks
```
**Expected Output:** List of chunks (or docs)

In [6]:
from codes.file_to_docs import Docs2VectorDb
from codes.retrieve_docs import RetrieveDocs

In [7]:
# Load the previously persisted vector store from the location configured in
# PATH_PERSIST_DB; every retrieval below runs against this store.
vector_store_multi = Docs2VectorDb.load_vector_store(path_persist_db)



In [8]:
# Collect the distinct 'source' metadata values stored in the vector store
# (the displayed result is a dict keyed by 'source').
sources = Docs2VectorDb.sources_from_vdb(vector_store_multi)
sources

{'source': {'data/garb_in_garb_out.txt',
  'data/marketing.txt',
  'data/qna_table.csv'}}

# Generate Answers with LLM using RAG

Approaches:

1. Without using LangChain:
   - Create Prompt = Context + Query
     - Manually retrieve docs using a metadata filter & a retrieval method like 'mmr'
     - If the method is 'mmr', filter the retrieved docs based on a similarity / relevance threshold
     - Rerank if required
   - Pass the prompt to the LLM call

2. With LangChain:
   - Create a base retriever based on a metadata filter & a retrieval method like 'mmr'
   - Do reranking if rerank==True
   - Invoke the chain with the query

In [9]:
from codes.retrieve_docs import RetrieveDocs, ReRanking
from codes.generate_w_rag import LlmWithManualRag, LlmWithRag

## Query

In [10]:
# Show the distinct values stored under each metadata key, so that the
# metadata filter defined further down can be written against real values.
metadata_keys = ['title', 'source', 'data_type', 'topic']
for key in metadata_keys:
    metadata_key = Docs2VectorDb.sources_from_vdb(vector_store_multi, key)
    # Two blank lines, then the {key: {values}} mapping for this key.
    print('\n', metadata_key, sep='\n')



{'title': {'garb_in_garb_out.txt', 'marketing.txt', 'qna_table.csv'}}


{'source': {'data/qna_table.csv', 'data/garb_in_garb_out.txt', 'data/marketing.txt'}}


{'data_type': {'dataframe', 'txt'}}


{'topic': {'qna on topics like RB, luminate', 'marketing, toys', 'philosophy'}}


In [11]:
# The user question used by every retrieval / generation approach below.
query = "How have the toy stores changed over the years?"

### Explicitly filter on metadata and generate response based on query

#### Filter and retrieve documents based on query

In [12]:
# Metadata filter: keep only chunks whose title is 'marketing.txt' AND whose
# data_type is 'txt'. Further conditions (e.g. an '$in' test on data_type or an
# '$eq' test on topic) can be appended to the '$and' list.
_title_is_marketing = {'title': {'$eq': 'marketing.txt'}}
_data_type_is_txt = {'data_type': {'$eq': 'txt'}}
metadata_filt = {'filter': {'$and': [_title_is_marketing, _data_type_is_txt]}}

# Search settings passed to the retriever: 'k' and 'fetch_k' followed by the
# metadata filter. (A smaller setting of k=4 / fetch_k=20 was tried earlier.)
# NOTE(review): presumably k = docs returned and fetch_k = candidates
# considered by MMR -- confirm against the vector store's documentation.
search_kwargs = {'k': 20, 'fetch_k': 100, **metadata_filt}
print(search_kwargs)

{'k': 20, 'fetch_k': 100, 'filter': {'$and': [{'title': {'$eq': 'marketing.txt'}}, {'data_type': {'$eq': 'txt'}}]}}


In [13]:
# Retrieve candidate chunks for `query` from the vector store using 'mmr'
# search, with the k / fetch_k / metadata filter settings from `search_kwargs`,
# then pretty-print them.
# NOTE(review): the variable name says "w_reranking" but rerank=False is
# passed, so these docs are presumably NOT reranked -- the name is misleading
# and is kept only because later cells reference it.
docs_retrvd_w_reranking = RetrieveDocs.main(query, 
                                            vector_store_multi, 
                                            method_search='mmr', 
                                            rerank=False,
                                            **search_kwargs)
RetrieveDocs.pprint_docs(docs_retrvd_w_reranking)

------------------------------
Still, a visit to a toy store or the toys section of a grocery store in most places will make you feel that not much has changed in the past few decades.
{'data_type': 'txt', 'topic': 'marketing, toys'}


All in all, toy ads haven’t really evolved much over time in terms of the social messages they convey about gender. Most toys still have clearly gendered associations, with dolls being targeted
{'data_type': 'txt', 'topic': 'marketing, toys'}


To this end, Norgaard and Wider analysed 175 television commercials for toys listed as ‘best selling’ for children ages five through eleven years old by the top three toy retailers — Target, Walmart,
{'data_type': 'txt', 'topic': 'marketing, toys'}


What’s more, the study also found that the toy industry’s marketing techniques continue to forge gendered associations in more subtle ways, such as through the use of colour — pink for girls, blue
{'data_type': 'txt', 'topic': 'marketing, toys'}


Here’s the thing, th

#### Filter retrieved docs based on relevance
Remove docs whose similarity to the query is below the threshold.

In [14]:
# Filter the retrieved docs by their similarity to the query.
# NOTE(review): presumably thresh=0.5 is the minimum similarity a doc must
# reach and k=4 caps how many docs are kept (the output shows 7 docs above
# 0.5 but only 4 returned) -- confirm in LlmWithManualRag.filter_docs_on_siml.
docs_filtd_manual = LlmWithManualRag.filter_docs_on_siml(query, 
                                                         docs_retrvd_w_reranking, 
                                                         thresh=0.5, 
                                                         k=4)
# Number of docs that survived the filter, followed by their full repr.
len(docs_filtd_manual)
print(docs_filtd_manual)

Starting to Embed texts ...




Starting to Embed texts ...
[0.75812393 0.5259469  0.5165936  0.5138917  0.51362133 0.512375
 0.5077028  0.4967534  0.492603   0.48505926 0.48138386 0.47835496
 0.46552873 0.46435103 0.46306068 0.4600075  0.4561665  0.4402113
 0.42687872 0.42188013]
[ True  True  True  True  True  True  True False False False False False
 False False False False False False False False]


4

[Document(page_content='Still, a visit to a toy store or the toys section of a grocery store in most places will make you feel that not much has changed in the past few decades.', metadata={'data_type': 'txt', 'source': 'data/marketing.txt', 'title': 'marketing.txt', 'topic': 'marketing, toys'}), Document(page_content='All in all, toy ads haven’t really evolved much over time in terms of the social messages they convey about gender. Most toys still have clearly gendered associations, with dolls being targeted', metadata={'data_type': 'txt', 'source': 'data/marketing.txt', 'title': 'marketing.txt', 'topic': 'marketing, toys'}), Document(page_content='To this end, Norgaard and Wider analysed 175 television commercials for toys listed as ‘best selling’ for children ages five through eleven years old by the top three toy retailers — Target, Walmart,', metadata={'data_type': 'txt', 'source': 'data/marketing.txt', 'title': 'marketing.txt', 'topic': 'marketing, toys'}), Document(page_content=

In [15]:
# Pretty-print the docs that survived the similarity filter
# (docs_filtd_manual is a flat list of Document objects, as printed above).
RetrieveDocs.pprint_docs(docs_filtd_manual)

------------------------------
Still, a visit to a toy store or the toys section of a grocery store in most places will make you feel that not much has changed in the past few decades.
{'data_type': 'txt', 'topic': 'marketing, toys'}


All in all, toy ads haven’t really evolved much over time in terms of the social messages they convey about gender. Most toys still have clearly gendered associations, with dolls being targeted
{'data_type': 'txt', 'topic': 'marketing, toys'}


To this end, Norgaard and Wider analysed 175 television commercials for toys listed as ‘best selling’ for children ages five through eleven years old by the top three toy retailers — Target, Walmart,
{'data_type': 'txt', 'topic': 'marketing, toys'}


What’s more, the study also found that the toy industry’s marketing techniques continue to forge gendered associations in more subtle ways, such as through the use of colour — pink for girls, blue
{'data_type': 'txt', 'topic': 'marketing, toys'}




#### Create prompt inclusive of context

In [16]:
# Build the prompt (context + query) from the filtered docs without
# reranking them, then show it.
prompt_upd_wo_rr = LlmWithManualRag.add_context_to_prompt(
    query,
    docs_filtd_manual,
    rerank=False,
)
print(prompt_upd_wo_rr)

Context: Still, a visit to a toy store or the toys section of a grocery store in most places will make you feel that not much has changed in the past few decades.;All in all, toy ads haven’t really evolved much over time in terms of the social messages they convey about gender. Most toys still have clearly gendered associations, with dolls being targeted;To this end, Norgaard and Wider analysed 175 television commercials for toys listed as ‘best selling’ for children ages five through eleven years old by the top three toy retailers — Target, Walmart,;What’s more, the study also found that the toy industry’s marketing techniques continue to forge gendered associations in more subtle ways, such as through the use of colour — pink for girls, blue

        Answer the question based only on the context provided. 
        If you don't know the answer, say you do not know. 
        Decide based on the question if answer can be made concise or not. 
        If so, keep answer within three sent

  warn_deprecated(


In [17]:
# Build the prompt (context + query) from the same filtered docs, this time
# reranking them with the 'simple' method, then show it.
prompt_upd_w_rr = LlmWithManualRag.add_context_to_prompt(
    query,
    docs_filtd_manual,
    rerank=True,
    rerank_method='simple',
)
print(prompt_upd_w_rr)

Context: Still, a visit to a toy store or the toys section of a grocery store in most places will make you feel that not much has changed in the past few decades.;To this end, Norgaard and Wider analysed 175 television commercials for toys listed as ‘best selling’ for children ages five through eleven years old by the top three toy retailers — Target, Walmart,;What’s more, the study also found that the toy industry’s marketing techniques continue to forge gendered associations in more subtle ways, such as through the use of colour — pink for girls, blue;All in all, toy ads haven’t really evolved much over time in terms of the social messages they convey about gender. Most toys still have clearly gendered associations, with dolls being targeted

        Answer the question based only on the context provided. 
        If you don't know the answer, say you do not know. 
        Decide based on the question if answer can be made concise or not. 
        If so, keep answer within three sent

### Use Langchain Retriever to filter on metadata and generate response 

In [18]:
# Re-display the search settings (k, fetch_k, metadata filter) that the
# LangChain retriever below is built with.
search_kwargs

{'k': 20,
 'fetch_k': 100,
 'filter': {'$and': [{'title': {'$eq': 'marketing.txt'}},
   {'data_type': {'$eq': 'txt'}}]}}

In [19]:
# Base retriever over the vector store, using the same search settings as the
# manual approach. search_type may be "similarity" (default), "mmr", or
# "similarity_score_threshold".
retriever_base = vector_store_multi.as_retriever(
    search_type='mmr',
    search_kwargs=search_kwargs,
)

In [20]:
# Sanity check only: run the base retriever directly on the query and
# pretty-print what it returns. The chains below call the retriever
# themselves, so `docs_filtd` is not used afterwards in this notebook.
print(query)
docs_filtd = retriever_base.invoke(query)
RetrieveDocs.pprint_docs(docs_filtd)

How have the toy stores changed over the years?
------------------------------
Still, a visit to a toy store or the toys section of a grocery store in most places will make you feel that not much has changed in the past few decades.
{'data_type': 'txt', 'topic': 'marketing, toys'}


To this end, Norgaard and Wider analysed 175 television commercials for toys listed as ‘best selling’ for children ages five through eleven years old by the top three toy retailers — Target, Walmart,
{'data_type': 'txt', 'topic': 'marketing, toys'}


What’s more, the study also found that the toy industry’s marketing techniques continue to forge gendered associations in more subtle ways, such as through the use of colour — pink for girls, blue
{'data_type': 'txt', 'topic': 'marketing, toys'}


Toys are simply learning tools that communicate to children how they should move through the world and the kinds of things they might be interested in and aspire to.
{'data_type': 'txt', 'topic': 'marketing, toys'}




#### Without Reranking

In [21]:
# RAG chain on top of the base retriever, with reranking disabled.
chain_multi_docs_wo_rr = LlmWithRag.create_chain(
    retriever_base,
    rerank=False,
)

#### With reranking

In [22]:
# RAG chain on top of the base retriever, with reranking enabled using the
# 'hf_crossencoder' method.
chain_multi_docs_w_rr = LlmWithRag.create_chain(
    retriever_base,
    rerank=True,
    rerank_method='hf_crossencoder',
)



# Generate Answers with LLMs
> Switch on the VPN before running the cells below.

In [23]:
# Accumulates one answer per approach, in the order the cells below are run.
# Re-run this cell before re-running the answer cells to start from empty.
answers = list()

## Without Langchain

### Without Reranking

In [24]:
# Manual RAG, no reranking: send the prompt built without reranking to the LLM.
response_wo_lc = LlmWithManualRag.invoke_chain(prompt_upd_wo_rr)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [25]:
# Record the manual-RAG (no reranking) answer, then show it.
# Each run of this cell appends another entry to `answers`.
answer = response_wo_lc.content
answers.append(answer)
print(answer)

The context does not provide information on how toy stores have changed over the years.


### With Reranking

In [26]:
# Manual RAG with reranking: send the prompt whose context was reranked
# (prompt_upd_w_rr). The un-reranked prompt (prompt_upd_wo_rr) belongs to the
# "Without Reranking" section; using it here would just repeat that call.
response_wo_lc_pl_rr = LlmWithManualRag.invoke_chain(prompt_upd_w_rr)

In [27]:
# Record the manual-RAG (with reranking) answer, then show it.
# Each run of this cell appends another entry to `answers`.
answer = response_wo_lc_pl_rr.content
answers.append(answer)
print(answer)

The context does not provide information on how toy stores have changed over the years.


## With Langchain

### Without Reranking

In [28]:
# LangChain RAG, no reranking: the chain retrieves docs itself and is given
# the query under the 'input' key.
response_w_lc = chain_multi_docs_wo_rr.invoke({'input':query})

In [29]:
# Record the LangChain (no reranking) answer, then show it. The chain's
# response is a mapping; the generated text is under 'answer'.
# Each run of this cell appends another entry to `answers`.
answer = response_w_lc['answer']
answers.append(answer)
print(answer)

The context does not provide information on how toy stores have changed over the years.


### With Reranking

In [30]:
# LangChain RAG with reranking: same query, sent through the chain that was
# created with rerank=True.
response_w_lc_pl_rr = chain_multi_docs_w_rr.invoke({'input':query})

In [31]:
# Record the LangChain (with reranking) answer, then show it. The chain's
# response is a mapping; the generated text is under 'answer'.
# Each run of this cell appends another entry to `answers`.
answer = response_w_lc_pl_rr['answer']
answers.append(answer)
print(answer)

The context does not provide specific information on how toy stores have changed over the years.


## All Answers Analysis

In [32]:
# Short labels for the four approaches, listed in the same order in which
# their answers were appended to `answers`.
methods = [
    'wo_lc',        # without LangChain, without reranking
    'wo_lc_pl_rr',  # without LangChain, plus reranking
    'w_lc',         # with LangChain, without reranking
    'w_lc_pl_rr',   # with LangChain, plus reranking
]
methods

['wo_lc', 'wo_lc_pl_rr', 'w_lc', 'w_lc_pl_rr']

In [33]:
# Print each approach's label next to its answer.
# Take exactly the last len(methods) answers rather than a hard-coded count:
# if answer cells were re-run and `answers` holds extra entries, a slice
# longer than `methods` would start too early and pair labels with the
# wrong (stale) answers.
for method, answer in zip(methods, answers[-len(methods):]):
    print(f"{method}:")
    print(answer, '\n')

wo_lc:
The context does not provide information on how toy stores have changed over the years. 

wo_lc_pl_rr:
The context does not provide information on how toy stores have changed over the years. 

w_lc:
The context does not provide information on how toy stores have changed over the years. 

w_lc_pl_rr:
The context does not provide specific information on how toy stores have changed over the years. 

