# Load Vector Store

In [1]:
from codes.utils import set_ipynb_config, time_it
# Apply the project's notebook settings via the helper imported from codes.utils.
set_ipynb_config()

In [2]:
from dotenv import load_dotenv
# Load environment variables from a .env file; the paths read with os.getenv()
# further down are presumably defined there (verify against the project's .env).
load_dotenv()

True

In [3]:
import os

In [4]:
# Data directory taken from the PATH_DATA environment variable
# (os.getenv returns None when the variable is not set).
path_data = os.getenv('PATH_DATA')
path_data

'data/'

In [5]:
# Location of the persisted vector DB, taken from PATH_PERSIST_DB
# (None when the variable is not set).
path_persist_db = os.getenv('PATH_PERSIST_DB')

## C. Retrieval Process
**Expected Input:** Query from user

```python
get_query_topic(query) -> topics  
get_filter_criterion(topics) -> filters
retrieve_chunks(query, filters)  # based on filter criterion
rerank_chunks(query, chunks) -> chunks
```
**Expected Output:** List of chunks (or docs)

In [6]:
from codes.file_to_docs import Docs2VectorDb
from codes.retrieve_docs import RetrieveDocs

In [7]:
# Load the previously persisted vector store from path_persist_db.
vector_store_multi = Docs2VectorDb.load_vector_store(path_persist_db)



In [8]:
# Distinct 'source' metadata values found in the vector store.
sources = Docs2VectorDb.sources_from_vdb(vector_store_multi)
sources

{'source': {'data/garb_in_garb_out.txt',
  'data/marketing.txt',
  'data/qna_table.csv'}}

# Generate Answers with LLM using RAG

Approaches:

1. Without using langchain:
- Create Prompt = Context + Query
    - Manually Retrieve Docs using metadata filter & retrieval method like 'mmr'
    - If method is 'mmr', filter retrieved docs based on a similarity / relevance threshold
    - rerank if required
- Pass prompt to LLM call

2. With Langchain:
- Create a base retrieval based on metadata filter & retrieval method like 'mmr'
- Do reranking if rerank==True
- invoke chain with query

In [9]:
from codes.retrieve_docs import RetrieveDocs, ReRanking
from codes.generate_w_rag import LlmWithManualRag, LlmWithRag

## Query

In [10]:
# Print the distinct values stored in the vector DB for every metadata field,
# so the filter cell below can be written against values that really exist.
metadata_keys = ['title', 'source', 'data_type', 'topic']
for key in metadata_keys:
    metadata_key = Docs2VectorDb.sources_from_vdb(vector_store_multi, key)
    # Emits two blank lines followed by the mapping, same as two separate prints.
    print('\n', metadata_key, sep='\n')



{'title': {'qna_table.csv', 'garb_in_garb_out.txt', 'marketing.txt'}}


{'source': {'data/garb_in_garb_out.txt', 'data/marketing.txt', 'data/qna_table.csv'}}


{'data_type': {'txt', 'dataframe'}}


{'topic': {'marketing, toys', 'philosophy', 'qna on topics like RB, luminate'}}


In [11]:
# Alternative query, kept for quick switching:
# query = 'How have the toy stores changed over the years?'
# Active query used by the retrieval and generation cells below.
query = 'What are the typical problems that writers face. Answer in bullet points?'

### Explicitly filter on metadata and generate response based on query

#### Filter and retrieve documents based on query

In [12]:
# Metadata conditions combined with '$and': only 'txt' chunks from marketing.txt.
# Other conditions that can be swapped in, e.g.:
#   {'data_type': {'$in': ['txt', 'dataframe']}}
#   {'topic': {'$eq': 'philosophy'}}
metadata_conditions = [
    {'title': {'$eq': 'marketing.txt'}},
    {'data_type': {'$eq': 'txt'}},
]
metadata_filt = {'filter': {'$and': metadata_conditions}}

# Retrieval sizes (a smaller alternative is k=4, fetch_k=20) merged with the filter.
search_kwargs = {'k': 20, 'fetch_k': 100, **metadata_filt}
print(search_kwargs)

{'k': 20, 'fetch_k': 100, 'filter': {'$and': [{'title': {'$eq': 'marketing.txt'}}, {'data_type': {'$eq': 'txt'}}]}}


In [13]:
# Retrieve candidate chunks with MMR search, using the k / fetch_k / metadata
# filter held in search_kwargs.
# NOTE: despite the variable name, reranking is switched off here (rerank=False).
docs_retrvd_w_reranking = RetrieveDocs.main(
    query,
    vector_store_multi,
    method_search='mmr',
    rerank=False,
    **search_kwargs,
)
RetrieveDocs.pprint_docs(docs_retrvd_w_reranking)

------------------------------
They’re perhaps just more insidious today.


The #1 Problem All Writers Face Is They Suck at Sales and Marketing
Which is why many of us are often broke and complain a lot
{'data_type': 'txt', 'topic': 'marketing, toys'}


The group of romantic writers that obsess over the writing, complain/blame a lot, and can never pay their bills.
{'data_type': 'txt', 'topic': 'marketing, toys'}


Transactional writers think about themselves first and do anything they can to score some sexy time with a reader. They think short term.
{'data_type': 'txt', 'topic': 'marketing, toys'}


These outliers are what delude most writers. They treat these rarer and rarer opportunities as a given, as long as they show up for enough years.
{'data_type': 'txt', 'topic': 'marketing, toys'}


When they get one hater they get all defensive and allow it to disrupt their entire writing empire. Successful writers don’t care about the 1% of haters that exist online and will find faults in
{

#### Filter retrieved docs based on relevance
Remove docs whose similarity to the query is below the threshold.

In [14]:
# Drop retrieved docs whose similarity to the query falls below thresh.
# k presumably caps how many docs are kept -- confirm in filter_docs_on_siml.
docs_filtd_manual = LlmWithManualRag.filter_docs_on_siml(
    query,
    docs_retrvd_w_reranking,
    thresh=0.5,
    k=4,
)
len(docs_filtd_manual)
print(docs_filtd_manual)

Starting to Embed texts ...




Starting to Embed texts ...
[0.56264985 0.546083   0.5390084  0.45970762 0.4551007  0.43661135
 0.42819524 0.42632252 0.42232603 0.41534993 0.4054547  0.3965654
 0.39085037 0.38488102 0.3840184  0.3826279  0.37508386 0.37100586
 0.36500105 0.351358  ]
[ True  True  True False False False False False False False False False
 False False False False False False False False]


3

[Document(page_content='They’re perhaps just more insidious today.\n\n\nThe #1 Problem All Writers Face Is They Suck at Sales and Marketing\nWhich is why many of us are often broke and complain a lot', metadata={'data_type': 'txt', 'source': 'data/marketing.txt', 'title': 'marketing.txt', 'topic': 'marketing, toys'}), Document(page_content='The group of romantic writers that obsess over the writing, complain/blame a lot, and can never pay their bills.', metadata={'data_type': 'txt', 'source': 'data/marketing.txt', 'title': 'marketing.txt', 'topic': 'marketing, toys'}), Document(page_content='Transactional writers think about themselves first and do anything they can to score some sexy time with a reader. They think short term.', metadata={'data_type': 'txt', 'source': 'data/marketing.txt', 'title': 'marketing.txt', 'topic': 'marketing, toys'})]


In [15]:
# Pretty-print the docs that survived the similarity filter.
RetrieveDocs.pprint_docs(docs_filtd_manual)
# RetrieveDocs.pprint_docs([doc[0] for doc in docs_filtd_manual])

------------------------------
They’re perhaps just more insidious today.


The #1 Problem All Writers Face Is They Suck at Sales and Marketing
Which is why many of us are often broke and complain a lot
{'data_type': 'txt', 'topic': 'marketing, toys'}


The group of romantic writers that obsess over the writing, complain/blame a lot, and can never pay their bills.
{'data_type': 'txt', 'topic': 'marketing, toys'}


Transactional writers think about themselves first and do anything they can to score some sexy time with a reader. They think short term.
{'data_type': 'txt', 'topic': 'marketing, toys'}




#### Create prompt inclusive of context

In [16]:
# Build the final prompt (context + instructions + question) from the filtered
# docs, keeping them in their retrieved order (no reranking).
prompt_upd_wo_rr = LlmWithManualRag.add_context_to_prompt(
    query,
    docs_filtd_manual,
    rerank=False,
)

print(prompt_upd_wo_rr)

Context: They’re perhaps just more insidious today.


The #1 Problem All Writers Face Is They Suck at Sales and Marketing
Which is why many of us are often broke and complain a lot;The group of romantic writers that obsess over the writing, complain/blame a lot, and can never pay their bills.;Transactional writers think about themselves first and do anything they can to score some sexy time with a reader. They think short term.

        Answer the question based only on the context provided. 
        If you don't know the answer, say you do not know. 
        Decide based on the question if answer can be made concise or not. 
        If so, keep answer within three sentences. Concise is better.
        If answer needs to be elaborate, generate a very structured response.
        Question: What are the typical problems that writers face. Answer in bullet points?
        


  warn_deprecated(


In [17]:
# Same prompt construction, but the context docs are reranked first using the
# 'simple' rerank method.
prompt_upd_w_rr = LlmWithManualRag.add_context_to_prompt(
    query,
    docs_filtd_manual,
    rerank=True,
    rerank_method='simple',
)

print(prompt_upd_w_rr)

Context: They’re perhaps just more insidious today.


The #1 Problem All Writers Face Is They Suck at Sales and Marketing
Which is why many of us are often broke and complain a lot;Transactional writers think about themselves first and do anything they can to score some sexy time with a reader. They think short term.;The group of romantic writers that obsess over the writing, complain/blame a lot, and can never pay their bills.

        Answer the question based only on the context provided. 
        If you don't know the answer, say you do not know. 
        Decide based on the question if answer can be made concise or not. 
        If so, keep answer within three sentences. Concise is better.
        If answer needs to be elaborate, generate a very structured response.
        Question: What are the typical problems that writers face. Answer in bullet points?
        


### Use Langchain Retriever to filter on metadata and generate response 

In [18]:
# Show the search settings that the LangChain retriever below will reuse.
search_kwargs

{'k': 20,
 'fetch_k': 100,
 'filter': {'$and': [{'title': {'$eq': 'marketing.txt'}},
   {'data_type': {'$eq': 'txt'}}]}}

In [19]:
# Base LangChain retriever over the vector store: MMR search with the same
# k / fetch_k / metadata filter used in the manual approach.
retriever_base = vector_store_multi.as_retriever(
    search_type='mmr',  # "similarity" (default), "mmr", or "similarity_score_threshold"
    search_kwargs=search_kwargs,
)

In [20]:
# only for checking: preview what the base retriever returns for the query
print(query)
docs_filtd = retriever_base.invoke(query)
RetrieveDocs.pprint_docs(docs_filtd)

What are the typical problems that writers face. Answer in bullet points?
------------------------------
They’re perhaps just more insidious today.


The #1 Problem All Writers Face Is They Suck at Sales and Marketing
Which is why many of us are often broke and complain a lot
{'data_type': 'txt', 'topic': 'marketing, toys'}


The group of romantic writers that obsess over the writing, complain/blame a lot, and can never pay their bills.
{'data_type': 'txt', 'topic': 'marketing, toys'}


Transactional writers think about themselves first and do anything they can to score some sexy time with a reader. They think short term.
{'data_type': 'txt', 'topic': 'marketing, toys'}


The life of a writer often feels cursed.

So many of us are starving artists. I’ve been writing online for 10 years and I have two distinct groups of writer friends:
{'data_type': 'txt', 'topic': 'marketing, toys'}


Harsh Truth: Nobody else is going to promote your writing
This is where the average writer goes wrong.

#### Without Reranking

In [21]:
# RAG chain built on the base retriever, with reranking disabled.
chain_multi_docs_wo_rr = LlmWithRag.create_chain(
    retriever_base,
    rerank=False,
)

#### With reranking

In [22]:
# RAG chain built on the base retriever, reranking retrieved docs with the
# 'hf_crossencoder' method.
chain_multi_docs_w_rr = LlmWithRag.create_chain(
    retriever_base,
    rerank=True,
    rerank_method='hf_crossencoder',
)



# Generate Answers with LLMs
> Switch on the VPN before running the cells below

In [23]:
# Collects the generated answers in execution order. Re-run this cell before
# re-running the append cells below, otherwise duplicates accumulate.
answers = []

## Without Langchain

### Without Reranking

In [24]:
# Manual RAG, no reranking: send the prompt built without reranking to the LLM.
response_wo_lc = LlmWithManualRag.invoke_chain(prompt_upd_wo_rr)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [25]:
# Extract the text of the response, show it, and record it for the comparison at the end.
answer = response_wo_lc.content
print(answer)
answers.append(answer)

- They struggle with sales and marketing of their work.
- They often face financial difficulties.
- They tend to complain and blame a lot.
- They may prioritize their own needs and short-term gains over long-term success.


### With Reranking

In [26]:
# Manual RAG *with* reranking: send the reranked prompt (prompt_upd_w_rr).
# Passing prompt_upd_wo_rr here would only repeat the non-reranked run above.
response_wo_lc_pl_rr = LlmWithManualRag.invoke_chain(prompt_upd_w_rr)

In [27]:
# Extract the text of the response, show it, and record it for the comparison at the end.
answer = response_wo_lc_pl_rr.content
print(answer)
answers.append(answer)

- They struggle with sales and marketing of their work.
- Many writers are often broke due to lack of income.
- They tend to complain and blame a lot.
- Some writers obsess over the writing process and struggle to pay their bills.
- Transactional writers often think about themselves first and focus on short-term gains.


## With Langchain

### Without Reranking

In [28]:
# LangChain RAG chain without reranking; the query is passed under the 'input' key.
response_w_lc = chain_multi_docs_wo_rr.invoke({'input':query})

In [29]:
# The chain returns a mapping; the generated text is under 'answer'.
answer = response_w_lc['answer']
print(answer)
answers.append(answer)

- They struggle with sales and marketing of their work.
- Many writers are often financially unstable.
- They face the challenge of self-promotion.
- They have to deal with negative feedback or 'haters'.
- They often try to sell too many things at once.
- They struggle with finding a balance between writing quality pieces and promoting their work.
- They face the unpredictability of writing platforms and social media algorithms.
- They have to deal with traditional book publishers that occasionally give out advances.


### With Reranking

In [30]:
# LangChain RAG chain with reranking; the query is passed under the 'input' key.
response_w_lc_pl_rr = chain_multi_docs_w_rr.invoke({'input':query})

In [31]:
# The chain returns a mapping; the generated text is under 'answer'.
answer = response_w_lc_pl_rr['answer']
print(answer)
answers.append(answer)

- They struggle with sales and marketing of their work.
- Many writers often face financial difficulties.
- They often feel frustrated and complain a lot.
- They may refuse to promote their work, leading to less readership.
- They may feel upset with the world when their writing is not read by many.


## All Answers Analysis

In [32]:
# Labels for the four runs, in the order their answers were appended above:
# wo_lc = without LangChain, w_lc = with LangChain, *_pl_rr = plus reranking.
methods = ['wo_lc', 'wo_lc_pl_rr', 'w_lc', 'w_lc_pl_rr']
methods

['wo_lc', 'wo_lc_pl_rr', 'w_lc', 'w_lc_pl_rr']

In [33]:
# Pair each method label with its answer. The slice is sized by len(methods)
# rather than a hard-coded count, so the labels line up with the most recent
# set of answers even when `answers` holds more entries than there are labels
# (e.g. after the append cells were run more than once).
for method, answer in zip(methods, answers[-len(methods):]):
    print(f"{method}:")
    print(answer, '\n')

wo_lc:
- They struggle with sales and marketing of their work.
- They often face financial difficulties.
- They tend to complain and blame a lot.
- They may prioritize their own needs and short-term gains over long-term success. 

wo_lc_pl_rr:
- They struggle with sales and marketing of their work.
- Many writers are often broke due to lack of income.
- They tend to complain and blame a lot.
- Some writers obsess over the writing process and struggle to pay their bills.
- Transactional writers often think about themselves first and focus on short-term gains. 

w_lc:
- They struggle with sales and marketing of their work.
- Many writers are often financially unstable.
- They face the challenge of self-promotion.
- They have to deal with negative feedback or 'haters'.
- They often try to sell too many things at once.
- They struggle with finding a balance between writing quality pieces and promoting their work.
- They face the unpredictability of writing platforms and social media algori