# LLMs and RAG with LangChain

# Code starts here

In [1]:
from codes.utils import set_ipynb_config, time_it

In [2]:
set_ipynb_config()

In [3]:
from codes.llm_gateway import (
    openAI_wo_api, 
)

In [4]:
from langchain.schema import HumanMessage, SystemMessage

In [5]:
llm = openAI_wo_api()

  warn_deprecated(


In [6]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import PyPDFLoader, TextLoader, CSVLoader

## Load data

In [7]:
# Root folder of the sample data and the single text file indexed below.
path_data = 'data/'
file_text = f'{path_data}test_doc.txt'
file_text

'data/test_doc.txt'

In [8]:
# Directory handed to Chroma as `persist_directory` when the stores are built.
# NOTE(review): absolute, user-specific path — change it before running this
# notebook on another machine.
persist_directory = '/Users/prb000j/Downloads/downloaded_models/vector_dbs/'

In [9]:
def get_document_from_directory(path, chunk_size=200, chunk_overlap=10):
    """Load one text file and return its content as a list of chunk strings.

    Despite the name, `path` is handed straight to `TextLoader`, so it must be
    the path of a single text file rather than a directory.

    Args:
        path: Path of the text file to load.
        chunk_size: Value passed to the splitter as `chunk_size`; chunk length
            is measured with `len`. Defaults to the 200 used previously.
        chunk_overlap: Value passed to the splitter as `chunk_overlap`.
            Defaults to the 10 used previously.

    Returns:
        A list of strings, one per chunk, taken from each chunk's
        `page_content` in the order the splitter produced them.
    """
    loaded_docs = TextLoader(path).load()

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
    )
    chunks = splitter.split_documents(loaded_docs)
    return [str(chunk.page_content) for chunk in chunks]

## Embeddings

In [10]:
from langchain.embeddings import SentenceTransformerEmbeddings

In [11]:
def get_embedding_func():
    """Return a `SentenceTransformerEmbeddings` built for "all-MiniLM-L6-v2"."""
    return SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

## Vector DB

In [12]:
from langchain_community.vectorstores.chroma import Chroma


In [13]:
def create_db(docs):
    """Build a Chroma vector store from LangChain documents.

    The previous version called `embedding.encode(...)` and passed the
    resulting vectors to `Chroma.from_documents` without an embedding
    function. The LangChain embeddings wrapper exposes `embed_documents` /
    `embed_query`, not `encode`, and `from_documents` expects the documents
    themselves plus an embedding function, so that call could not work.

    Args:
        docs: Document objects exposing `page_content` (not the plain strings
            returned by `get_document_from_directory`).

    Returns:
        The Chroma vector store created from `docs`.
    """
    embedding = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
    # Chroma computes the vectors itself from the documents and the embedder.
    return Chroma.from_documents(docs, embedding=embedding)


## Chain

In [14]:
from langchain_core.prompts import ChatPromptTemplate
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_retrieval_chain

In [15]:
def create_chain(retriever):
    """Assemble a retrieval chain that answers from retrieved context only.

    A "stuff documents" chain (prompt + model from `openAI_wo_api()`) is
    wrapped with `create_retrieval_chain` around the given retriever.

    Args:
        retriever: Retriever handed to `create_retrieval_chain`.

    Returns:
        The chain returned by `create_retrieval_chain`. The debugging helpers
        in this notebook invoke it with an "input" key and read "answer".
    """
    chat_model = openAI_wo_api()

    # The prompt exposes {context} and {input} placeholders.
    qa_prompt = ChatPromptTemplate.from_template("""
    Answer the question based only on the context provided. Keep the answer crisp and clear.
    Do not use your outside knowledge to answer the question.
    When you do not know an answer, just mention you do not know.
    Context: {context}
    Question: {input}                                       
    """)

    answer_chain = create_stuff_documents_chain(llm=chat_model, prompt=qa_prompt)
    return create_retrieval_chain(retriever, answer_chain)

In [16]:
docs = get_document_from_directory(file_text)

In [17]:
type(docs)
len(docs)
len(docs[0])

list

201

65

In [18]:
# Embed the chunk strings and store them in Chroma under persist_directory.
# NOTE(review): the retriever output below returns the same chunk three times;
# presumably earlier runs already wrote these chunks to the same
# persist_directory and this call appended them again — confirm, and clear or
# reuse the stored collection if so.
vectorStore = Chroma.from_texts(
    texts=docs,
    embedding=get_embedding_func(),
    persist_directory=persist_directory,
)

In [19]:
# Retriever used to sanity-check retrieval before it is wired into the chain.
# search_type may be "similarity" (default), "mmr" or
# "similarity_score_threshold"; 'score_threshold' and 'filter' are further
# search_kwargs that are left unset here.
test_retriever = vectorStore.as_retriever(
    search_type='mmr',
    search_kwargs=dict(k=5, fetch_k=20),
)

In [20]:
query = "What is cold calling"
test_retriever.invoke(query)

[Document(page_content='3. You need to get better at “cold calling”'),
 Document(page_content='3. You need to get better at “cold calling”'),
 Document(page_content='3. You need to get better at “cold calling”'),
 Document(page_content='By cold calling, I try to capture cold emails, cold intro at a party, or, why not, cold calling. This is how you can get new people in your network. The first step is to introduce yourself and get a'),
 Document(page_content='as oppose to the other types which require calls to an LLM.')]

In [21]:
chain = create_chain(test_retriever)

# Queries & Responses

In [22]:
def format_output_for_debugging(query):
    """Print the chunks retrieved for `query`, then the chain's answer.

    Relies on the notebook-level `test_retriever` and `chain`. Each retrieved
    chunk is printed, then a dashed separator, then the "answer" entry of the
    chain's response.

    Args:
        query: Question text sent to both the retriever and the chain.
    """
    for hit in test_retriever.invoke(query):
        print(hit.page_content, '\n')

    print("-" * 30, "\n")
    result = chain.invoke({"input": query})
    print(result["answer"])

In [23]:
query = "What is cold calling"
format_output_for_debugging(query)

3. You need to get better at “cold calling” 

3. You need to get better at “cold calling” 

3. You need to get better at “cold calling” 

By cold calling, I try to capture cold emails, cold intro at a party, or, why not, cold calling. This is how you can get new people in your network. The first step is to introduce yourself and get a 

as oppose to the other types which require calls to an LLM. 

------------------------------ 

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Cold calling is the practice of contacting individuals who have not previously expressed interest in the products or services you offer, typically over the phone, with the aim of making a sale or establishing a new business relationship.


In [24]:
query = "How many others were participating in the Executive Leadership Program?"
format_output_for_debugging(query)

A couple of days ago, I finished an Executive Leadership program alongside 100 other participants who wanted to explore how to become better, more effective leaders by increasing their influence and 

A couple of days ago, I finished an Executive Leadership program alongside 100 other participants who wanted to explore how to become better, more effective leaders by increasing their influence and 

A couple of days ago, I finished an Executive Leadership program alongside 100 other participants who wanted to explore how to become better, more effective leaders by increasing their influence and 

with how many people. 

If you launch work to nurture your community, who in your company or group will need time to do this work well?
If your community are important external stakeholders, who are the internal versions? 

------------------------------ 

There were 100 other participants in the Executive Leadership Program.


In [25]:
query = "As per me, which was the most important session in the executive leadership program"
format_output_for_debugging(query)

A couple of days ago, I finished an Executive Leadership program alongside 100 other participants who wanted to explore how to become better, more effective leaders by increasing their influence and 

A couple of days ago, I finished an Executive Leadership program alongside 100 other participants who wanted to explore how to become better, more effective leaders by increasing their influence and 

A couple of days ago, I finished an Executive Leadership program alongside 100 other participants who wanted to explore how to become better, more effective leaders by increasing their influence and 

and impact. There were a lot of exciting sessions, but the one on networking struck me as the most important. 

What it means for someone to be elevated into a power user or ambassador role
And moments could include: 

------------------------------ 

As per you, the most important session in the executive leadership program was the one on networking.


In [26]:
query = "What was the revenue for Apple products?"
format_output_for_debugging(query)

iPhone revenue: $43.81 billion | Mac revenue: $7.61 billion | iPad revenue: $6.44 billion | Wearable revenue: $9.32 billion | Services revenue: $22.31 billion 

iPhone revenue: $43.81 billion | Mac revenue: $7.61 billion | iPad revenue: $6.44 billion | Wearable revenue: $9.32 billion | Services revenue: $22.31 billion 

iPhone revenue: $43.81 billion | Mac revenue: $7.61 billion | iPad revenue: $6.44 billion | Wearable revenue: $9.32 billion | Services revenue: $22.31 billion 

Continuous revenue from products and services
Complementary product and pricing in services
Apple’s Q4 2023 Services revenue: $22.31 billion 

Willingness to pay, demand and revenue maximization
Apple’s Q4 2023 — Revenue: $89.5 billion | Gross margin: 45.2% (link) 

------------------------------ 

The revenue for Apple products in Q4 2023 was $67.18 billion. This is calculated by adding the iPhone, Mac, iPad, and Wearable revenues ($43.81 billion + $7.61 billion + $6.44 billion + $9.32 billion).


# Multiple Documents

In [16]:
# Folder holding the per-topic text files used for the multi-document store.
path_docs = f'{path_data}multi_docs/'
path_docs

'data/multi_docs/'

In [17]:
marketing_docs = get_document_from_directory(path_docs + 'marketing.txt')

In [18]:
finance_docs = get_document_from_directory(path_docs + 'personal_finance.txt')

In [19]:
# Index the marketing chunks in a dedicated 'multiple_docs' collection.
# Supply one metadata dict per chunk: Chroma pads a shorter `metadatas` list
# with empty dicts, so a single-element list would tag only the first chunk.
vectorStore_multi = Chroma.from_texts(
    marketing_docs,
    get_embedding_func(),
    metadatas=[{'topics': 'marketing, sales'} for _ in marketing_docs],
    collection_name='multiple_docs',
    persist_directory=persist_directory,
)

In [20]:
# Add the finance chunks to the same collection, tagging every chunk.
# A single-element `metadatas` list would leave all chunks after the first
# with empty metadata, because Chroma pads the list with empty dicts.
_ = vectorStore_multi.add_texts(
    finance_docs,
    metadatas=[{'topics': 'finance, money'} for _ in finance_docs],
)

In [21]:
# Retriever over the multi-document collection, configured like the
# single-document one. search_type may be "similarity" (default), "mmr" or
# "similarity_score_threshold"; 'score_threshold' and 'filter' are further
# search_kwargs that are left unset here.
test_retriever_multi = vectorStore_multi.as_retriever(
    search_type='mmr',
    search_kwargs=dict(k=5, fetch_k=20),
)

In [22]:
chain_multi_docs = create_chain(test_retriever_multi)

In [23]:
def format_output_for_debugging_multi(query, test_wo_rag: bool = False):
    """Print retrieved chunks and the RAG answer, optionally a no-RAG answer.

    Relies on the notebook-level `test_retriever_multi`, `chain_multi_docs`
    and `llm`.

    Args:
        query: Question text sent to the retriever, the chain and, when
            requested, the bare LLM.
        test_wo_rag: When true, also sends `query` (prefixed with brevity
            instructions) directly to `llm` and prints that response's
            `content` for comparison.
    """
    rule = "-" * 30

    for hit in test_retriever_multi.invoke(query):
        print(hit.page_content, '\n')

    print(rule, "\n")
    rag_result = chain_multi_docs.invoke({"input": query})
    print(rule)
    print("Output using RAG")
    print(rule)
    print(rag_result["answer"])

    # Optional baseline: the same question without any retrieved context.
    if test_wo_rag:
        print('\n')
        print(rule)
        print("Output without using RAG")
        print(rule, '\n')
        bare_prompt = f"""
        Keep the answer crisp and clear. Keep the answer to less than 200 words.
        When you do not know an answer, just mention you do not know.
        {query}
        """
        bare_result = llm.invoke(bare_prompt)
        print(bare_result.content)

In [24]:
# query = "Write a short note on `gender inclusive products`"
query = "Tell me about `gender inclusive products`"
# query = "Tell me about gender inclusive products" # I do not know.
format_output_for_debugging_multi(query, True)

But the reality is, gender-inclusive products and marketing strategies for toys still tend to be the exception rather than the rule. 

That’s also the finding of the latest study on this topic done in collaboration between the Geena Davis Institute on Gender in Media and two chairs of the first North American Toy Association DEI, 

well — is still gendered, too. Actually, according to a new analysis of over 10,000 globally run ads, gender-stereotypical advertising has only become more prevalent in recent years. 

targeted toward girls and cars, sporting equipment and toy weapons targeted toward boys, and only around 23% falling into the gender-neutral category. 

sports equipment. Meanwhile, girls got dolls, dollhouses, beauty sets and… toy kitchen appliances. 

------------------------------ 

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if p

In [25]:
query = "What are the different ways to ask for financial help"
format_output_for_debugging_multi(query, True)

If you’re uncomfortable asking for money directly, try getting a loan and paying them back over time. It’s also okay to ask for non-monetary help, such as coming over for a home-cooked meal or asking 

- The general good advice is generally explained in an overly complex manner, too complicated for the average Joe or the old-school, cash holding relatives 

in the right direction towards your financial goals. 

FAQs
What if I’m in debt? 

You can find many resources online to help you better understand personal finance. Books, articles, and online courses cover topics such as budgeting, saving, and investing. 

------------------------------ 

------------------------------
Output using RAG
------------------------------
The different ways to ask for financial help include asking for a loan and repaying it over time, and requesting non-monetary assistance like having meals with friends or family, or seeking advice and guidance on financial matters.


------------------------------
Outp

In [26]:
query = "What are the top challenges in investing in real estate?"
# query = "What are the top problems in investing in real estate?"  # I do not know.
format_output_for_debugging_multi(query, True)

Real estate is finite. You can’t create more of it. In high-demand areas, real estate goes for top dollar. Investors and developers have an incentive to earn the maximum return on their investment — 

financial hardships. 

“When the rich and the poor compete for housing on the open market, the poor don’t stand a chance.” (202) 

Invest
Once you know what you spend on, strategically plan it so that you can still continue saving and pay off your debts correctly, you are ready to invest. 

25% into certain stocks and ETFs that I consider huge potential
25% into Crypto ( I know this is controversial but to each their own ) it can also be wines, whisky or towards a Berkin bag 

------------------------------ 

------------------------------
Output using RAG
------------------------------
Based on the context provided, the top challenges in investing in real estate include:

1. High Costs: Real estate in high-demand areas is expensive.
2. Competition: The market is competitive, especially b