# LLMs and RAG with LangChain

# Code starts here

In [2]:
from codes.utils import set_ipynb_config, time_it

In [3]:
# Apply notebook-wide settings through the project helper imported from codes.utils.
# NOTE(review): the helper's exact effects are not visible in this notebook — confirm in codes/utils.
set_ipynb_config()

In [12]:
from dotenv import load_dotenv

In [13]:
# Read key/value pairs from a .env file into the process environment.
# Presumably this is where the OpenAI API key comes from — TODO confirm.
load_dotenv()

True

In [6]:
import os

In [69]:
# Candidate OpenAI model identifiers. The first entry is handled as a completion-style
# model (OpenAI wrapper) further down; the others go through ChatOpenAI.
model_names = ["gpt-3.5-turbo-instruct", "gpt-3.5-turbo-1106", "gpt-4", "gpt-4o"]
# Active model for the rest of the notebook (index 1 -> 'gpt-3.5-turbo-1106').
model = model_names[1]
model

'gpt-3.5-turbo-1106'

In [70]:
from langchain.llms.openai import OpenAI
from langchain_community.chat_models import ChatOpenAI

In [70]:
query = "Who have been the Presidents of India?"

# Pick the LangChain wrapper that matches the selected model:
#   * completion-style model -> OpenAI      (invoke() returns a plain str)
#   * chat models            -> ChatOpenAI  (invoke() returns a message; text is in .content)
if model == "gpt-3.5-turbo-instruct":
    llm = OpenAI(model_name=model, temperature=0.)
elif model in ("gpt-3.5-turbo-1106", "gpt-4", "gpt-4o"):
    llm = ChatOpenAI(model_name=model, temperature=0.)
else:
    # Same exception type as before, but now it says which model was rejected.
    raise NotImplementedError(f"Unsupported model: {model!r}")

# The call and the print were identical in every branch, so they live here once.
response = llm.invoke(query)
print(response)

content='The Presidents of India have been:\n\n1. Dr. Rajendra Prasad (1950-1962)\n2. Dr. Sarvepalli Radhakrishnan (1962-1967)\n3. Dr. Zakir Husain (1967-1969)\n4. V. V. Giri (1969-1974)\n5. Fakhruddin Ali Ahmed (1974-1977)\n6. Neelam Sanjiva Reddy (1977-1982)\n7. Giani Zail Singh (1982-1987)\n8. R. Venkataraman (1987-1992)\n9. Dr. Shankar Dayal Sharma (1992-1997)\n10. K. R. Narayanan (1997-2002)\n11. Dr. A. P. J. Abdul Kalam (2002-2007)\n12. Pratibha Patil (2007-2012)\n13. Pranab Mukherjee (2012-2017)\n14. Ram Nath Kovind (2017-present)' response_metadata={'token_usage': {'completion_tokens': 227, 'prompt_tokens': 15, 'total_tokens': 242}, 'model_name': 'gpt-3.5-turbo-1106', 'system_fingerprint': None, 'finish_reason': 'stop', 'logprobs': None} id='run-4c33c0c8-f39f-4179-8384-eaafa93914bc-0'


In [71]:
# With a chat model the response is a message object; .content holds just the answer text.
print(response.content)

The Presidents of India have been:

1. Dr. Rajendra Prasad (1950-1962)
2. Dr. Sarvepalli Radhakrishnan (1962-1967)
3. Dr. Zakir Husain (1967-1969)
4. V. V. Giri (1969-1974)
5. Fakhruddin Ali Ahmed (1974-1977)
6. Neelam Sanjiva Reddy (1977-1982)
7. Giani Zail Singh (1982-1987)
8. R. Venkataraman (1987-1992)
9. Dr. Shankar Dayal Sharma (1992-1997)
10. K. R. Narayanan (1997-2002)
11. Dr. A. P. J. Abdul Kalam (2002-2007)
12. Pratibha Patil (2007-2012)
13. Pranab Mukherjee (2012-2017)
14. Ram Nath Kovind (2017-present)


In [46]:
# Printing the message object itself shows the text together with response_metadata and the run id.
print(response)

content='The Presidents of India have been:\n\n1. Dr. Rajendra Prasad (1950-1962)\n2. Dr. Sarvepalli Radhakrishnan (1962-1967)\n3. Dr. Zakir Husain (1967-1969)\n4. V. V. Giri (1969-1974)\n5. Fakhruddin Ali Ahmed (1974-1977)\n6. Neelam Sanjiva Reddy (1977-1982)\n7. Giani Zail Singh (1982-1987)\n8. R. Venkataraman (1987-1992)\n9. Dr. Shankar Dayal Sharma (1992-1997)\n10. K. R. Narayanan (1997-2002)\n11. Dr. A. P. J. Abdul Kalam (2002-2007)\n12. Pratibha Patil (2007-2012)\n13. Pranab Mukherjee (2012-2017)\n14. Ram Nath Kovind (2017-present)' response_metadata={'token_usage': {'completion_tokens': 227, 'prompt_tokens': 15, 'total_tokens': 242}, 'model_name': 'gpt-3.5-turbo-1106', 'system_fingerprint': None, 'finish_reason': 'stop', 'logprobs': None} id='run-565cadbd-fa0d-4c2d-b2b2-c80e5909963a-0'


In [18]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import PyPDFLoader, TextLoader, CSVLoader

## Load data

In [21]:
# Relative directory holding the source .txt files. The trailing slash matters because
# file names are appended to it by plain string concatenation later on.
path_data = 'data/'

In [20]:
# Directory where Chroma persists its collections.
# Set the VECTOR_DB_DIR environment variable to relocate it on another machine;
# without it the original hard-coded location is used, so existing runs are unaffected.
persist_directory = os.environ.get(
    'VECTOR_DB_DIR',
    '/Users/pinaki/Downloads/models/vector-dbs/',
)
persist_directory

'/Users/pinaki/Downloads/models/vector-dbs/'

In [22]:
def get_document_from_directory(path):
    """Load a text file and return its content as a list of chunk strings.

    Despite the name, ``path`` is handed straight to ``TextLoader``, so it is
    the path of a single text file rather than a directory.

    Args:
        path: Location of the text file to load.

    Returns:
        list[str]: ``page_content`` of every chunk produced by a
        ``RecursiveCharacterTextSplitter`` configured with 200-character
        chunks and a 10-character overlap.
    """
    loaded = TextLoader(path).load()

    chunker = RecursiveCharacterTextSplitter(
        chunk_size=200,
        chunk_overlap=10,
        length_function=len,
    )
    return [str(chunk.page_content) for chunk in chunker.split_documents(loaded)]

## Embeddings

In [23]:
from langchain.embeddings import SentenceTransformerEmbeddings

In [24]:
def get_embedding_func():
    """Return a new SentenceTransformerEmbeddings instance for "all-MiniLM-L6-v2"."""
    return SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

## Vector DB

In [25]:
from langchain_community.vectorstores.chroma import Chroma


In [26]:
def create_db(docs):
    """Build a Chroma vector store from LangChain documents.

    The previous version called ``embedding.encode(...)`` (the LangChain
    wrapper exposes ``embed_documents`` / ``embed_query`` instead) and then
    passed the resulting vectors to ``Chroma.from_documents`` without an
    embedding function. ``from_documents`` expects the documents themselves
    plus the embedding object and does the embedding internally.

    Args:
        docs: Iterable of LangChain ``Document`` objects (each with
            ``page_content``) to index.

    Returns:
        The populated ``Chroma`` vector store.
    """
    embedding = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
    # Chroma embeds the documents itself using the supplied embedding function.
    return Chroma.from_documents(docs, embedding=embedding)


## Chain

In [27]:
from langchain_core.prompts import ChatPromptTemplate
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_retrieval_chain

In [37]:
def create_chain(retriever):
    """Assemble a retrieval chain that answers only from retrieved context.

    Args:
        retriever: Retriever passed as the first argument to
            ``create_retrieval_chain``.

    Returns:
        The chain produced by ``create_retrieval_chain``. The prompt expects
        the question under the ``input`` key and the retrieved text under
        ``context``.
    """
    completion_llm = OpenAI(model_name="gpt-3.5-turbo-instruct", temperature=0.)

    # Prompt text is kept verbatim; {context} and {input} are the template variables.
    rag_prompt = ChatPromptTemplate.from_template("""
    Answer the question based only on the context provided. Keep the answer crisp and clear.
    Do not use your outside knowledge to answer the question.
    When you do not know an answer, just mention you do not know.
    Context: {context}
    Question: {input}                                       
    """)

    answer_chain = create_stuff_documents_chain(llm=completion_llm, prompt=rag_prompt)
    return create_retrieval_chain(retriever, answer_chain)

# Queries & Responses

# Multiple Documents

In [28]:
# The multi-document example reads its files from the same data directory.
path_docs = path_data
path_docs

'data/'

In [29]:
# List of chunk strings split from the marketing text file.
marketing_docs = get_document_from_directory(path_docs + 'marketing.txt')

In [30]:
# List of chunk strings split from the personal-finance text file.
finance_docs = get_document_from_directory(path_docs + 'personal_finance.txt')

In [32]:
# Create the persisted 'multiple_docs' collection from the marketing chunks.
# `metadatas` must line up one-to-one with the texts: a single-element list tags only
# the first chunk and the rest are padded with empty metadata, so build one dict per chunk.
vectorStore_multi = Chroma.from_texts(
    marketing_docs,
    get_embedding_func(),
    metadatas=[{'topics': 'marketing, sales'} for _ in marketing_docs],
    collection_name='multiple_docs',
    persist_directory=persist_directory
)



In [33]:
# Add the finance chunks to the same collection.
# One metadata dict per chunk, so every finance chunk carries the topic tag
# (a single-element list would tag only the first chunk).
_ = vectorStore_multi.add_texts(
    finance_docs,
    metadatas=[{'topics': 'finance, money'} for _ in finance_docs],
)

In [35]:
# Retriever used for the multi-document tests.
#
# Available knobs for as_retriever():
#   search_type   -> "similarity" (default), "mmr", or "similarity_score_threshold"
#   search_kwargs -> optional keys such as 'k', 'fetch_k', 'score_threshold', 'filter'
#
# Here: MMR search that fetches 20 candidates and keeps 5.
test_retriever_multi = vectorStore_multi.as_retriever(
    search_type='mmr',
    search_kwargs=dict(k=5, fetch_k=20),
)

In [38]:
# Retrieval chain wired to the MMR retriever over the combined marketing + finance collection.
chain_multi_docs = create_chain(test_retriever_multi)

In [39]:
def format_output_for_debugging_multi(query, test_wo_rag: bool = False):
    """Print the retrieved chunks and the RAG answer for ``query``.

    Relies on the notebook-level ``test_retriever_multi``, ``chain_multi_docs``
    and (when ``test_wo_rag`` is true) ``llm``.

    Args:
        query: Question text sent to the retriever and to the chain.
        test_wo_rag: When true, also send the question directly to ``llm``
            with no retrieved context and print that answer for comparison.

    Returns:
        None. All results are printed.
    """
    # Show what the retriever returns, to judge the context quality.
    for retrieved in test_retriever_multi.invoke(query):
        print(retrieved.page_content, '\n')

    print("-"*30, "\n")
    response = chain_multi_docs.invoke({
        "input": query,
    })
    print("-"*30)
    print("Output using RAG")
    print("-"*30)
    print(response["answer"])

    # use llm without rag
    if test_wo_rag:
        print('\n')
        print("-"*30)
        print("Output without using RAG")
        print("-"*30, '\n')
        query_upd = f"""
        Keep the answer crisp and clear. Keep the answer to less than 200 words.
        When you do not know an answer, just mention you do not know.
        {query}
        """
        response_wo_rag = llm.invoke(query_upd)
        # Chat models return a message whose text is in .content, while the
        # completion-style wrapper returns a plain str; handle both instead of
        # failing with AttributeError on the latter.
        print(getattr(response_wo_rag, "content", response_wo_rag))

In [72]:
# Alternative phrasings kept for comparison:
# query = "Write a short note on `gender inclusive products`"
query = "Tell me about `gender inclusive products`"
# query = "Tell me about gender inclusive products" # original note for this variant: "I do not know."
# Second argument turns on the extra answer from the LLM without retrieval.
format_output_for_debugging_multi(query, True)

But the reality is, gender-inclusive products and marketing strategies for toys still tend to be the exception rather than the rule. 

That’s also the finding of the latest study on this topic done in collaboration between the Geena Davis Institute on Gender in Media and two chairs of the first North American Toy Association DEI, 

well — is still gendered, too. Actually, according to a new analysis of over 10,000 globally run ads, gender-stereotypical advertising has only become more prevalent in recent years. 

targeted toward girls and cars, sporting equipment and toy weapons targeted toward boys, and only around 23% falling into the gender-neutral category. 

sports equipment. Meanwhile, girls got dolls, dollhouses, beauty sets and… toy kitchen appliances. 

------------------------------ 

------------------------------
Output using RAG
------------------------------

Answer: Gender-inclusive products are products that are designed and marketed in a way that does not conform to tr

In [73]:
# Finance-side question; True also prints the answer obtained without retrieval.
query = "What are the different ways to ask for financial help"
format_output_for_debugging_multi(query, True)

If you’re uncomfortable asking for money directly, try getting a loan and paying them back over time. It’s also okay to ask for non-monetary help, such as coming over for a home-cooked meal or asking 

- The general good advice is generally explained in an overly complex manner, too complicated for the average Joe or the old-school, cash holding relatives 

in the right direction towards your financial goals. 

FAQs
What if I’m in debt? 

You can find many resources online to help you better understand personal finance. Books, articles, and online courses cover topics such as budgeting, saving, and investing. 

------------------------------ 

------------------------------
Output using RAG
------------------------------

Answer: Some ways to ask for financial help include getting a loan and paying it back over time, asking for non-monetary help, and seeking resources online to better understand personal finance.


------------------------------
Output without using RAG
---------------

In [74]:
query = "What are the top challenges in investing in real estate?"
# Alternative wording; original note for this variant: "I do not know."
# query = "What are the top problems in investing in real estate?"  # I do not know.
# True also prints the answer obtained without retrieval.
format_output_for_debugging_multi(query, True)

Real estate is finite. You can’t create more of it. In high-demand areas, real estate goes for top dollar. Investors and developers have an incentive to earn the maximum return on their investment — 

financial hardships. 

wage income, much less own a home with a white picket fence and raise 2.4 children. Now that isn’t even possible for dual-income households with advanced degrees. 

“When the rich and the poor compete for housing on the open market, the poor don’t stand a chance.” (202) 

Choose an investment budget out of those saved money. I usually go for no more than 50% of the saved ones.
50% goes into a long-term ISA 

------------------------------ 

------------------------------
Output using RAG
------------------------------

Answer: The top challenges in investing in real estate are limited availability of properties, high demand in certain areas leading to high prices, and financial hardships for those with lower income or savings.


------------------------------
Output