In [None]:
!pip3 install openai --quiet
!pip3 install langchain --quiet
!pip3 install cohere --quiet
!pip3 install tiktoken --quiet
!pip3 install langchain_community

In [None]:
import os

# Read the API keys from Colab's `userdata` store instead of hard-coding them,
# and expose each one as an environment variable of the same name.
from google.colab import userdata

for secret_name in ("OPENAI_API_KEY", "COHERE_API_KEY"):
    os.environ[secret_name] = userdata.get(secret_name)

In [None]:
# Helper function for printing docs


def pretty_print_docs(docs):
    """Print the text of each document, numbered from 1.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string.

    Consecutive documents are separated by a line of 100 dashes. Nothing is
    returned; the formatted text is written to stdout.
    """
    separator = "\n" + "-" * 100 + "\n"
    sections = []
    for position, doc in enumerate(docs, start=1):
        sections.append(f"Document {position}:\n\n" + doc.page_content)
    print(separator.join(sections))

# Step-1: Document Loading

In [None]:
!pip3 install pypdf --quiet

In [None]:
from langchain.document_loaders import PyPDFLoader

In [None]:
#!wget https://raw.githubusercontent.com/giridhar276/Datasets/master/Agreements/EMPLOYEE_AGREEMENT.pdf
#loader = PyPDFLoader("EMPLOYEE_AGREEMENT.pdf")
#pages = loader.load()
#print(len(pages))

In [None]:
import requests
from langchain_community.document_loaders import PyPDFLoader

# Step 1: Download the PDF using requests
url = "https://raw.githubusercontent.com/giridhar276/Datasets/master/Agreements/EMPLOYEE_AGREEMENT.pdf"
# A timeout keeps the cell from hanging indefinitely on a stalled connection.
response = requests.get(url, timeout=60)
# Fail loudly on an HTTP error instead of saving the error page as a ".pdf"
# and reporting success.
response.raise_for_status()

with open("EMPLOYEE_AGREEMENT.pdf", "wb") as f:
    f.write(response.content)

print("File downloaded successfully!")

# Step 2: Load PDF with PyPDFLoader
loader = PyPDFLoader("EMPLOYEE_AGREEMENT.pdf")
pages = loader.load()

# Step 3: Check number of pages
print("Total pages in PDF:", len(pages))

In [None]:
# Concatenate the text of every page back-to-back (same result as the
# original `+=` loop, so the character and line counts are unchanged).
full_text = "".join(page.page_content for page in pages)

print("Pages", len(pages))
print("Lines" , len(full_text.split("\n")))
# split() with no argument splits on any run of whitespace. split(" ")
# counted newline-separated words as one word and produced empty "words"
# for consecutive spaces.
print("Words" , len(full_text.split()))
print("Characters", len(full_text))

# Step-2: Split the data into Chunks

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [None]:
# Split the loaded pages into chunks (chunk_size=300, chunk_overlap=30)
# and report how many chunks were produced.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=300, chunk_overlap=30)
chunks = text_splitter.split_documents(pages)
print(len(chunks))

In [None]:
print(chunks[1])

# Step-3: Creating embeddings

In [None]:
from langchain.embeddings import OpenAIEmbeddings, CohereEmbeddings

In [None]:
embeddings = OpenAIEmbeddings()
#embeddings = CohereEmbeddings(user_agent="langchain")
#embeddings = OpenAIEmbeddings(model="text-embedding-3-small")  # 1536-dim

In [None]:
embeddings

In [None]:
# A sample embedding: embed one short query string and print the resulting vector

sample_embedding = embeddings.embed_query("You must follow the rules")
print(sample_embedding)

In [None]:
# Embedding of a real chunk from the PDF.
# It is stored under its own name so that `sample_embedding` (the embedding of
# "You must follow the rules" from the previous cell) is not overwritten; a
# later cell prints `sample_embedding` next to the document embedding of that
# same sentence.

chunk_embedding = embeddings.embed_query(chunks[1].page_content)
print(len(chunk_embedding))

In [None]:
len(sample_embedding)

In [None]:
# Two short example sentences, embedded together with one embed_documents call
sample_docs=["You must follow the rules",
            "You must not disclose the rules"]

# embeded_vectors1 holds the result of embedding sample_docs
embeded_vectors1= embeddings.embed_documents(sample_docs)
print(embeded_vectors1)

In [None]:
print(len(embeded_vectors1))
print(len(embeded_vectors1[0]))

In [None]:
# Print the query embedding followed by the first document embedding.
for vector in (sample_embedding, embeded_vectors1[0]):
    print(vector)

# Step-4: Storing in Vector Stores

In [None]:
!pip3 install chromadb -q

In [None]:
from langchain.vectorstores import Chroma

In [None]:
# Embed every chunk and store the vectors in a Chroma collection that is
# kept on disk under the "emp_rules_db1" directory.
emp_rules_db = Chroma.from_documents(
    documents=chunks,
    embedding=embeddings,
    persist_directory="emp_rules_db1",
)
emp_rules_db.persist()

# Step-5: Retrieval

In [None]:
# Retriever over the employee-agreement vector store.
retriever = emp_rules_db.as_retriever()
# `invoke` is the supported retriever entry point; `get_relevant_documents`
# is deprecated in current LangChain releases.
result = retriever.invoke("What is the policy for sick leaves")
result

In [None]:
# Show the metadata of every retrieved document, one per line.
for retrieved_doc in result:
    print(retrieved_doc.metadata)

In [None]:
# Retriever over the employee-agreement vector store.
retriever = emp_rules_db.as_retriever()
# `invoke` replaces the deprecated `get_relevant_documents`.
result = retriever.invoke("What is base compensation")
result

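You can also configure the retriever with a similarity score threshold so that it only returns documents whose score is above that threshold, e.g. `emp_rules_db.as_retriever(search_type="similarity_score_threshold", search_kwargs={"score_threshold": 0.5})`.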


### pretty_print_docs

In [None]:
# Retriever over the employee-agreement vector store.
retriever = emp_rules_db.as_retriever()
# `invoke` replaces the deprecated `get_relevant_documents`.
result = retriever.invoke("What is the policy for insurance?")
pretty_print_docs(result)

In [None]:
# Show the metadata of every retrieved document, one per line.
for retrieved_doc in result:
    print(retrieved_doc.metadata)

In [None]:
# Retriever over the employee-agreement vector store.
retriever = emp_rules_db.as_retriever()
# `invoke` replaces the deprecated `get_relevant_documents`.
result = retriever.invoke("Employee Monthly Salary ")
result

In [None]:
# Show the metadata of every retrieved document, one per line.
for retrieved_doc in result:
    print(retrieved_doc.metadata)

# RAG Example-2

In [None]:
import requests
from langchain.document_loaders import PyPDFLoader

# Step 1: Download PDF using requests
url = "https://raw.githubusercontent.com/giridhar276/Datasets/master/Banking_System_Doc/BASEL.pdf"
pdf_path = "BASEL.pdf"

# A timeout keeps the cell from hanging indefinitely on a stalled connection.
response = requests.get(url, timeout=60)
# Fail loudly on an HTTP error instead of saving the error page as a ".pdf".
response.raise_for_status()
with open(pdf_path, "wb") as f:
    f.write(response.content)

# Step 2: Load PDF with PyPDFLoader
loader = PyPDFLoader(pdf_path)
pages = loader.load()

print(f"Total pages loaded: {len(pages)}")

In [None]:
#Step-2:Split the data into Chunks

from langchain.text_splitter import RecursiveCharacterTextSplitter
# Split the loaded pages into chunks (chunk_size=500, chunk_overlap=50)
# and report how many chunks were produced.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
chunks = text_splitter.split_documents(pages)
print(len(chunks))

In [None]:
#Step-3: Creating Embeddings
from langchain.embeddings import OpenAIEmbeddings, CohereEmbeddings
# OpenAI embeddings are used here; the commented Cohere line is the alternative.
embeddings = OpenAIEmbeddings()
#embeddings = CohereEmbeddings(user_agent="langchain")


#Step-4: Storing in a Vector DB
#!rm -rf basel_norms_db
# Embed every chunk and keep the collection on disk under "basel_norms_db1".
basel_norms_db = Chroma.from_documents(
    documents=chunks,
    embedding=embeddings,
    persist_directory="basel_norms_db1",
)
basel_norms_db.persist()

In [None]:
#Step-5 Retrieval

# Retriever over the BASEL vector store.
retriever = basel_norms_db.as_retriever()
# `invoke` replaces the deprecated `get_relevant_documents`.
result = retriever.invoke("What percentage is the minimum Capital Requirements")
print(result)
print(len(result))
print([i.metadata for i in result])

In [None]:
# Retriever over the BASEL vector store.
retriever = basel_norms_db.as_retriever()
# `invoke` replaces the deprecated `get_relevant_documents`.
result = retriever.invoke("What are PD and LGD")
print(result)
print(len(result))
print([i.metadata for i in result])

# MultiQueryRetriever

In [None]:
!pip3 install wikipedia -q

In [None]:
from langchain.document_loaders import wikipedia

# Document loading: load documents for the query "MS Dhoni" via WikipediaLoader
loader=wikipedia.WikipediaLoader(query="MS Dhoni")
documents=loader.load()

# Splitting: cut the loaded documents into chunks (chunk_size=500, chunk_overlap=50)
from langchain.text_splitter import RecursiveCharacterTextSplitter
text_splitter=RecursiveCharacterTextSplitter(chunk_size=500,chunk_overlap=50)
docs=text_splitter.split_documents(documents)
print(len(docs))

# Embeddings and vector DB: embed the chunks with OpenAI embeddings
from langchain.embeddings import OpenAIEmbeddings
embeddings=OpenAIEmbeddings()
#embeddings=CohereEmbeddings(user_agent="langchain")

# Store the embedded chunks in a Chroma collection under the "wiki_db1" directory
embeddings_db=Chroma.from_documents(docs,embeddings,
                                    persist_directory="wiki_db1")
embeddings_db.persist()



In [None]:
from langchain.retrievers import MultiQueryRetriever
from langchain.chains import RetrievalQA
from langchain.llms import OpenAI , Cohere


llm=OpenAI(temperature=0)
#llm=Cohere(temperature=0)

In [None]:
# Build a MultiQueryRetriever from the retriever of `embeddings_db`
# (the "wiki_db1" vector store) and the `llm` defined above.
llm_based_retriver=MultiQueryRetriever.from_llm(
    retriever=embeddings_db.as_retriever(),
    llm=llm
)

In [None]:
llm_based_retriver

In [None]:
import logging
# Configure logging at INFO level and set the
# "langchain.retrievers.multi_query" logger to INFO as well.
logging.basicConfig(level=logging.INFO)
logging.getLogger("langchain.retrievers.multi_query").setLevel(logging.INFO)

In [None]:
question1 = "What is the DOB of Dhoni?"
# `invoke` replaces the deprecated `get_relevant_documents`.
rel_docs1 = llm_based_retriver.invoke(question1)


In [None]:

question2 = "What Sport does Dhoni Play?"

# `invoke` replaces the deprecated `get_relevant_documents`.
rel_docs2 = llm_based_retriver.invoke(question2)

In [None]:
rel_docs1

In [None]:
rel_docs2

# Contextual compression

In [None]:
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor

In [None]:
# LLM used by the compressor (temperature=0)
llm=OpenAI(temperature=0)
#llm=Cohere(temperature=0)

# Document compressor built from the LLM
compressor=LLMChainExtractor.from_llm(llm)

# Retriever that combines the compressor with the retriever of `embeddings_db`
compression_retriever=ContextualCompressionRetriever(
    base_compressor=compressor,
    base_retriever=embeddings_db.as_retriever()
)

In [None]:
# `invoke` replaces the deprecated `get_relevant_documents`.
compressed_docs = compression_retriever.invoke(question1)

In [None]:
compressed_docs[0].metadata

In [None]:
compressed_docs[0].metadata["summary"]

In [None]:
'''
question -----> question1,  ----> summary1
                 question2, ----> summary2
                 question3 ----- > summary3

               summary1 + summary2 + summary3 = final summary
'''

# RetrievalQA Chain

In [None]:
question1 = "What is the DOB of Dhoni?"
question2 = "What Sport does Dhoni Play?"
# `invoke` replaces the deprecated `get_relevant_documents`.
rel_docs1 = llm_based_retriver.invoke(question1)
rel_docs2 = llm_based_retriver.invoke(question2)

In [None]:
from langchain.chains import RetrievalQA
# LLM used to answer the questions (temperature=0)
llm=OpenAI(temperature=0)
#llm=Cohere(temperature=0)

# RetrievalQA chain that fetches its documents through `llm_based_retriver`
Q_AChain=RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",# It takes a list of documents, inserts them all into a prompt
    retriever=llm_based_retriver
)

In [None]:
query = "What is the DOB of Dhoni?"
# `Chain.invoke` replaces the deprecated direct call `Q_AChain({...})`.
# NOTE: `docs` is rebound here from the list of split chunks to the chain's
# response dict.
docs = Q_AChain.invoke({"query": query})
docs["result"]

In [None]:
docs

In [None]:
#question = what is the DOB of Dhoni ---> 3 queries ---> each query will be having some response ------>. searching on top of it ----> final answer

In [None]:
print(Q_AChain.combine_documents_chain.llm_chain.prompt.template)

In [None]:
query = "what is the capital of Libya"
# `Chain.invoke` replaces the deprecated direct call `Q_AChain({...})`.
docs = Q_AChain.invoke({"query": query})
docs["result"]

# Validation of the Results

In [None]:
import requests

url = "https://raw.githubusercontent.com/giridhar276/Datasets/master/COI/COI.pdf"
pdf_path = "COI.pdf"

# A timeout keeps the cell from hanging indefinitely on a stalled connection.
response = requests.get(url, timeout=60)
# Fail loudly on an HTTP error instead of saving the error page as a ".pdf"
# and reporting "Download complete".
response.raise_for_status()
with open(pdf_path, "wb") as f:
    f.write(response.content)

print("Download complete:", pdf_path)

In [None]:
#COI Data
#!wget https://raw.githubusercontent.com/giridhar276/Datasets/master/COI/COI.pdf
from langchain.document_loaders import PyPDFLoader

#Document Loading
loader = PyPDFLoader("COI.pdf")
pages = loader.load()

# Concatenate the text of every page back-to-back (same result as the
# original `+=` loop, so the character and line counts are unchanged).
full_text = "".join(page.page_content for page in pages)

print("Pages", len(pages))
print("Lines" , len(full_text.split("\n")))
# split() with no argument splits on any run of whitespace. split(" ")
# counted newline-separated words as one word and produced empty "words"
# for consecutive spaces.
print("Words" , len(full_text.split()))
# Label spelling fixed ("Charecters" -> "Characters").
print("Characters", len(full_text))

# Split the loaded pages into chunks (chunk_size=2000, chunk_overlap=400)
from langchain.text_splitter import RecursiveCharacterTextSplitter
text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=400)
chunks = text_splitter.split_documents(pages)
print(len(chunks))

# Embeddings and vector DB: embed the chunks with OpenAI embeddings and store
# them in a Chroma collection under the "coi_db2" directory
from langchain.embeddings import OpenAIEmbeddings,CohereEmbeddings
from langchain.vectorstores import Chroma
embeddings = OpenAIEmbeddings()
#embeddings = CohereEmbeddings(user_agent="langchain")
coi_db= Chroma.from_documents(chunks,
                             embeddings,
                             persist_directory="coi_db2"
                             )
coi_db.persist()

# Retrieval Q&A chain: RetrievalQA over the retriever of `coi_db`

from langchain.chains import RetrievalQA
from langchain.llms import OpenAI , Cohere

llm=OpenAI(temperature=0)
#llm = Cohere(temperature = 0)
Q_AChain=RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",# It takes a list of documents, inserts them all into a prompt
    retriever=coi_db.as_retriever()
)

In [None]:
query = """
According to constitution of India what are the fundamental rights of citizens of India ?
"""
# `Chain.invoke` replaces the deprecated direct call `Q_AChain({...})`.
docs = Q_AChain.invoke({"query": query})
print(docs["result"])

## RAG Validation Process

In [None]:
# LLM used to answer the validation questions (temperature=0)
llm=OpenAI(temperature=0)

# Rebuild the RetrievalQA chain over `coi_db`, this time reading the user
# question from the "question" key of each input dict.
Q_AChain=RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",# It takes a list of documents, inserts them all into a prompt
    retriever=coi_db.as_retriever(),
    input_key="question" # Add this extra parameter to access the user questions using input_key
)

## Get Validation data

In [None]:
#!wget https://raw.githubusercontent.com/giridhar276/Datasets/master/COI/COI_Q_A.csv

In [None]:
import requests

url = "https://raw.githubusercontent.com/giridhar276/Datasets/master/COI/COI_Q_A.csv"
# NOTE: the name `pdf_path` is kept unchanged for compatibility with the rest
# of the notebook, even though the downloaded file is a CSV.
pdf_path = "COI_Q_A.csv"

# A timeout keeps the cell from hanging indefinitely on a stalled connection.
response = requests.get(url, timeout=60)
# Fail loudly on an HTTP error instead of saving the error page as the CSV.
response.raise_for_status()
with open(pdf_path, "wb") as f:
    f.write(response.content)

print("Download complete:", pdf_path)

In [None]:
import pandas as pd
test_data = pd.read_csv("COI_Q_A.csv")
# A bare `test_data.head()` in the middle of a cell is not rendered; use
# display() so the preview actually appears.
display(test_data.head())

# Build the list of {"question", "answer"} dicts expected by the QA chain.
# Zipping the two columns avoids the slow row-by-row `iterrows()` loop.
test_qa_pairs = [
    {'question': question, 'answer': answer}
    for question, answer in zip(test_data['Question'], test_data['Answer'])
]
print(test_qa_pairs)

In [None]:
#Add one more Question and Answer
question_new = "Who is Narendra Modi?"
# Reference answer used as ground truth by the evaluation chain; the garbled
# party name ("Jana    ta") has been repaired.
answer_new = """
Narendra Modi is the current Prime Minister of India, serving since 2014.
He is a member of the Bharatiya Janata Party (BJP) and
previously served as the Chief Minister of Gujarat from 2001 to 2014.
"""
test_qa_pairs.append({'question': question_new, 'answer': answer_new})

## Get the Predictions

In [None]:
# Run the QA chain over every entry of test_qa_pairs and display the results
predictions = Q_AChain.apply(test_qa_pairs)
predictions

## Compare Actual Answers with predicted Answers

In [None]:
#The comparison is done using an LLM
from langchain.evaluation.qa import QAEvalChain

# LLM that performs the comparison (temperature=0)
llm=OpenAI(temperature=0)

# Evaluation chain built from that LLM
qa_eval_chain = QAEvalChain.from_llm(llm)

In [None]:
# Evaluate the predictions against the reference pairs. The question is read
# from the "question" key and the reference answer from the "answer" key.
eval_result = qa_eval_chain.evaluate(
    examples=test_qa_pairs,
    predictions=predictions,
    question_key="question",
    answer_key="answer",
)
print(eval_result)

## Calculate The Accuracy