In [None]:
import os

import tiktoken
from langchain_chroma import Chroma
from langchain_community.document_loaders import PyPDFLoader, WebBaseLoader
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import ChatOpenAI
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import CharacterTextSplitter
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [None]:
# Embedding model used to vectorise every chunk before it is stored in Chroma.
# NOTE(review): no credentials are passed here; presumably they are picked up
# from the environment (e.g. OPENAI_API_KEY) — confirm before running.
embeddings = OpenAIEmbeddings(model="text-embedding-3-large")

# Load the local PDF that sits next to the notebook's working directory.
file_path = os.path.join(os.getcwd(), "name.pdf")
pdf_loader = PyPDFLoader(file_path)
pdf_documents = pdf_loader.load()

# Split the PDF into chunks of at most ~1000 characters.
# A recursive splitter is used instead of CharacterTextSplitter: the latter only
# cuts on a single separator (default "\n\n"), so text without blank lines —
# typical for extracted PDF pages — would stay in one oversized chunk and
# chunk_size would effectively be ignored. The recursive splitter falls back to
# finer separators until each chunk fits.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
split_pdf_documents = text_splitter.split_documents(pdf_documents)

# Web pages to ingest alongside the PDF.
# NOTE(review): the single entry below looks like a placeholder — replace it
# with real URLs before running.
urls = ["some Urls"]

# Build a loader for the listed pages and pull their content in as documents.
web_loader = WebBaseLoader(web_path=urls)
web_documents = web_loader.load()

# Chunk the web documents with the same splitter that was used for the PDF,
# so both sources end up with comparable chunk sizes.
split_web_documents = text_splitter.split_documents(web_documents)

# Merge the PDF chunks and the web chunks into a single corpus.
all_documents = split_pdf_documents + split_web_documents

# Embed every chunk and index it in a Chroma vector store.
# NOTE(review): no persist directory or collection name is given, so the index
# presumably lives only in this kernel's memory; re-running this cell without a
# kernel restart may add the same chunks a second time — confirm.
db = Chroma.from_documents(documents=all_documents, embedding=embeddings)
print("All documents indexed in Chroma successfully")

# Retriever over the combined sources: plain similarity search that returns
# the 5 closest chunks for each query.
retriever = db.as_retriever(search_type="similarity", search_kwargs={"k": 5})

# Prompt text for the answering step. It exposes two placeholders:
# {question} (the user's query) and {context} (the retrieved material).
RAG_PROMPT_TEMPLATE = """
You are an assistant for question-answering tasks.Use the following pieces of retrieved context to answer the question.
if you don't know the answer, just say that you don't know.Use three sentences maximum and keep the answer concise.
Question: {question}
Context: {context} 
Answer:                                                                                                                                                                       
                                          
"""

# Wrap the raw text in a chat prompt template so it can be piped into the model.
prompt = ChatPromptTemplate.from_template(RAG_PROMPT_TEMPLATE)

# Chat model that writes the final answer (temperature set to 0).
model = ChatOpenAI(model="gpt-4o-mini", temperature=0)


def format_docs(docs):
    """Join the text of the retrieved documents into one context string.

    Args:
        docs: Iterable of documents returned by the retriever; each one is
            expected to expose its text as ``page_content``.

    Returns:
        The documents' text, in retrieval order, separated by blank lines.
    """
    return "\n\n".join(doc.page_content for doc in docs)


# RAG chain: the incoming question is sent to the retriever and also passed
# through unchanged; both values fill the prompt, which is answered by the
# model and reduced to a plain string.
# The retriever output is piped through format_docs so the prompt receives the
# chunks' text rather than the repr of a list of Document objects (which would
# also leak metadata and escape sequences into the context).
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | model
    | StrOutputParser()
)



In [None]:
# Ask a question and stream the generated response as it is produced.
question = "Who are the main author of FAISSE?"
for piece in rag_chain.stream(question):
    # Emit each streamed piece right away, without inserting newlines, so the
    # answer appears as one continuous text.
    print(piece, end="", flush=True)