In [None]:
!pip install langchain langchain_community faiss-cpu # langchain_chroma

In [None]:
!pip install -qU langchain-openai

In [1]:
import getpass
import os

# Switch on tracing for the LangChain calls made from this kernel.
os.environ["LANGCHAIN_TRACING_V2"] = "true"

# Prompt for each secret only when it is not already set, so re-running this cell
# (or running in a pre-configured environment) neither asks again nor overwrites
# a key that is already in place. Secrets are never written into the notebook.
for _env_name, _prompt_text in (
    ("LANGCHAIN_API_KEY", 'langsmith key:'),
    ("OPENAI_API_KEY", 'openai api key:'),
):
    if not os.environ.get(_env_name):
        os.environ[_env_name] = getpass.getpass(_prompt_text)

In [2]:
from langchain_openai import ChatOpenAI

# Chat model that the RAG chain later calls to generate the answer.
CHAT_MODEL_NAME = "gpt-4o-mini"
llm = ChatOpenAI(model=CHAT_MODEL_NAME)

In [None]:
from langchain import hub

# Pull the shared RAG prompt template from the hub.
prompt = hub.pull("rlm/rag-prompt")

# Render the template with placeholder values to preview the text the model will receive.
filler_inputs = {"context": "filler context", "question": "filler question"}
rendered_prompt = prompt.invoke(filler_inputs)
example_messages = rendered_prompt.to_messages()

print(example_messages[0].content)

In [None]:
import bs4
from langchain import hub
from langchain_community.document_loaders import WebBaseLoader
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [None]:
# Load the blog post from the web, parsing only the elements whose CSS class
# is one of the post title / header / content classes.
post_only = bs4.SoupStrainer(class_=("post-content", "post-title", "post-header"))
loader = WebBaseLoader(
    web_paths=("https://lilianweng.github.io/posts/2023-06-23-agent/",),
    bs_kwargs={"parse_only": post_only},
)
docs = loader.load()

# Size of the first loaded document's text, as a quick sanity check.
print(len(docs[0].page_content))

In [None]:
from langchain_community.document_loaders import BSHTMLLoader

# Alternative source: parse a locally saved copy of the page with the same class filter.
# NOTE: this rebinds `docs`, replacing whatever the web loader cell produced.
file_path = 'data/webpage.html'
post_only_local = bs4.SoupStrainer(class_=("post-content", "post-title", "post-header"))
loader = BSHTMLLoader(
    file_path,
    open_encoding='utf-8',
    bs_kwargs={"parse_only": post_only_local},
)
docs = loader.load()
print(docs)

In [None]:
# Split the loaded documents into overlapping chunks for indexing.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    add_start_index=True,
)
splits = text_splitter.split_documents(docs)

# Chunk count, length of the first chunk, and the metadata of one sample chunk.
# Indexing splits[10] assumes at least 11 chunks were produced.
print(len(splits), len(splits[0].page_content), splits[10].metadata)

In [None]:
import faiss
from uuid import uuid4
from langchain_community.vectorstores import FAISS
from langchain_community.docstore.in_memory import InMemoryDocstore

# One embeddings client is used both to size the index and to embed the documents,
# so the index dimension cannot drift from the model that fills the store.
embeddings = OpenAIEmbeddings()

# IndexFlatL2 needs the vector dimensionality up front; probe it with a throwaway query.
embedding_dim = len(embeddings.embed_query("hello world"))
index = faiss.IndexFlatL2(embedding_dim)

# Start from an empty in-memory docstore and an empty index-to-id mapping.
vector_store = FAISS(
    embedding_function=embeddings,
    index=index,
    docstore=InMemoryDocstore(),
    index_to_docstore_id={},
)

# Assign an explicit random id to every chunk as it is added to the store.
uuids = [str(uuid4()) for _ in splits]
vector_store.add_documents(documents=splits, ids=uuids)

#### Uncomment the cell below to use Chroma instead of the FAISS vector store
Note: installing Chroma on Windows can fail with an error.

In [None]:
# from langchain_chroma import Chroma
# vector_store = Chroma.from_documents(documents=splits, embedding=OpenAIEmbeddings())

---

In [None]:
# Retrieve and generate using the relevant snippets of the blog.
# Wrap the vector store as a similarity retriever configured with k=6.
retriever = vector_store.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 6},
)

# Try one question and show how many documents came back plus the first one's text.
sample_question = "What are the approaches to Task Decomposition?"
retrieved_docs = retriever.invoke(sample_question)
print(len(retrieved_docs), retrieved_docs[0].page_content)

In [None]:
# Pull the RAG prompt template from the hub; the same template is also pulled in the
# earlier preview cell, so this cell does not depend on that one having been run.
prompt = hub.pull("rlm/rag-prompt")

def format_docs(docs):
    """Join the ``page_content`` of every document into one string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string attribute.

    Returns:
        The contents in input order, separated by a blank line (``"\\n\\n"``);
        an empty string when ``docs`` is empty.
    """
    contents = [doc.page_content for doc in docs]
    separator = "\n\n"
    return separator.join(contents)

# The incoming question feeds two inputs of the prompt: it is passed through unchanged
# as "question", and it is sent to the retriever whose results are joined into "context".
chain_inputs = {
    "context": retriever | format_docs,
    "question": RunnablePassthrough(),
}

# Prompt -> chat model -> plain string output.
rag_chain = chain_inputs | prompt | llm | StrOutputParser()

question = "What is Task Decomposition?"

# Streaming alternative:
# for chunk in rag_chain.stream(question):
#     print(chunk, end="", flush=True)
rag_chain.invoke(question)