In [None]:
! pip install langchain_community tiktoken langchain-openai langchainhub chromadb langchain python-dotenv bs4 sentence-transformers

In [11]:
import os

from dotenv import load_dotenv

load_dotenv()

True

In [5]:
os.environ["USER_AGENT"] = "Test App/1.0"

In [12]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_openai import OpenAIEmbeddings
from langchain_openai import ChatOpenAI
from langchain import hub
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
import bs4

In [11]:
loader = WebBaseLoader(
    web_paths=("https://lilianweng.github.io/posts/2023-06-23-agent/",),
    bs_kwargs=dict(
        parse_only=bs4.SoupStrainer(
            class_=("post-content", "post-title", "post-header")
        )
    ),
)

docs = loader.load()

In [None]:
print(docs)

In [15]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)

In [17]:
splits = text_splitter.split_documents(docs)
print(len(splits))

66


In [1]:
texts = ["Capital of India", "New Delhi is the capital of India"]

In [None]:
hf_embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

In [14]:
embed_results = hf_embeddings.embed_documents(texts)

In [33]:
embed_results

[[0.06447591632604599,
  0.004543404560536146,
  -0.05524414777755737,
  0.05434794723987579,
  -0.02648809552192688,
  -0.06024197116494179,
  0.08294324576854706,
  0.004911324940621853,
  -0.042170796543359756,
  -0.037091679871082306,
  -0.011078628711402416,
  -0.14803680777549744,
  0.04406586289405823,
  -0.057990409433841705,
  0.04081453010439873,
  -0.060225874185562134,
  0.006882606539875269,
  0.002446805825456977,
  0.05603128671646118,
  -0.041738156229257584,
  -0.03199230507016182,
  0.02955857664346695,
  -0.0051704030483961105,
  -0.05991670489311218,
  0.04492837190628052,
  0.044982463121414185,
  0.032191116362810135,
  -0.03709763288497925,
  -0.029527561739087105,
  -0.021014010533690453,
  0.08979376405477524,
  -0.06354320794343948,
  -0.006944118067622185,
  -0.01180096436291933,
  -0.055113185197114944,
  0.002319923136383295,
  -0.06585197150707245,
  0.06808938086032867,
  0.15499247610569,
  -0.07143879681825638,
  0.008063670247793198,
  0.03830718994140

In [16]:
vector_db = Chroma.from_texts(texts, hf_embeddings)

In [31]:
print(vector_db._collection.count())

2


In [26]:
search_results = vector_db.similarity_search("ind")

for i in search_results:
    print(i.page_content)

Number of requested results 4 is greater than number of elements in index 2, updating n_results = 2


Capital of India
New Delhi is the capital of India


In [23]:
vectorstore = Chroma.from_documents(documents=splits, embedding=OpenAIEmbeddings())
vectorstore2 = Chroma.from_texts()

In [24]:
retriever = vectorstore.as_retriever()

In [26]:
prompt = hub.pull("rlm/rag-prompt")

In [40]:
llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0)

In [34]:
#post processing
def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

In [44]:
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

In [None]:
rag_chain.invoke("What is Task")