In [1]:
!pip install --quiet langchain langchain_community chromadb langchain_openai langgraph

In [2]:
import os
from google.colab import userdata
import bs4
from langchain import hub
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.vectorstores import Chroma
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain.prompts import ChatPromptTemplate
from langchain.load import loads, dumps

# Export the credentials read through google.colab's `userdata` as environment
# variables, and set the LANGCHAIN_TRACING_V2 flag to 'true' alongside them.
os.environ.update({
    'LANGSMITH_API_KEY': userdata.get('LANG_TOKEN'),
    'LANGCHAIN_TRACING_V2': 'true',
    'OPENAI_API_KEY': userdata.get('OAI_TOKEN'),
})



In [3]:
# Source corpus: the English Wikipedia article on Belgium, fetched over HTTP.
loader = WebBaseLoader('https://en.wikipedia.org/wiki/Belgium')

docs = loader.load()

# Split the loaded page into overlapping chunks (chunk_size=1000,
# chunk_overlap=200) so each piece can be embedded and retrieved on its own.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
texts = text_splitter.split_documents(docs)

# Embed every chunk with OpenAIEmbeddings and index it in a Chroma vector store.
# NOTE(review): re-running this cell in the same kernel may add the same chunks
# to the store again, producing duplicate hits -- confirm, and restart the
# kernel before re-running if so.
vectorstore = Chroma.from_documents(documents=texts, embedding=OpenAIEmbeddings())

# Retriever view of the store, using its default search settings.
retriever = vectorstore.as_retriever()

# Prompt pulled from the LangChain hub under the id 'rlm/rag-prompt'.
prompt = hub.pull('rlm/rag-prompt')

# Chat model used by the chains below; temperature=0 to minimise sampling
# randomness.
llm = ChatOpenAI(model_name='gpt-3.5-turbo', temperature=0)

def format_docs(docs):
  """Join the ``page_content`` of every document, separated by a blank line."""
  contents = [item.page_content for item in docs]
  separator = '\n\n'
  return separator.join(contents)

# Baseline RAG chain: the incoming question is sent to the retriever (whose
# hits are flattened to text by format_docs) and is also passed through
# unchanged as 'question'; both fill the prompt, which goes to the chat model,
# and the reply is parsed to a plain string.
chain = (
    {'context': retriever | format_docs, 'question': RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

question = "What is Belgium most famous for?"
# Last expression of the cell, so the answer is shown as the cell output.
chain.invoke(question)

'Belgium is most famous for its cuisine, including dishes like moules-frites, beer, chocolate, waffles, and French fries. The country is also known for its highly ranked restaurants and variety of beers, including Trappist beer brewed by monks. In terms of sports, association football is the most popular, with the national team achieving top rankings and success in international competitions.'

In [6]:
# Prompt asking the model to rewrite the user question into five alternative
# phrasings, one per line, to broaden what the similarity search can match.
template = """You are an AI language model assistant. Your task is to generate five
different versions of the given user question to retrieve relevant documents from a vector
database. By generating multiple perspectives on the user question, your goal is to help
the user overcome some of the limitations of the distance-based similarity search.
Provide these alternative questions separated by newlines. Original question: {question}"""

prompt_template = ChatPromptTemplate.from_template(template)


def split_queries(text):
  """Turn the model's newline-separated reply into a list of query strings.

  Args:
    text: Raw string reply containing one question per line.

  Returns:
    The non-blank lines of ``text`` with surrounding whitespace removed, in
    their original order.
  """
  # Models often separate the questions with blank lines; dropping those keeps
  # empty strings from being sent to the retriever as queries.
  return [line.strip() for line in text.splitlines() if line.strip()]


# Question -> prompt -> chat model -> string -> list of alternative queries.
# Reuses the notebook's configured `llm` so the model choice lives in one place.
gen_queries = (
    prompt_template
    | llm
    | StrOutputParser()
    | split_queries
)

def get_unique_docs(docs):
  """Flatten per-query retrieval results and drop duplicate documents.

  Args:
    docs: Iterable of iterables of documents, one inner iterable per query.

  Returns:
    A flat list holding the first occurrence of each distinct document, in
    the order the documents were first encountered.
  """
  first_seen = {}
  for sublist in docs:
    for doc in sublist:
      # The serialized form is a hashable key for "same document". A dict keeps
      # insertion order, so the result is deterministic, unlike a set.
      first_seen.setdefault(dumps(doc), doc)
  return list(first_seen.values())

# `docs` from the loader is a flat list of documents, while get_unique_docs
# expects a list of result lists, so it is wrapped in an outer list here.
unique_docs = get_unique_docs([docs])

# Multi-query retrieval: generate the alternative queries, apply the retriever
# to each of them via retriever.map(), then merge the per-query hits with
# get_unique_docs.
retrieval_chain = gen_queries | retriever.map() | get_unique_docs

# Answer prompt for the multi-query chain: retrieved context plus the question.
template = """Answer the following question based on this context:

{context}

Question: {question}
"""

prompt = ChatPromptTemplate.from_template(template)

# Multi-query RAG chain. The retrieved documents are rendered to plain text
# with format_docs (as in the baseline chain) so that {context} receives the
# page contents rather than the repr of a list of Document objects.
chain = (
    {'context': retrieval_chain | format_docs, 'question': RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

# Last expression of the cell, so the answer is shown as the cell output.
chain.invoke(question)

'Belgium is most famous for its beer, chocolate, waffles, and French fries.'