# Tutorial on RAG using LangChain

Sam Witteveen, youtube channel

In [1]:
# Get Open AI key from .env file saved as OPENAI_API_KEY 
from dotenv import load_dotenv

load_dotenv()

True

In [2]:
import langchain, chromadb
langchain.__version__, chromadb.__version__
# 0.0.319 doesn't work with chromadb >= 0.4.16
# fixed: pip install -U langchain chromadb

('0.0.345', '0.4.18')

In [3]:
# pip install rank_bm25
from langchain.retrievers import BM25Retriever, EnsembleRetriever
from langchain.schema import Document

# pip install faiss-cpu 
from langchain.vectorstores import Chroma, FAISS

# pip install tiktoken
from langchain.embeddings.openai import OpenAIEmbeddings

## 1. Retrieval using both sparse and dense methods

In [4]:
# Docs are structured like this: 
# [
#     Document(
#         page_content="I like apples", 
#         metadata={"title": "Apple"}, 
#     ), ...
# ]

# Text list, not really a doc list
# Call the retriever with `from_texts(doc_list)`
doc_list = [
    "I like apples", 
    "I like oranges", 
    "Apples and oranges are both fruit", 
    "I like computers by Apple", 
    "I love fruit juice"
]

### Sparse retrieval using BM25

In [5]:
bm25_ret = BM25Retriever.from_texts(doc_list)
bm25_ret

BM25Retriever(vectorizer=<rank_bm25.BM25Okapi object at 0x7f411f3e13d0>, docs=[Document(page_content='I like apples'), Document(page_content='I like oranges'), Document(page_content='Apples and oranges are both fruit'), Document(page_content='I like computers by Apple'), Document(page_content='I love fruit juice')])

In [6]:
bm25_ret.k = 2
# bm25_ret.get_relevant_documents("I like apples")
bm25_ret.get_relevant_documents("apple")

[Document(page_content='I love fruit juice'),
 Document(page_content='I like computers by Apple')]

In [7]:
bm25_ret.dict

<bound method BaseModel.dict of BM25Retriever(vectorizer=<rank_bm25.BM25Okapi object at 0x7f411f3e13d0>, docs=[Document(page_content='I like apples'), Document(page_content='I like oranges'), Document(page_content='Apples and oranges are both fruit'), Document(page_content='I like computers by Apple'), Document(page_content='I love fruit juice')], k=2)>

### Dense vector retrieval aka Embeddings

In [8]:
embeddings = OpenAIEmbeddings()

In [9]:
faiss_store = FAISS.from_texts(doc_list, embeddings)

In [10]:
faiss_ret = faiss_store.as_retriever(search_kwargs={"k": 2})
faiss_ret.get_relevant_documents("apple")

[Document(page_content='I like apples'),
 Document(page_content='I like computers by Apple')]

### Ensemble Retrieval

In [11]:
# ?EnsembleRetriever

In [12]:
ensemble_ret = EnsembleRetriever(
    retrievers=[bm25_ret, faiss_ret], 
    # weights = [0.5, 0.5]
)
ensemble_ret

EnsembleRetriever(retrievers=[BM25Retriever(vectorizer=<rank_bm25.BM25Okapi object at 0x7f411f3e13d0>, docs=[Document(page_content='I like apples'), Document(page_content='I like oranges'), Document(page_content='Apples and oranges are both fruit'), Document(page_content='I like computers by Apple'), Document(page_content='I love fruit juice')], k=2), VectorStoreRetriever(tags=['FAISS', 'OpenAIEmbeddings'], vectorstore=<langchain.vectorstores.faiss.FAISS object at 0x7f411eaaea10>, search_kwargs={'k': 2})], weights=[0.5, 0.5])

In [13]:
ensemble_ret
ensemble_ret.get_relevant_documents("apple")

[Document(page_content='I like computers by Apple'),
 Document(page_content='I love fruit juice'),
 Document(page_content='I like apples')]

## 2. Self Querying Retrieval: Use LLM to break a query to a metadata filter + semantic search

In [14]:
# pip install lark

In [15]:
from langchain.llms import OpenAI
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain.chains.query_constructor.base import AttributeInfo

In [16]:
# List of documents describing different wines and their characteristics
docs = [
    Document(
        page_content="Complex, layered, rich red with dark fruit flavors",
        metadata={"name":"Opus One", "year": 2018, "rating": 96, "grape": "Cabernet Sauvignon", "color":"red", "country":"USA"},
    ),
    Document(
        page_content="Luxurious, sweet wine with flavors of honey, apricot, and peach",
        metadata={"name":"Château d'Yquem", "year": 2015, "rating": 98, "grape": "Sémillon", "color":"white", "country":"France"},
    ),
    Document(
        page_content="Full-bodied red with notes of black fruit and spice",
        metadata={"name":"Penfolds Grange", "year": 2017, "rating": 97, "grape": "Shiraz", "color":"red", "country":"Australia"},
    ),
    Document(
        page_content="Elegant, balanced red with herbal and berry nuances",
        metadata={"name":"Sassicaia", "year": 2016, "rating": 95, "grape": "Cabernet Franc", "color":"red", "country":"Italy"},
    ),
    Document(
        page_content="Highly sought-after Pinot Noir with red fruit and earthy notes",
        metadata={"name":"Domaine de la Romanée-Conti", "year": 2018, "rating": 100, "grape": "Pinot Noir", "color":"red", "country":"France"},
    ),
    Document(
        page_content="Crisp white with tropical fruit and citrus flavors",
        metadata={"name":"Cloudy Bay", "year": 2021, "rating": 92, "grape": "Sauvignon Blanc", "color":"white", "country":"New Zealand"},
    ),
    Document(
        page_content="Rich, complex Champagne with notes of brioche and citrus",
        metadata={"name":"Krug Grande Cuvée", "year": 2010, "rating": 93, "grape": "Chardonnay blend", "color":"sparkling", "country":"New Zealand"},
    ),
    Document(
        page_content="Intense, dark fruit flavors with hints of chocolate",
        metadata={"name":"Caymus Special Selection", "year": 2018, "rating": 96, "grape": "Cabernet Sauvignon", "color":"red", "country":"USA"},
    ),
    Document(
        page_content="Exotic, aromatic white with stone fruit and floral notes",
        metadata={"name":"Jermann Vintage Tunina", "year": 2020, "rating": 91, "grape": "Sauvignon Blanc blend", "color":"white", "country":"Italy"},
    ),
]

In [17]:
# Dense retrieval: ChromaDB as the vector store

embeddings = OpenAIEmbeddings()
chroma_store = Chroma.from_documents(docs, embeddings)

### Metadata schema/information

In [18]:
metadata_field_info = [
    AttributeInfo(
        name="grape",
        description="The grape used to make the wine",
        type="string or list[string]",
    ),
    AttributeInfo(
        name="name",
        description="The name of the wine",
        type="string or list[string]",
    ),
    AttributeInfo(
        name="color",
        description="The color of the wine",
        type="string or list[string]",
    ),
    AttributeInfo(
        name="year",
        description="The year the wine was released",
        type="integer",
    ),
    AttributeInfo(
        name="country",
        description="The name of the country the wine comes from",
        type="string",
    ),
    AttributeInfo(
        name="rating", description="The Robert Parker rating for the wine 0-100", type="integer" #float
    ),
]

document_content_description = "Brief description of the wine"

In [19]:
llm = OpenAI(temperature=0)
# llm

In [20]:
?SelfQueryRetriever.from_llm

[0;31mSignature:[0m
[0mSelfQueryRetriever[0m[0;34m.[0m[0mfrom_llm[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mllm[0m[0;34m:[0m [0mlangchain_core[0m[0;34m.[0m[0mlanguage_models[0m[0;34m.[0m[0mbase[0m[0;34m.[0m[0mBaseLanguageModel[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mvectorstore[0m[0;34m:[0m [0mlangchain_core[0m[0;34m.[0m[0mvectorstores[0m[0;34m.[0m[0mVectorStore[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mdocument_contents[0m[0;34m:[0m [0mstr[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmetadata_field_info[0m[0;34m:[0m [0mSequence[0m[0;34m[[0m[0mUnion[0m[0;34m[[0m[0mlangchain[0m[0;34m.[0m[0mchains[0m[0;34m.[0m[0mquery_constructor[0m[0;34m.[0m[0mschema[0m[0;34m.[0m[0mAttributeInfo[0m[0;34m,[0m [0mdict[0m[0;34m][0m[0;34m][0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mstructured_query_translator[0m[0;34m:[0m [0mOptional[0m[0;34m[[0m[0mlangchain[0m[0;34m.[0m[0mchains[0m[0;34m.[0m[0mquery

In [21]:
retriever = SelfQueryRetriever.from_llm(
    llm=llm,
    vectorstore=chroma_store, 
    document_contents=document_content_description,
    metadata_field_info=metadata_field_info, 
    enable_limit=True, # limit the number of documents returned
    verbose=True
)

In [22]:
# `enable_limit` restricts outpus to a certain number of documents
retriever.invoke("Show me two red wines with fruity falvors?") 

[Document(page_content='Intense, dark fruit flavors with hints of chocolate', metadata={'color': 'red', 'country': 'USA', 'grape': 'Cabernet Sauvignon', 'name': 'Caymus Special Selection', 'rating': 96, 'year': 2018}),
 Document(page_content='Complex, layered, rich red with dark fruit flavors', metadata={'color': 'red', 'country': 'USA', 'grape': 'Cabernet Sauvignon', 'name': 'Opus One', 'rating': 96, 'year': 2018})]

In [23]:
# This example only specifies a relevant query
retriever.get_relevant_documents("What are some red wines", verbose=True)

[Document(page_content='Elegant, balanced red with herbal and berry nuances', metadata={'color': 'red', 'country': 'Italy', 'grape': 'Cabernet Franc', 'name': 'Sassicaia', 'rating': 95, 'year': 2016}),
 Document(page_content='Complex, layered, rich red with dark fruit flavors', metadata={'color': 'red', 'country': 'USA', 'grape': 'Cabernet Sauvignon', 'name': 'Opus One', 'rating': 96, 'year': 2018}),
 Document(page_content='Highly sought-after Pinot Noir with red fruit and earthy notes', metadata={'color': 'red', 'country': 'France', 'grape': 'Pinot Noir', 'name': 'Domaine de la Romanée-Conti', 'rating': 100, 'year': 2018}),
 Document(page_content='Intense, dark fruit flavors with hints of chocolate', metadata={'color': 'red', 'country': 'USA', 'grape': 'Cabernet Sauvignon', 'name': 'Caymus Special Selection', 'rating': 96, 'year': 2018})]

In [24]:
# Semantic search
retriever.get_relevant_documents("I want a wine that has fruity nodes")

[Document(page_content='Crisp white with tropical fruit and citrus flavors', metadata={'color': 'white', 'country': 'New Zealand', 'grape': 'Sauvignon Blanc', 'name': 'Cloudy Bay', 'rating': 92, 'year': 2021}),
 Document(page_content='Intense, dark fruit flavors with hints of chocolate', metadata={'color': 'red', 'country': 'USA', 'grape': 'Cabernet Sauvignon', 'name': 'Caymus Special Selection', 'rating': 96, 'year': 2018}),
 Document(page_content='Luxurious, sweet wine with flavors of honey, apricot, and peach', metadata={'color': 'white', 'country': 'France', 'grape': 'Sémillon', 'name': "Château d'Yquem", 'rating': 98, 'year': 2015}),
 Document(page_content='Complex, layered, rich red with dark fruit flavors', metadata={'color': 'red', 'country': 'USA', 'grape': 'Cabernet Sauvignon', 'name': 'Opus One', 'rating': 96, 'year': 2018})]

In [25]:
# This example specifies a query and a filter
retriever.get_relevant_documents("I want a wine that has fruity nodes and has a rating above 97")

[Document(page_content='Luxurious, sweet wine with flavors of honey, apricot, and peach', metadata={'color': 'white', 'country': 'France', 'grape': 'Sémillon', 'name': "Château d'Yquem", 'rating': 98, 'year': 2015}),
 Document(page_content='Highly sought-after Pinot Noir with red fruit and earthy notes', metadata={'color': 'red', 'country': 'France', 'grape': 'Pinot Noir', 'name': 'Domaine de la Romanée-Conti', 'rating': 100, 'year': 2018})]