# Astra DB and LangChain, quickstart / integration guide

In [1]:
! pip install --quiet \
    "langchain>=0.0.336" \
    "astrapy>=0.6.0" \
    "datasets>=2.14.6" \
    "openai>=1.3.0" \
    "pypdf>=3.17.0" \
    "tiktoken>=0.5.1"

In [2]:
import os
from getpass import getpass

from datasets import load_dataset

from langchain.vectorstores import AstraDB
from langchain.schema import Document
from langchain.embeddings import OpenAIEmbeddings

## Init

Example values:
- API Endpoint: `https://01234567-89ab-cdef-0123-456789abcdef-us-east1.apps.astra.datastax.com`
- Token: `AstraCS:6gBhNmsk135....` (it must have a role of at least "Database Administrator")
- _Keyspace. Optional, if provided:_ `my_keyspace`
- OpenAI API key: `sk-4fQ3F...`

In [3]:
# Astra DB connection settings: each one is requested interactively only when
# the environment does not already provide it.

# API endpoint (not a secret, so it is read with a plain prompt).
if os.environ.get("ASTRA_DB_API_ENDPOINT") is None:
    os.environ["ASTRA_DB_API_ENDPOINT"] = input("ASTRA_DB_API_ENDPOINT = ")

# Application token (secret: getpass does not echo what is typed).
if os.environ.get("ASTRA_DB_APPLICATION_TOKEN") is None:
    os.environ["ASTRA_DB_APPLICATION_TOKEN"] = getpass("ASTRA_DB_APPLICATION_TOKEN = ")

# Keyspace is optional: an empty answer leaves the variable unset.
if os.environ.get("ASTRA_DB_KEYSPACE") is None:
    keyspace_answer = input("(Optional) ASTRA_DB_KEYSPACE = ")
    if keyspace_answer != "":
        os.environ["ASTRA_DB_KEYSPACE"] = keyspace_answer

In [4]:
# OpenAI API key: asked for (without echo) only when not already in the environment.
if os.environ.get("OPENAI_API_KEY") is None:
    os.environ["OPENAI_API_KEY"] = getpass("OPENAI_API_KEY = ")

In [5]:
# Embedding model handed to the vector store.
embe = OpenAIEmbeddings()

# Connection settings for the store, all taken from the environment.
astra_connection = {
    "collection_name": "astra_vector_demo",
    "token": os.environ["ASTRA_DB_APPLICATION_TOKEN"],
    "api_endpoint": os.environ["ASTRA_DB_API_ENDPOINT"],
    # Usually no need for this: .get() yields None when no keyspace was provided.
    "namespace": os.environ.get("ASTRA_DB_KEYSPACE"),
}
vstore = AstraDB(embedding=embe, **astra_connection)

## Load a small dataset

In [6]:
# Load the quotes dataset and keep its "train" split.
quotes_by_split = load_dataset("datastax/philosopher-quotes")
philo_dataset = quotes_by_split["train"]

# Show one entry to illustrate the fields (author, quote, tags).
print("An example entry:")
print(philo_dataset[16])

An example entry:
{'author': 'aristotle', 'quote': 'Love well, be loved and do something of value.', 'tags': 'love;ethics'}


## Add dataset documents to the vector store

In [7]:
# Turn every dataset entry into a Document: the quote is the content, while the
# author and each of the ";"-separated tags (stored as tag -> "y") are metadata.
docs = []
for entry in philo_dataset:
    tags = entry["tags"].split(";") if entry["tags"] else []
    metadata = {"author": entry["author"], **{tag: "y" for tag in tags}}
    docs.append(Document(page_content=entry["quote"], metadata=metadata))

# Store all documents; the call returns the IDs of the inserted documents.
inserted_ids = vstore.add_documents(docs)
print(f"\nInserted {len(inserted_ids)} documents.")


Inserted 450 documents.


### Alternatively, use `add_texts`

This has the advantage that you can specify the IDs, so that you don't risk duplicating the entries if you run the insertion multiple times.

In [8]:
# One record per quote: (document ID, text, metadata).
extra_quotes = [
    ("desc_01", "I think, therefore I am.", {"author": "descartes", "knowledge": "y"}),
    ("huss_xy", "To the things themselves!", {"author": "husserl", "knowledge": "y"}),
]
# Unzip the records into the three parallel lists expected by add_texts.
ids, texts, metadatas = map(list, zip(*extra_quotes))

inserted_ids_2 = vstore.add_texts(texts=texts, metadatas=metadatas, ids=ids)
print(f"\nInserted {len(inserted_ids_2)} documents.")


Inserted 2 documents.


## ANN, simple

In [9]:
# Plain similarity search: ask for the 3 closest documents to the query.
query = "Our life is what we make of it"
results = vstore.similarity_search(query, k=3)
for hit in results:
    print(f"* {hit.page_content} [{hit.metadata}]")

* We are what we are because we have been what we have been. [{'author': 'freud', 'history': 'y'}]
* We are what we are because we have been what we have been. [{'author': 'freud', 'history': 'y'}]
* We become what we contemplate. [{'author': 'plato', 'knowledge': 'y', 'ethics': 'y'}]


### Similarity values

In [10]:
# Similarity search that also returns a score: each result is a (document, score) pair.
results = vstore.similarity_search_with_score("Our life is what we make of it", k=3)
for res, score in results:
    # ".6f" states explicitly the six decimals that the former "3f" spec produced
    # ("3f" only set a minimum width of 3 and left the precision at its default).
    print(f"* [SIM={score:.6f}] {res.page_content} [{res.metadata}]")

* [SIM=0.934130] We are what we are because we have been what we have been. [{'author': 'freud', 'history': 'y'}]
* [SIM=0.934110] We are what we are because we have been what we have been. [{'author': 'freud', 'history': 'y'}]
* [SIM=0.932172] We become what we contemplate. [{'author': 'plato', 'knowledge': 'y', 'ethics': 'y'}]


## ANN, with metadata

In [11]:
# Similarity search restricted by a metadata filter on the "author" field.
author_filter = {"author": "aristotle"}
results = vstore.similarity_search("Our life is what we make of it", k=3, filter=author_filter)
for hit in results:
    print(f"* {hit.page_content} [{hit.metadata}]")

* The quality of life is determined by its activities. [{'author': 'aristotle'}]
* The quality of life is determined by its activities. [{'author': 'aristotle'}]
* You are what you repeatedly do [{'author': 'aristotle'}]


## MMR search

In [12]:
# MMR (max marginal relevance) search, using the same query and metadata filter
# as the plain similarity search.
author_filter = {"author": "aristotle"}
results = vstore.max_marginal_relevance_search(
    "Our life is what we make of it", k=3, filter=author_filter
)
for hit in results:
    print(f"* {hit.page_content} [{hit.metadata}]")

* The quality of life is determined by its activities. [{'author': 'aristotle'}]
* You are what you repeatedly do [{'author': 'aristotle'}]
* The man who is truly good and wise will bear with dignity whatever fortune sends, and will always make the best of his circumstances. [{'author': 'aristotle', 'knowledge': 'y', 'ethics': 'y'}]


## Deleting documents

### By document ID

In [13]:
# Delete the first three documents inserted from the dataset, by their IDs.
first_batch_ids = inserted_ids[:3]
delete_1 = vstore.delete(first_batch_ids)
print(f"all_succeed={delete_1}")  # True, all documents deleted

all_succeed=True


In [14]:
# This slice overlaps the previous deletion: the ID at index 2 was already removed.
overlapping_ids = inserted_ids[2:5]
delete_2 = vstore.delete(overlapping_ids)
print(f"some_succeeds={delete_2}")  # True, though some IDs were gone already

some_succeeds=True


### Retrieve and then delete

Sometimes you do not have the IDs, but you may still want to run a search and then delete the documents it returns:

In [15]:
# Search for the two closest matches, collecting their IDs as they are shown.
# Each result is a (document, score, document ID) triple.
ids_to_delete = []
for res_doc, res_score, res_id in vstore.similarity_search_with_score_id(
    "Philosophy has no goals",
    k=2,
):
    # ".6f" states explicitly the six decimals that the former "3f" spec produced
    # ("3f" only set a minimum width of 3 and left the precision at its default).
    print(f"* [SIM={res_score:.6f}] {res_doc.page_content} [{res_doc.metadata}]")
    ids_to_delete.append(res_id)

# Delete the documents that the search returned.
print(f"Deleting IDs = {ids_to_delete} ...")
success = vstore.delete(ids_to_delete)
print(f"Deletion succeeded = {success}")

* [SIM=0.920211] Philosophy can make people sick. [{'author': 'aristotle', 'politics': 'y'}]
* [SIM=0.920160] Philosophy can make people sick. [{'author': 'aristotle', 'politics': 'y'}]
Deleting IDs = ['43', 'a81bc61456b049ee958ee2c092a05055'] ...
Deletion succeeded = True


Now try the same search again:

In [16]:
# Repeat the search that fed the deletion; the returned IDs are not needed here.
for res_doc, res_score, res_id in vstore.similarity_search_with_score_id(
    "Philosophy has no goals",
    k=2,
):
    # ".6f" states explicitly the six decimals that the former "3f" spec produced
    # ("3f" only set a minimum width of 3 and left the precision at its default).
    print(f"* [SIM={res_score:.6f}] {res_doc.page_content} [{res_doc.metadata}]")

* [SIM=0.920148] For what purpose humanity is there should not even concern us: why you are here, that you should ask yourself: and if you have no ready answer, then set for yourself goals, high and noble goals, and perish in pursuit of them! [{'author': 'nietzsche', 'ethics': 'y', 'knowledge': 'y'}]
* [SIM=0.918522] Philosophy is by its nature something esoteric, neither made for the mob nor capable of being prepared for the mob. [{'author': 'hegel'}]


### Delete the **whole** store

> _Warning: use with caution. Data loss!_

In [17]:
# Empty the vector store: every document it holds is removed (data loss!).
vstore.clear()

## A mini-RAG

The store is now empty. Let us re-populate it, this time by loading from a (locally available) PDF file.

_(The file is an abridged version of a public document found at [this link](https://commons.bellevuecollege.edu/wp-content/uploads/sites/125/2017/04/Intro-to-Phil-full-text.pdf))_

The whole ingestion of the document, from reading the input PDF to sensibly splitting its text to computing and storing the sentence embeddings, is handled within LangChain by the code in the two cells below:

In [18]:
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

**(Colab-only) Get the source PDF file**

> You don't need to run the following cell unless you are on a Google Colab notebook:

In [None]:
# Run this cell if on a Google Colab:
!mkdir -p sources
!curl -L \
    "https://github.com/awesome-astra/datasets/blob/main/demo-resources/what-is-philosophy/what-is-philosophy.pdf?raw=true" \
    -o "sources/what-is-philosophy.pdf"

#### Load the PDF file in the vector store:

In [19]:
# Splitting parameters for the PDF text.
CHUNK_SIZE = 512
CHUNK_OVERLAP = 64

# Read the PDF and split its text into chunks in one go.
pdf_loader = PyPDFLoader("sources/what-is-philosophy.pdf")
splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)
docs_from_pdf = pdf_loader.load_and_split(text_splitter=splitter)
print(f"Documents from PDF: {len(docs_from_pdf)}.")

# Store the chunks in the vector store.
inserted_ids_from_pdf = vstore.add_documents(docs_from_pdf)
print(f"Inserted {len(inserted_ids_from_pdf)} documents.")

Documents from PDF: 38.
Inserted 38 documents.


We build the chain with LCEL (the LangChain Expression Language); the resulting chain is ready to be served, e.g. through `langchain serve`, among other delivery methods:

In [20]:
from langchain.chat_models import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
from langchain.schema.runnable import RunnablePassthrough
from langchain.schema.output_parser import StrOutputParser

In [21]:
# Retriever over the vector store, configured to fetch 3 documents per query.
retriever = vstore.as_retriever(search_kwargs={"k": 3})

# Prompt with two placeholders: {context} (retrieved documents) and {question}.
philo_template = """
You are a philosopher that draws inspiration from great thinkers of the past
to craft well-thought answers to user questions. Use the provided context as the basis
for your answers and do not make up new reasoning paths - just mix-and-match what you are given.
Your answers must be concise and to the point, and refrain from answering about other topics than philosophy.

CONTEXT:
{context}

QUESTION: {question}

YOUR ANSWER:"""
philo_prompt = ChatPromptTemplate.from_template(philo_template)

llm = ChatOpenAI()

# First step of the chain: the incoming question goes to the retriever (to fill
# "context") and is also passed through unchanged (to fill "question").
chain_inputs = {"context": retriever, "question": RunnablePassthrough()}

# LCEL pipeline: inputs -> prompt -> chat model -> plain-string answer.
chain = chain_inputs | philo_prompt | llm | StrOutputParser()

In [22]:
# Ask the chain a question; the bare last expression displays the answer.
question = "How does Russel elaborate on Peirce's idea of the security blanket?"
chain.invoke(question)

"Russell elaborates on Peirce's idea of the security blanket by describing the intellectual consequences of this paradox. He states that a person without any philosophy is imprisoned in the prejudices derived from common sense and the habitual beliefs of their age or nation. This implies that clinging to comforting beliefs can limit one's intellectual growth and hinder their ability to critically examine and question their own beliefs."

## Cleanup

Let us completely delete the collection, thereby freeing the associated resources on Astra DB:

> _Warning: use with caution. Data loss!_

In [23]:
# Delete the whole collection behind the store, not just its documents (data loss!).
vstore.delete_collection()