# Astra DB and LangChain, quickstart / integration guide

In [27]:
# TEMPORARY INSTALL: this will become "pip install langchain[+version specifier]"
!pip install --quiet "git+https://github.com/hemidactylus/langchain@SL-astra-db#egg=langchain&subdirectory=libs/langchain"

! pip install --quiet \
    "astrapy>=0.5.0" \
    "datasets==2.14.6" \
    "openai==0.28.1" \
    "pypdf==3.17.0" \
    "tiktoken==0.5.1"


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.1.2[0m[39;49m -> [0m[32;49m23.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.1.2[0m[39;49m -> [0m[32;49m23.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [1]:
import os
from getpass import getpass

from datasets import load_dataset

from langchain.vectorstores import AstraDB
from langchain.schema import Document
from langchain.embeddings import OpenAIEmbeddings

## Init

Example values:
- API Endpoint: `https://01234567-89ab-cdef-0123-456789abcdef-us-east1.apps.astra.datastax.com`
- Token: `AstraCS:6gBhNmsk135....` (it must have a role of at least "Database Administrator")
- Keyspace (optional): `my_keyspace`
- OpenAI API key: `sk-4fQ3F...`

In [2]:
# Required connection settings: ask only for those not already present in the
# environment. The endpoint is read with input(); the token is a secret and is
# read with getpass() so it is not echoed.
for var_name, prompt_fn in (
    ("ASTRA_DB_API_ENDPOINT", input),
    ("ASTRA_DB_APPLICATION_TOKEN", getpass),
):
    if var_name not in os.environ:
        os.environ[var_name] = prompt_fn(f"{var_name} = ")

# The keyspace is optional: an empty answer leaves the variable unset.
if "ASTRA_DB_KEYSPACE" not in os.environ:
    ks = input("(Optional) ASTRA_DB_KEYSPACE = ")
    if ks:
        os.environ["ASTRA_DB_KEYSPACE"] = ks

In [3]:
# The OpenAI API key is a secret: read it with getpass() (as done for the Astra
# token) so that it is neither echoed on screen nor saved in the cell output.
if 'OPENAI_API_KEY' not in os.environ:
    os.environ["OPENAI_API_KEY"] = getpass("OPENAI_API_KEY = ")

In [4]:
# Embedding model handed to the vector store below.
# NOTE(review): presumably picks up OPENAI_API_KEY from the environment — confirm.
embe = OpenAIEmbeddings()
# Vector store on the "astra_vector_demo" collection, using the token and API
# endpoint collected into the environment by the previous cells.
vstore = AstraDB(
    embedding=embe,
    collection_name="astra_vector_demo",
    token=os.environ["ASTRA_DB_APPLICATION_TOKEN"],
    api_endpoint=os.environ["ASTRA_DB_API_ENDPOINT"],
    namespace=os.environ.get("ASTRA_DB_KEYSPACE"),     # None if no keyspace was given; this will disappear in a typical quickstart
)

## Load a small dataset

In [5]:
# Fetch the quotes dataset and keep its "train" split.
quotes_by_split = load_dataset("datastax/philosopher-quotes")
philo_dataset = quotes_by_split["train"]

# Show one arbitrary entry to illustrate the record layout.
print("An example entry:", philo_dataset[16], sep="\n")

An example entry:
{'author': 'aristotle', 'quote': 'Love well, be loved and do something of value.', 'tags': 'love;ethics'}


## Add dataset documents to the vector store

In [6]:
# Turn every dataset entry into a Document: the quote is the text, while the
# author and each of the ";"-separated tags (stored as tag -> "y") are metadata.
docs = []
for entry in philo_dataset:
    tags = entry["tags"].split(";") if entry["tags"] else []
    metadata = {"author": entry["author"], **{tag: "y" for tag in tags}}
    docs.append(Document(page_content=entry["quote"], metadata=metadata))

# Store all documents; the returned list holds the IDs of the inserted entries.
inserted_ids = vstore.add_documents(docs)
print(f"\nInserted {len(inserted_ids)} documents.")


Inserted 450 documents.


### Alternatively, use `add_texts`

This has the advantage that you can specify the IDs, so that you don't risk duplicating the entries if you run the insertion multiple times.

In [15]:
# One record per quote: (explicit ID, text, metadata).
quote_records = [
    ("desc_01", "I think, therefore I am.", {"author": "descartes", "knowledge": "y"}),
    ("huss_xy", "To the things themselves!", {"author": "husserl", "knowledge": "y"}),
]

# add_texts takes parallel lists, so split the records column by column.
texts = [text for _id, text, _meta in quote_records]
metadatas = [meta for _id, _text, meta in quote_records]
ids = [rec_id for rec_id, _text, _meta in quote_records]

inserted_ids_2 = vstore.add_texts(texts=texts, metadatas=metadatas, ids=ids)
print(f"\nInserted {len(inserted_ids_2)} documents.")


Inserted 2 documents.


## ANN, simple

In [8]:
# Plain similarity search: ask the store for the k=3 entries closest to the query text.
results = vstore.similarity_search("Our life is what we make of it", k=3)
# Each result carries the quote text (page_content) and the metadata stored with it.
for res in results:
    print(f"* {res.page_content} [{res.metadata}]")

* We are what we are because we have been what we have been. [{'author': 'freud', 'history': 'y'}]
* We become what we contemplate. [{'author': 'plato', 'knowledge': 'y', 'ethics': 'y'}]
* In the blessings as well as in the ills of life, less depends upon what befalls us than upon the way in which it is met. [{'author': 'schopenhauer', 'knowledge': 'y', 'ethics': 'y'}]


### Similarity values

In [9]:
# Same search as the plain similarity search, but each hit comes as a
# (document, similarity score) pair.
results = vstore.similarity_search_with_score("Our life is what we make of it", k=3)
for res, score in results:
    # ".6f" prints six decimals. The previous spec "3f" meant "minimum width 3,
    # default precision", which yields the same text but reads like a typo.
    print(f"* [SIM={score:.6f}] {res.page_content} [{res.metadata}]")

* [SIM=0.934118] We are what we are because we have been what we have been. [{'author': 'freud', 'history': 'y'}]
* [SIM=0.932172] We become what we contemplate. [{'author': 'plato', 'knowledge': 'y', 'ethics': 'y'}]
* [SIM=0.928756] In the blessings as well as in the ills of life, less depends upon what befalls us than upon the way in which it is met. [{'author': 'schopenhauer', 'knowledge': 'y', 'ethics': 'y'}]


## ANN, with metadata

In [10]:
# Similarity search restricted by a metadata filter: only entries whose
# "author" metadata field equals "aristotle" are considered.
results = vstore.similarity_search(
    "Our life is what we make of it",
    k=3,
    filter={"author": "aristotle"},
)
# Print the quote text and the stored metadata of each match.
for res in results:
    print(f"* {res.page_content} [{res.metadata}]")

* The quality of life is determined by its activities. [{'author': 'aristotle'}]
* You are what you repeatedly do [{'author': 'aristotle'}]
* You are what you do repeatedly. [{'author': 'aristotle'}]


## MMR search

In [11]:
# Max-marginal-relevance (MMR) search with the same query and author filter
# used for the filtered similarity search.
# NOTE(review): MMR presumably trades some similarity for diversity among the
# k results — confirm against the LangChain vector store documentation.
results = vstore.max_marginal_relevance_search(
    "Our life is what we make of it",
    k=3,
    filter={"author": "aristotle"},
)
# Print the quote text and the stored metadata of each match.
for res in results:
    print(f"* {res.page_content} [{res.metadata}]")

* The quality of life is determined by its activities. [{'author': 'aristotle'}]
* Love is composed of a single soul inhabiting two bodies. [{'author': 'aristotle', 'love': 'y'}]
* We must be neither cowardly nor rash but courageous. [{'author': 'aristotle', 'ethics': 'y', 'knowledge': 'y'}]


## Deleting documents

### By document ID

These methods return True if and only if the deletion succeeds completely (in the multiple-ID case, if all IDs result in a successful deletion):

In [16]:
# Delete, in a single call, the two documents that were inserted with explicit IDs.
all_succeed = vstore.delete(inserted_ids_2)
# Deleting the first of those IDs again demonstrates the "failure" return value.
single_succeeds = vstore.delete_by_document_id(inserted_ids_2[0])  # this will fail since we just deleted it...
print(f"all_succeed={all_succeed}, single_succeeds={single_succeeds}.")

all_succeed=True, single_succeeds=False.


### Retrieve and then delete

Sometimes you do not have the IDs at hand, but you may still want to run a search and then delete the documents it returns:

In [21]:
# Run a search that also returns document IDs, show each hit and collect its ID.
ids_to_delete = []
for res_doc, res_score, res_id in vstore.similarity_search_with_score_id(
    "Philosophy has no goals",
    k=2,
):
    # ".6f" prints six decimals. The previous spec "3f" meant "minimum width 3,
    # default precision", which yields the same text but reads like a typo.
    print(f"* [SIM={res_score:.6f}] {res_doc.page_content} [{res_doc.metadata}]")
    ids_to_delete.append(res_id)

# Delete exactly the documents that the search returned.
print(f"Deleting IDs = {ids_to_delete} ...")
success = vstore.delete(ids_to_delete)
print(f"Deletion succeeded = {success}")

* [SIM=0.920229] Philosophy can make people sick. [{'author': 'aristotle', 'politics': 'y'}]
* [SIM=0.920148] For what purpose humanity is there should not even concern us: why you are here, that you should ask yourself: and if you have no ready answer, then set for yourself goals, high and noble goals, and perish in pursuit of them! [{'author': 'nietzsche', 'ethics': 'y', 'knowledge': 'y'}]
Deleting IDs = ['fce604a02e0c40b492f39d6985fd61da', '60036cae6a054fa3a190ad3cadc44bd4'] ...
Deletion succeeded = True


Now try the same search again:

In [22]:
# Repeat the search: the two documents deleted above should no longer appear.
for res_doc, res_score, res_id in vstore.similarity_search_with_score_id(
    "Philosophy has no goals",
    k=2,
):
    # ".6f" prints six decimals. The previous spec "3f" meant "minimum width 3,
    # default precision", which yields the same text but reads like a typo.
    print(f"* [SIM={res_score:.6f}] {res_doc.page_content} [{res_doc.metadata}]")

* [SIM=0.918580] Philosophy is by its nature something esoteric, neither made for the mob nor capable of being prepared for the mob. [{'author': 'hegel'}]
* [SIM=0.916111] The business of philosophy is not to give rules, but to analyze the private judgments of common reason. [{'author': 'kant'}]


### Delete the **whole** store

> _Warning: use with caution. Data loss!_

In [23]:
# Empty the store (data loss!). The same `vstore` object is reused afterwards
# to ingest the PDF documents.
vstore.clear()

## A mini-RAG

The store is now empty. Let us re-populate it, this time by loading from a (locally available) PDF file.

_(The file is an abridged version of a public document found at [this link](https://commons.bellevuecollege.edu/wp-content/uploads/sites/125/2017/04/Intro-to-Phil-full-text.pdf))_

The whole ingestion of the document, from reading the input PDF to sensibly splitting its text to computing and storing the sentence embeddings, is handled within LangChain by the code in the two cells below:

In [25]:
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [31]:
# Source document and chunking parameters for the ingestion.
pdf_path = "sources/what-is-philosophy.pdf"
splitter_options = {"chunk_size": 512, "chunk_overlap": 64}

# Read the PDF and split its text into overlapping chunks.
pdf_loader = PyPDFLoader(pdf_path)
splitter = RecursiveCharacterTextSplitter(**splitter_options)
docs_from_pdf = pdf_loader.load_and_split(text_splitter=splitter)
print(f"Documents from PDF: {len(docs_from_pdf)}.")

# Store the chunks in the vector store.
inserted_ids_from_pdf = vstore.add_documents(docs_from_pdf)
print(f"Inserted {len(inserted_ids_from_pdf)} documents.")

Documents from PDF: 38.
Inserted 38 documents.


We build the chain with LCEL (the LangChain Expression Language); a chain defined this way is ready to be served, e.g. through `langchain serve`, among other delivery methods:

In [32]:
from langchain.chat_models import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
from langchain.schema.runnable import RunnablePassthrough
from langchain.schema.output_parser import StrOutputParser

In [33]:
# Retriever over the vector store, configured with k=3.
retriever = vstore.as_retriever(search_kwargs={"k": 3})

# Prompt with two placeholders: the retrieved context and the user question.
philo_template = """
You are a philosopher that draws inspiration from great thinkers of the past
to craft well-thought answers to user questions. Use the provided context as the basis
for your answers and do not make up new reasoning paths - just mix-and-match what you are given.
Your answers must be concise and to the point, and refrain from answering about other topics than philosophy.

CONTEXT:
{context}

QUESTION: {question}

YOUR ANSWER:"""

philo_prompt = ChatPromptTemplate.from_template(philo_template)

llm = ChatOpenAI()

# The input question is forwarded unchanged as "question" and also fed to the
# retriever, whose output fills "context".
chain_inputs = {"context": retriever, "question": RunnablePassthrough()}

# Full pipeline: inputs -> prompt -> chat model -> plain string.
chain = chain_inputs | philo_prompt | llm | StrOutputParser()

In [34]:
# Run the whole chain (retrieval -> prompt -> chat model -> string parser) on a
# sample question; the resulting answer is displayed as the cell output.
chain.invoke("Would you say that science is a part of philosophy? That art is?")

'Yes, science is a part of philosophy as it addresses questions about the limits of human inquiry and knowledge, which are philosophical questions. Art, on the other hand, is not explicitly mentioned in the provided context, but it can be considered as a form of human expression and creativity, which can also be explored through philosophical inquiry.'

## Cleanup

Let us completely delete the collection, thereby freeing the associated resources on Astra DB:

> _Warning: use with caution. Data loss!_

In [35]:
# Delete the collection used throughout this notebook ("astra_vector_demo")
# together with all the data it contains.
vstore.delete_collection()