In [None]:
import add_packages
import config
from pprint import pprint
import my_langchain
from my_langchain import (document_loaders, vector_stores, text_embedding_models,
                          text_splitters, utils)

# FAISS

In [None]:
from my_langchain import (document_loaders, vector_stores, text_embedding_models,
                          text_splitters)

# Ingestion: load the raw text, split it into chunks, embed the chunks, index them.
loader = document_loaders.text_loader("../data/state_of_the_union.txt")
documents = loader.load()

# Split with chunk_size=1000 and no overlap between chunks.
text_splitter = text_splitters.character_text_splitter(chunk_size=1000, chunk_overlap=0)
docs = text_splitter.split_documents(documents)

# Build the FAISS store from the chunks and expose it through a retriever.
embeddings = text_embedding_models.openai_embeddings()
db = vector_stores.faiss_store.from_documents(docs, embeddings)
retriever = db.as_retriever()

## Querying
 

In [None]:
query = "What did the president say about Ketanji Brown Jackson"

# Two ways to run the same lookup. Each result gets its own name so the direct
# search is no longer overwritten before it can be inspected; `docs` still ends
# up holding the retriever result, exactly as before.
docs_from_search = db.similarity_search(query)  # straight against the vector store
docs = retriever.invoke(query)  # through the retriever built from the same store

## Similarity Search with score

In [None]:
# Run the query defined earlier through the *_with_score variant and print the result.
# NOTE(review): whether the score is a distance or a similarity is not visible here — confirm.
docs_scored = db.similarity_search_with_score(query)
pprint(docs_scored)

In [None]:
# Embed the query explicitly, then search with the resulting vector instead of the text.
query_embedding = embeddings.embed_query(query)
# NOTE(review): the variable says "scored", but this is not the *_with_score
# variant — presumably it returns documents without scores; confirm.
docs_scored = db.similarity_search_by_vector(query_embedding)
pprint(docs_scored)

## Saving and loading


In [None]:
def save_faiss_index(path: str, db):
  """Persist a vector store to disk.

  Args:
    path: Target location handed to ``db.save_local``.
    db: Store object exposing ``save_local(path)``.
  """
  db.save_local(path)

def load_faiss_index(path: str, embeddings):
  """Load a store previously written to ``path``.

  Args:
    path: Location passed to ``vector_stores.faiss_store.load_local``.
    embeddings: Embedding model passed alongside the path.

  Returns:
    Whatever ``vector_stores.faiss_store.load_local`` returns.
  """
  # NOTE(review): recent langchain-community releases reportedly require
  # allow_dangerous_deserialization=True here — confirm against the installed version.
  return vector_stores.faiss_store.load_local(path, embeddings)

# Location of the persisted index; the loading cell reads from the same `path`.
path = "../data/store/faiss_index"
save_faiss_index(path, db)

In [None]:
# Round trip: reload the index from `path` and run the same query against the copy.
# Note that `docs` is rebound to the search result here.
db_load = load_faiss_index(path, embeddings)
docs = db_load.similarity_search(query)
pprint(docs)

## Merging


In [None]:
# Two single-text stores to merge.
db1 = vector_stores.faiss_store.from_texts(["foo"], embeddings)
db2 = vector_stores.faiss_store.from_texts(["bar"], embeddings)

# Docstore contents of each store before the merge, one per line.
print(db1.docstore.__dict__, db2.docstore.__dict__, sep="\n")

# Merge db2 into db1, then show db1's docstore again.
db1.merge_from(db2)
print(db1.docstore.__dict__)

## Similarity Search with filtering

In [None]:
from my_langchain import documents

# Eight tiny records over four pages: every page has one "foo" text and one
# text that starts with "bar".
docs_info = [
  {"content": text, "metadata": {"page": page}}
  for page, text in (
    (1, "foo"),
    (1, "bar"),
    (2, "foo"),
    (2, "barbar"),
    (3, "foo"),
    (3, "bar burr"),
    (4, "foo"),
    (4, "bar bruh"),
  )
]

# Wrap each record in a Document.
docs_lst = [
  documents.Document(page_content=info["content"], metadata=info["metadata"])
  for info in docs_info
]

# Index the toy corpus (this rebinds `db`) and run an unfiltered scored search.
db = vector_stores.faiss_store.from_documents(docs_lst, embeddings)
results_scored = db.similarity_search_with_score("foo")

for doc, score in results_scored:
  print(f"Content: {doc.page_content}\nMetadata: {doc.metadata}, Score: {score}\n")

In [None]:
# Search with the metadata filter page=1. fetch_k is kept larger than k
# (the original inline note read "> k").
results = db.similarity_search("foo", filter={"page": 1}, k=1, fetch_k=4)
results

In [None]:
# Scored search with the metadata filter page=1, printed one hit per block.
results_scored = db.similarity_search_with_score("foo", filter={"page": 1})
for doc, score in results_scored:
  print(f"Content: {doc.page_content}\nMetadata: {doc.metadata}, Score: {score}\n")

In [None]:
# Max-marginal-relevance search with the same page=1 metadata filter.
results = db.max_marginal_relevance_search("foo", filter={"page": 1})
results

# Qdrant

## Vector store

In [None]:
# Embedding model used by the Qdrant examples below (rebinds `embeddings`).
embeddings = text_embedding_models.openai_embeddings()

In [None]:
# OPTION — presumably an optional step: reload and re-split the source text with
# the same loader and splitter parameters used in the FAISS section.
# NOTE(review): `documents` was bound to the my_langchain `documents` module by an
# earlier import; the assignment below rebinds it to the loaded list.
loader = document_loaders.text_loader("../data/state_of_the_union.txt")
documents = loader.load()

text_splitter = text_splitters.character_text_splitter(chunk_size=1000, chunk_overlap=0)
docs = text_splitter.split_documents(documents)

### Connecting to Qdrant from LangChain


#### Local mode


##### In-memory

In [None]:
# Index the chunks into a Qdrant collection named "my_documents".
qdrant_memory = vector_stores.qdrant_store.from_documents(
  docs,
  embeddings,
  location=":memory:",  # Local mode with in-memory storage only
  collection_name="my_documents",
)

##### On-disk storage

In [None]:
# Same chunks and collection name, but stored under a local directory via `path`.
# This store is the one the search examples further down query.
qdrant_disk = vector_stores.qdrant_store.from_documents(
  docs,
  embeddings,
  path="../data/store/local_qdrant",
  collection_name="my_documents",
  # Custom payload keys for the page text and for the metadata.
  # NOTE(review): metadata filters presumably need the "my_meta" prefix — confirm.
  content_payload_key="my_page_content_key",
  metadata_payload_key="my_meta",
  # NOTE(review): presumably drops any existing collection so re-runs start fresh — confirm.
  force_recreate=True,
)

#### On-premise server deployment


In [None]:
"""
Whether launching Qdrant locally via Docker or opting for a Kubernetes deployment
with the official Helm chart, connecting to the instance remains the same - 
provide a URL pointing to the service.
"""

"""
url = "<---qdrant url here --->"
qdrant = Qdrant.from_documents(
    docs,
    embeddings,
    url=url,
    prefer_grpc=True,
    collection_name="my_documents",
)
"""

#### Qdrant Cloud


#### Recreating the collection


In [None]:
"""
url = "<---qdrant url here --->"
qdrant = Qdrant.from_documents(
    docs,
    embeddings,
    url=url,
    prefer_grpc=True,
    collection_name="my_documents",
    force_recreate=True, # Delete the old collection, enabling a fresh start.
)
"""

### Similarity search


In [None]:
# Plain similarity search against the on-disk Qdrant store, printed in full.
query = "What did the president say about Ketanji Brown Jackson"
found_docs = qdrant_disk.similarity_search(query)
pprint(found_docs)

### Similarity search with score


In [None]:
# Same query through the *_with_score variant; the result is stored, not displayed.
query = "What did the president say about Ketanji Brown Jackson"
found_docs = qdrant_disk.similarity_search_with_score(query)

#### Metadata filtering


In [None]:
from qdrant_client.http import models as rest

query = "What did the president say about Ketanji Brown Jackson"

# The previous `rest.Filter(...)` passed a bare Ellipsis positionally, which the
# pydantic-based Filter model rejects, so the cell raised before searching.
# Use a real condition instead. qdrant_disk was created with
# metadata_payload_key="my_meta", hence the "my_meta." prefix on the key.
# NOTE(review): assumes the loader records the file path as `source` metadata —
# confirm; if it does not, the search simply returns no hits.
source_filter = rest.Filter(
  must=[
    rest.FieldCondition(
      key="my_meta.source",
      match=rest.MatchValue(value="../data/state_of_the_union.txt"),
    ),
  ],
)
found_docs = qdrant_disk.similarity_search_with_score(query, filter=source_filter)

### Maximum marginal relevance search (MMR)


In [None]:
# Max-marginal-relevance search on the on-disk store with k=2 and fetch_k=10.
# NOTE(review): presumably fetch_k candidates are fetched and k returned — confirm.
query = "What did the president say about Ketanji Brown Jackson"
found_docs = qdrant_disk.max_marginal_relevance_search(
  query, k=2, fetch_k=10,
)

### Qdrant as a Retriever


In [None]:
# Wrap the on-disk store in a retriever configured for MMR search
# (this rebinds `retriever`).
retriever = qdrant_disk.as_retriever(search_type="mmr")

query = "What did the president say about Ketanji Brown Jackson"
# `invoke` is the current retriever entry point and the one the FAISS section
# already uses; `get_relevant_documents` is deprecated in langchain-core.
results = retriever.invoke(query)

## Retriever

In [None]:
from langchain_community.vectorstores import qdrant
from langchain_core.documents import Document
from langchain.chains.query_constructor.base import AttributeInfo
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain_openai import OpenAI

# Six movie summaries used by the self-query retriever example (rebinds `docs`).
# The metadata is deliberately uneven: not every entry has "genre", "director"
# or "rating", while every entry has "year".
docs = [
  Document(
    page_content="A bunch of scientists bring back dinosaurs and mayhem breaks loose",
    metadata={"year": 1993, "rating": 7.7, "genre": "science fiction"},
  ),
  Document(
    page_content="Leo DiCaprio gets lost in a dream within a dream within a dream within a ...",
    metadata={"year": 2010, "director": "Christopher Nolan", "rating": 8.2},
  ),
  Document(
    page_content="A psychologist / detective gets lost in a series of dreams within dreams within dreams and Inception reused the idea",
    metadata={"year": 2006, "director": "Satoshi Kon", "rating": 8.6},
  ),
  Document(
    page_content="A bunch of normal-sized women are supremely wholesome and some men pine after them",
    metadata={"year": 2019, "director": "Greta Gerwig", "rating": 8.3},
  ),
  Document(
    page_content="Toys come alive and have a blast doing so",
    metadata={"year": 1995, "genre": "animated"},
  ),
  Document(
    page_content="Three men walk into the Zone, three men walk out of the Zone",
    metadata={
      "year": 1979,
      "rating": 9.9,
      "director": "Andrei Tarkovsky",
      "genre": "science fiction",
    },
  ),
]

# Index the movie summaries into an on-disk Qdrant collection.
# force_recreate=True keeps the cell re-runnable: without it, a second run would
# reuse the existing collection at this path and add the same documents again.
vectorstore = qdrant.Qdrant.from_documents(
  docs,
  embeddings,
  path="../data/store/qdrant_movie",
  collection_name="qdrant_movie",
  force_recreate=True,
)

# One AttributeInfo per metadata field, listed as (name, description, type).
metadata_field_info = [
  AttributeInfo(name=field_name, description=field_description, type=field_type)
  for field_name, field_description, field_type in (
    ("genre", "The genre of the movie", "string or list[string]"),
    ("year", "The year the movie was released", "integer"),
    ("director", "The name of the movie director", "string"),
    ("rating", "A 1-10 rating for the movie", "float"),
  )
]

# Description of the document text passed to the retriever as `document_contents`.
document_content_description = "Brief summary of a movie"
llm = OpenAI(temperature=0)

# Self-query retriever over the movie collection (rebinds `retriever`).
retriever = SelfQueryRetriever.from_llm(
  llm=llm,
  vectorstore=vectorstore,
  document_contents=document_content_description,
  metadata_field_info=metadata_field_info,
  verbose=True,
)

# `invoke` replaces the deprecated `get_relevant_documents` and matches the call
# style used for the FAISS retriever earlier in the notebook.
results = retriever.invoke("What are some movies about dinosaurs")


# Chroma