From d88038acebe8700c24908637fa49e0259ebd60c5 Mon Sep 17 00:00:00 2001 From: Ping Xie Date: Thu, 8 Feb 2024 22:17:52 -0800 Subject: [PATCH 1/9] feat(vector_store): Implement RedisVectorStore class This commit introduces the RedisVectorStore class, extending VectorStore to enable vector storage and retrieval using Memorystore Redis. --- .../__init__.py | 1 + .../vector_store.py | 721 ++++++++++++++++++ 2 files changed, 722 insertions(+) create mode 100644 src/langchain_google_memorystore_redis/vector_store.py diff --git a/src/langchain_google_memorystore_redis/__init__.py b/src/langchain_google_memorystore_redis/__init__.py index 6d5e14b..b2d7e33 100644 --- a/src/langchain_google_memorystore_redis/__init__.py +++ b/src/langchain_google_memorystore_redis/__init__.py @@ -11,3 +11,4 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +from .vector_store import FLATConfig, HNSWConfig, RedisVectorStore diff --git a/src/langchain_google_memorystore_redis/vector_store.py b/src/langchain_google_memorystore_redis/vector_store.py new file mode 100644 index 0000000..7e5d35a --- /dev/null +++ b/src/langchain_google_memorystore_redis/vector_store.py @@ -0,0 +1,721 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import json +import logging +import operator +import re +import uuid +from abc import ABC +from enum import Enum, auto +from itertools import zip_longest +from typing import Any, Iterable, List, Optional, Tuple + +import numpy as np +import redis +from langchain_community.vectorstores.utils import ( + DistanceStrategy, + maximal_marginal_relevance, +) +from langchain_core._api import deprecated +from langchain_core.callbacks import CallbackManagerForRetrieverRun +from langchain_core.documents import Document +from langchain_core.embeddings import Embeddings +from langchain_core.utils import get_from_dict_or_env +from langchain_core.vectorstores import VectorStore, VectorStoreRetriever + +# Setting up a basic logger +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) +handler = logging.StreamHandler() +formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s") +handler.setFormatter(formatter) +logger.addHandler(handler) + + +class IndexConfig(ABC): + """ + Base configuration class for all types of indexes. + """ + + def __init__( + self, + name: str, + field_name: str, + type: str, + ): + """ + Initializes the IndexConfig object. + + Args: + name (str): A unique identifier for the index. This name is used to + distinguish the index within the storage system, enabling targeted + operations such as updates, queries, and deletions. + field_name (str): Specifies the name of the field within the data that + will be indexed. This field name directs the indexing process to + the relevant part of the data structure. + type (str): Indicates the type of index to be created, which determines + the underlying algorithm or structure used for indexing and search + operations. Examples include "HNSW" for hierarchical navigable small + world indexes and "FLAT" for brute-force search indexes. + data_type (str, optional): Defines the data type of the elements within + the vector being indexed, such as "FLOAT32" for 32-bit floating-point + numbers. This parameter is crucial for ensuring that the index + accommodates the vector data appropriately. Defaults to "FLOAT32". + + """ + self.name = name + self.field_name = field_name + self.type = type + + +class VectorIndexConfig(IndexConfig): + SUPPORTED_DISTANCE_STRATEGIES = { + DistanceStrategy.COSINE, + DistanceStrategy.EUCLIDEAN_DISTANCE, + DistanceStrategy.MAX_INNER_PRODUCT, + } + + def __init__( + self, + name: str, + field_name: str, + type: str, + distance_strategy: DistanceStrategy, + vector_size: int, + data_type: str, + ): + """ + Initializes the VectorIndexConfig object. + + Args: + name (str): The unique name for the vector index. This name is used to + identify and reference the index within the vector storage system. + field_name (str): The name of the field in the data structure that contains + the vector data to be indexed. This specifies the target data for indexing. + type (str): The type of vector index. This parameter determines the indexing + algorithm or structure to be used (e.g., "FLAT", "HNSW"). + distance_strategy (DistanceStrategy): Enum specifying the metric used to + calculate the distance or similarity between vectors. Supported strategies + include COSINE, EUCLIDEAN_DISTANCE (L2), and MAX_INNER_PRODUCT (IP), + influencing how search results are ranked and returned. + vector_size (int): The dimensionality of the vectors that will be stored + and indexed. All vectors must conform to this specified size. + data_type (str, optional): The data type of the vector elements (e.g., "float32"). + This specifies the precision and format of the vector data, affecting storage + requirements and possibly search performance. Defaults to "float32". + """ + if distance_strategy not in self.SUPPORTED_DISTANCE_STRATEGIES: + supported_strategies = ", ".join( + [ds.value for ds in self.SUPPORTED_DISTANCE_STRATEGIES] + ) + raise ValueError( + f"Unsupported distance strategy: {distance_strategy}. " + f"Supported strategies are: {supported_strategies}." + ) + + super().__init__(name, field_name, type) + self.distance_strategy = distance_strategy + self.vector_size = vector_size + self.data_type = data_type + + @property + def distance_metric(self): + mapping = { + DistanceStrategy.EUCLIDEAN_DISTANCE: "L2", + DistanceStrategy.MAX_INNER_PRODUCT: "IP", + DistanceStrategy.DOT_PRODUCT: "IP", + DistanceStrategy.COSINE: "COSINE", + } + return mapping[self.distance_strategy] + + +class HNSWConfig(VectorIndexConfig): + """ + Configuration class for HNSW (Hierarchical Navigable Small World) vector indexes. + """ + + def __init__( + self, + name: str, + field_name: str, + vector_size: int, + distance_strategy: DistanceStrategy = DistanceStrategy.COSINE, + initial_cap: int = 10000, + m: int = 16, + ef_construction: int = 200, + ef_runtime: int = 10, + ): + """ + Initializes the HNSWConfig object. + + Args: + name (str): The unique name for the vector index, serving as an identifier + within the vector store or database system. + field_name (str): The name of the field in the dataset that holds the vector + data to be indexed. This specifies which part of the data structure is + used for indexing and searching. + vector_size (int): The dimensionality of the vectors that the index will + accommodate. All vectors must match this specified size. + distance_strategy (DistanceStrategy): The metric used for calculating + distances or similarities between vectors, influencing how search results + are ranked. Defaults to `DistanceStrategy.COSINE`. + initial_cap (int): Specifies the initial capacity of the index in terms of + the number of vectors it can hold, impacting the initial memory allocation. + Defaults to 10000. + m (int): Determines the maximum number of outgoing edges each node in the + index graph can have, directly affecting the graph's connectivity and + search performance. Defaults to 16. + ef_construction (int): Controls the size of the dynamic candidate list during + the construction of the index, influencing the index build time and quality. + Defaults to 200. + ef_runtime (int): Sets the size of the dynamic candidate list during search + queries, balancing between search speed and accuracy. Defaults to 10. + + """ + super().__init__( + name, field_name, "HNSW", distance_strategy, vector_size, "FLOAT32" + ) + self.initial_cap = initial_cap + self.m = m + self.ef_construction = ef_construction + self.ef_runtime = ef_runtime + + +class FLATConfig(VectorIndexConfig): + """ + Configuration class for FLAT vector indexes, utilizing brute-force search. + """ + + def __init__( + self, + name: str, + field_name: str, + vector_size: int, + distance_strategy: DistanceStrategy = DistanceStrategy.COSINE, + ): + """ + Initializes the FLATConfig object. + + Args: + name (str): The unique name for the vector index. This name is used + to identify the index within the vector store or database. + field_name (str): The name of the field that contains the vector data + to be indexed. This field should exist within the data structure + that stores the documents or vectors. + vector_size (int): Specifies the dimensionality of the vectors to be + indexed. All vectors added to this index must conform to this size. + distance_strategy (DistanceStrategy, optional): Determines the metric + used to calculate the distance or similarity between vectors during + search operations. Defaults to `DistanceStrategy.COSINE`, which + measures the cosine similarity between vectors. + """ + super().__init__( + name, field_name, "FLAT", distance_strategy, vector_size, "FLOAT32" + ) + + +class RedisVectorStore(VectorStore): + + DEFAULT_CONTENT_FIELD = "page_content" + DEFAULT_VECTOR_FIELD = "vector" + DEFAULT_DATA_TYPE = "float32" + + def __init__( + self, + client: redis.Redis, + index_name: str, + embedding_service: Embeddings, + key_prefix: Optional[str] = None, + content_field: str = DEFAULT_CONTENT_FIELD, + vector_field: str = DEFAULT_VECTOR_FIELD, + ): + """ + Initialize a RedisVectorStore instance. + + Args: + client (redis.Redis): The Redis client instance to be used for database + operations, providing connectivity and command execution against the + Redis instance. + index_name (str): The name assigned to the vector index within Redis. This + name is used to identify the index for operations such as searching and + indexing. + embedding_service (Embeddings): An instance of an embedding service or model + capable of generating vector embeddings from document content. This + service is utilized to convert text documents into vector representations + for storage and search. + key_prefix (Optional[str], optional): An optional prefix for Redis HASH keys + that are to be included in the vector index. This allows for selective + indexing of documents based on their keys. If None, all HASH keys in the + Redis database are considered for indexing. Defaults to None. + content_field (str, optional): The field within the Redis HASH where document + content is stored. This field is read to obtain document text for + embedding during indexing operations. Defaults to 'page_content', which + can be overridden if document content is stored under a different field. + vector_field (str, optional): The field within the Redis HASH designated for + storing the vector embedding of the document. This field is used both + when adding new documents to the store and when retrieving or searching + documents based on their vector embeddings. Defaults to 'vector'. + """ + self.client = client + self.index_name = index_name + self.embedding_service = embedding_service + self.key_prefix = key_prefix + ":" if key_prefix is not None else "" + self.content_field = content_field + self.vector_field = vector_field + + # Helper function to check if a string is JSON parsable + @staticmethod + def _is_json_parsable(s: str) -> bool: + try: + json.loads(s) + return True + except ValueError: + return False + + @staticmethod + def init_index( + client: redis.Redis, index_config: IndexConfig, key_prefix: Optional[str] = None + ): + """ + Initializes a named VectorStore index in Redis with specified configurations. + """ + if not isinstance(index_config, HNSWConfig): + raise ValueError("index_config must be an instance of HNSWConfig") + + # Use the index name if no key_prefix is provided + key_prefix = key_prefix + ":" if key_prefix is not None else index_config.name + + # Preparing the command string to avoid long lines + command = ( + f"FT.CREATE {index_config.name} ON HASH PREFIX 1 {key_prefix} " + f"SCHEMA {index_config.field_name} VECTOR {index_config.type} " + f"6 TYPE {index_config.data_type} DIM {index_config.vector_size} " + f"DISTANCE_METRIC {index_config.distance_metric}" + ) + + try: + client.execute_command(command) + except redis.exceptions.ResponseError as e: + if re.match(r"Redis module key \w+ already exists", str(e)): + logger.info("Index already exists, skipping creation.") + else: + raise + + # TODO: When "Redis module key \w+ already exists" is caught, we should + # call FT.INFO to validate if the index properties match the arguments. + # If not, an exception should be thrown. This check is pending support + # for FT.INFO in the client library. + + @staticmethod + def drop_index(client: redis.Redis, index_name: str, index_only: bool = False): + """ + Drops an index from the Redis database. Optionally, it can also delete + the documents associated with the index. + + Args: + client (Redis): The Redis client instance used to connect to the database. + This client provides the necessary commands to interact with the database. + index_name (str): The name of the index to be dropped. This name must exactly + match the name of the existing index in the Redis database. + index_only (bool, optional): A flag indicating whether to drop only the index + structure (True) or to also delete the documents associated with the index (False). + Defaults to False, implying that both the index and its documents will be deleted. + + Raises: + redis.RedisError: If any Redis-specific error occurs during the operation. This + includes connection issues, authentication failures, or errors from executing + the command to drop the index. Callers should handle these exceptions to + manage error scenarios gracefully. + """ + command = ( + f"FT.DROPINDEX {index_name} {'KEEPDOCS' if index_only else ''}".strip() + ) + client.execute_command(command) + + def add_texts( + self, + texts: Iterable[str], + metadatas: Optional[List[dict]] = None, + batch_size: int = 1000, + **kwargs: Any, + ) -> List[str]: + """ + Adds a collection of texts along with their metadata to a vector store, + generating unique keys for each entry if not provided. + + Args: + texts (Iterable[str]): An iterable collection of text documents to be added to the vector store. + metadatas (Optional[List[dict]], optional): An optional list of metadata dictionaries, + where each dictionary corresponds to a text document in the same order as the `texts` iterable. + Each metadata dictionary should contain key-value pairs representing the metadata attributes + for the associated text document. + batch_size (int, optional): The number of documents to process in a single batch operation. + This parameter helps manage memory and performance when adding a large number of documents. + Defaults to 1000. + **kwargs (Any): Additional keyword arguments for extended functionality. This includes: + - 'keys' or 'ids' (List[str], optional): Custom identifiers for each document. If provided, + the length of this list should match the length of `texts`. If not provided, the system + will generate unique identifiers. + + Returns: + List[str]: A list containing the unique keys or identifiers for each added document. These keys + can be used to retrieve or reference the documents within the vector store. + + Note: + If both 'keys' (or 'ids') and 'metadatas' are provided, they must be of the same length as the + `texts` iterable to ensure each document is correctly associated with its metadata and identifier. + """ + # Generate or extend keys/IDs for the documents + keys_or_ids = kwargs.get("keys", kwargs.get("ids", [])) + # Ensure there's a unique ID for each text document + keys_or_ids = (keys_or_ids + [str(uuid.uuid4()) for _ in texts])[ + len(keys_or_ids) : + ] + # Fallback for empty metadata + metadatas = metadatas if metadatas is not None else [{} for _ in texts] + # Generate embeddings for all documents + embeddings = self.embedding_service.embed_documents(list(texts)) + + ids = [] + pipeline = self.client.pipeline(transaction=False) + for i, bundle in enumerate( + zip_longest(keys_or_ids, texts, embeddings, metadatas), start=1 + ): + key, text, embedding, metadata = bundle + key = self.key_prefix + key + + # Initialize the mapping with content and vector fields + mapping = { + self.content_field: text, + self.vector_field: np.array(embedding) + .astype(self.DEFAULT_DATA_TYPE) + .tobytes(), + } + + # Process metadata: directly add non-dict items, JSON-serialize dict items + for meta_key, meta_value in metadata.items(): + if isinstance(meta_value, dict): + # If the value is a dict, JSON-serialize it + mapping[meta_key] = json.dumps(meta_value) + else: + # Directly add non-dict items + mapping[meta_key] = str(meta_value) + + # Add the document to the Redis hash + pipeline.hset(key, mapping=mapping) + ids.append(key) + + # Ensure to execute any remaining commands in the pipeline after the loop + if i % batch_size != 0: + pipeline.execute() + + # Final execution to catch any remaining items in the pipeline + pipeline.execute() + + return ids + + @classmethod + def from_texts( + cls, + texts: List[str], + embedding: Embeddings, + metadatas: Optional[List[dict]] = None, + **kwargs: Any, + ) -> "RedisVectorStore": + """ + Creates an instance of RedisVectorStore from provided texts. + + Args: + texts (List[str]): A list of text documents to be embedded and indexed. + embedding (Embeddings): An instance capable of generating embeddings for the + provided text documents. + metadatas (Optional[List[dict]]): A list of dictionaries where each dictionary + contains metadata corresponding to each text document in `texts`. If provided, + the length of `metadatas` must match the length of `texts`. + **kwargs (Any): Additional keyword arguments that can include: + - 'client': A Redis client instance to be used by the RedisVectorStore. + - 'index_name': The name of the index to be created or used in Redis. + If not provided, a default name may be used. + + Returns: + RedisVectorStore: An instance of RedisVectorStore that has been populated with + the embeddings of the provided texts, along with their associated metadata. + + Raises: + ValueError: If a Redis client instance is not provided in `kwargs`, indicating + that the method cannot proceed without a connection to a Redis database. + """ + + if "client" not in kwargs: + raise ValueError( + "A 'client' must be provided to initialize RedisVectorStore" + ) + + if "index_name" not in kwargs: + raise ValueError( + "A 'index_name' must be provided to initialize RedisVectorStore" + ) + + kwargs_copy = kwargs.copy() + + # Extract 'client' and remove it from kwargs to prevent passing it twice + client = kwargs_copy.pop("client") + index_name = kwargs_copy.pop("index_name") + + # Initialize RedisVectorStore instance + instance = cls( + client, + index_name, + embedding, + **kwargs_copy, + ) + + # Add texts and their corresponding metadata to the instance + instance.add_texts(texts, metadatas) + + return instance + + def delete(self, ids: Optional[List[str]] = None, **kwargs: Any) -> Optional[bool]: + if not ids: # Check if ids list is empty or None + logger.info("No IDs provided for deletion.") + return None # Or False, depending on intended behavior when ids is empty or None + + try: + self.client.delete(*ids) + logger.info("Entries deleted.") + return True + except Exception as e: # It's better to catch specific exceptions + logger.error(f"Failed to delete entries: {e}") + return False + + def _similarity_search_by_vector_with_score_and_embeddings( + self, query_embedding: List[float], k: int = 4, **kwargs: Any + ) -> List[Tuple[Document, float, List[float]]]: + + distance_threshold = kwargs.get( + "distance_threshold", kwargs.get("score_threshold") + ) + + query_k = k + if distance_threshold is not None: + distance_strategy = kwargs.get("distance_strategy", DistanceStrategy.COSINE) + query_k *= 4 # Quadruple k if a distance threshold is specified + + query_args = [ + "FT.SEARCH", + self.index_name, + f"*=>[KNN {query_k} @{self.vector_field} $query_vector AS distance]", + "PARAMS", + 2, + "query_vector", + np.array([query_embedding]).astype(self.DEFAULT_DATA_TYPE).tobytes(), + "DIALECT", + 2, + ] + + initial_results = self.client.execute_command(*query_args) + + # Process the results + final_results: List[Tuple[Document, float, List[float]]] = [] + + if not initial_results: + return final_results + + for i in range(2, len(initial_results), 2): + page_content: str = "" + metadata = {} + distance = 0.0 + embedding: List[float] = [] + for j in range(0, len(initial_results[i]), 2): + key = initial_results[i][j].decode() + value = initial_results[i][j + 1] + if key == self.content_field: + page_content = value.decode() + elif key == self.vector_field: + embedding = np.frombuffer( + value, dtype=self.DEFAULT_DATA_TYPE + ).tolist() + elif key == "distance": + distance = float(value.decode()) + else: + if isinstance(value, bytes) and self._is_json_parsable( + value.decode() + ): + metadata[key] = json.loads(value.decode()) + else: + metadata[key] = value.decode() + + final_results.append( + ( + Document(page_content=page_content, metadata=metadata), + distance, + embedding, + ) + ) + + if distance_threshold is not None: + cmp = ( + operator.ge + if distance_strategy == DistanceStrategy.MAX_INNER_PRODUCT + else operator.le + ) + final_results = [ + (doc, distance, embedding) + for doc, distance, embedding in final_results + if cmp(distance, distance_threshold) + ] + return final_results[:k] + + def _similarity_search_by_vector_with_score( + self, query_embedding: List[float], k: int = 4, **kwargs: Any + ) -> List[Tuple[Document, float]]: + initial_results = self._similarity_search_by_vector_with_score_and_embeddings( + query_embedding, k, **kwargs + ) + # Extract just the Document objects from the search results + return [(doc, embedding) for doc, embedding, _ in initial_results] + + def similarity_search_with_score( + self, + query: str, + k: int = 4, + **kwargs: Any, + ) -> List[Tuple[Document, float]]: + """ + Performs a similarity search using the given query, returning documents and their similarity scores. + + Args: + query (str): The query string to search for. + k (int): The number of closest documents to return. + **kwargs (Any): Additional keyword arguments for future use. + Returns: + List[Tuple[Document, float]]: A ranked list of tuples, each containing a Document object and its + corresponding similarity score. The list includes up to 'k' entries, representing the + documents most relevant to the query according to the similarity scores. + """ + # Embed the query using the embedding function + query_embedding = self.embedding_service.embed_query(query) + return self._similarity_search_by_vector_with_score( + query_embedding, k, **kwargs + ) + + def similarity_search_by_vector( + self, embedding: List[float], k: int = 4, **kwargs: Any + ) -> List[Document]: + """ + Performs a similarity search for the given embedding and returns the top k + most similar Document objects, discarding their similarity scores. + + Args: + embedding (List[float]): The query embedding for the similarity search. + k (int): The number of top documents to return. + **kwargs (Any): Additional keyword arguments to pass to the search. + + Returns: + List[Document]: A list containing up to 'k' Document objects, ranked by their + similarity to the query. These documents represent the most relevant + results found by the search operation, subject to the additional constraints + and configurations specified. + """ + initial_results = self._similarity_search_by_vector_with_score( + embedding, k, **kwargs + ) + # Extract just the Document objects from the search results + return [doc for doc, _ in initial_results] + + def similarity_search( + self, + query: str, + k: int = 4, + **kwargs: Any, + ) -> List[Document]: + """ + Conducts a similarity search based on the specified query, returning a list + of the top 'k' documents that are most similar to the query. + + Args: + query (str): The text query based on which similar documents are to be retrieved. + k (int): Specifies the number of documents to return, effectively setting a limit + on the size of the result set. Defaults to 4. + **kwargs (Any): A flexible argument allowing for the inclusion of additional + search parameters or options. + + Returns: + List[Document]: A list containing up to 'k' Document objects, ranked by their + similarity to the query. These documents represent the most relevant results + found by the search operation, subject to the additional constraints and + configurations specified. + + Raises: + ValueError: If any of the provided search parameters are invalid or if the search + operation encounters an error due to misconfiguration or execution issues within + the search backend. + """ + # Embed the query using the embedding function + query_embedding = self.embedding_service.embed_query(query) + return self.similarity_search_by_vector(query_embedding, k, **kwargs) + + def max_marginal_relevance_search( + self, + query: str, + k: int = 4, + fetch_k: int = 20, + lambda_mult: float = 0.5, + **kwargs: Any, + ) -> List[Document]: + """ + Performs a search to find documents that are both relevant to the query and diverse + among each other based on Maximal Marginal Relevance (MMR). + + The MMR algorithm optimizes a combination of relevance to the query and diversity + among the results, controlled by the lambda_mult parameter. + + Args: + query (str): The query string used to find similar documents. + k (int): The number of documents to return. + fetch_k (int): The number of documents to fetch for consideration. This should + be larger than k to allow for diversity calculation. + lambda_mult (float): Controls the trade-off between relevance and diversity. + Ranges from 0 (max diversity) to 1 (max relevance). + **kwargs: Additional keyword arguments + + Returns: + List[Document]: A list of document objects selected based on maximal marginal relevance. + + Raises: + ValueError: If lambda_mult is not in the range [0, 1]. + """ + # Validate the lambda_mult parameter to ensure it's within the valid range. + if not 0 <= lambda_mult <= 1: + raise ValueError("lambda_mult must be between 0 and 1.") + + # Embed the query using a hypothetical method to convert text to vector. + query_embedding = self.embedding_service.embed_query(query) + + # Fetch initial documents based on query embedding. + initial_results = self._similarity_search_by_vector_with_score_and_embeddings( + query_embedding, + k=fetch_k, + **kwargs, + ) + + inital_embeddings = [embedding for _, _, embedding in initial_results] + + # Calculate MMR to select diverse and relevant documents. + selected_indices = maximal_marginal_relevance( + np.array(query_embedding), inital_embeddings, lambda_mult=lambda_mult, k=k + ) + + return [initial_results[i][0] for i in selected_indices] From 77ae692e8bc3e621a31da044ab183cf171b83fac Mon Sep 17 00:00:00 2001 From: Craig Chi Date: Mon, 12 Feb 2024 14:50:55 -0800 Subject: [PATCH 2/9] feat: add integration test for MemorystoreChatMessageHistory (#13) --- integration.cloudbuild.yaml | 6 +++ tests/test_chat_message_history.py | 67 ++++++++++++++++++++++++++++++ 2 files changed, 73 insertions(+) create mode 100644 tests/test_chat_message_history.py diff --git a/integration.cloudbuild.yaml b/integration.cloudbuild.yaml index c78148e..3edce93 100644 --- a/integration.cloudbuild.yaml +++ b/integration.cloudbuild.yaml @@ -22,3 +22,9 @@ steps: name: python:3.11 entrypoint: python args: ["-m", "pytest"] + env: + - 'REDIS_URL=$_REDIS_URL' + +options: + pool: + name: 'projects/$PROJECT_ID/locations/$LOCATION/workerPools/redis' diff --git a/tests/test_chat_message_history.py b/tests/test_chat_message_history.py new file mode 100644 index 0000000..5094841 --- /dev/null +++ b/tests/test_chat_message_history.py @@ -0,0 +1,67 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import os +import uuid +from langchain_core.messages import AIMessage, BaseMessage, HumanMessage +from langchain_google_memorystore_redis import MemorystoreChatMessageHistory +import redis + + +def test_redis_multiple_sessions() -> None: + client = redis.from_url( + get_env_var("REDIS_URL", "URL of the Redis instance") + ) + + session_id1 = uuid.uuid4().hex + history1 = MemorystoreChatMessageHistory( + client=client, + session_id=session_id1, + ) + session_id2 = uuid.uuid4().hex + history2 = MemorystoreChatMessageHistory( + client=client, + session_id=session_id2, + ) + + history1.add_ai_message("Hey! I am AI!") + history1.add_user_message("Hey! I am human!") + history2.add_user_message("Hey! I am human in another session!") + messages1 = history1.messages + messages2 = history2.messages + + assert len(messages1) == 2 + assert len(messages2) == 1 + assert isinstance(messages1[0], AIMessage) + assert messages1[0].content == "Hey! I am AI!" + assert isinstance(messages1[1], HumanMessage) + assert messages1[1].content == "Hey! I am human!" + assert isinstance(messages2[0], HumanMessage) + assert messages2[0].content == "Hey! I am human in another session!" + + history1.clear() + assert len(history1.messages) == 0 + assert len(history2.messages) == 1 + + history2.clear() + assert len(history1.messages) == 0 + assert len(history2.messages) == 0 + + +def get_env_var(key: str, desc: str) -> str: + v = os.environ.get(key) + if v is None: + raise ValueError(f"Must set env var {key} to: {desc}") + return v From ca95ae8ffa2829361723f3cff6409c454ee52ae6 Mon Sep 17 00:00:00 2001 From: Craig Chi Date: Tue, 13 Feb 2024 11:04:49 -0800 Subject: [PATCH 3/9] fix: fix linter and encoding of MemorystoreChatMessageHistory class. (#20) * fix: fix linter and encoding of MemorystoreChatMessageHistory class. * fix: use relative path for __init__ imports --- .../__init__.py | 2 +- .../chat_message_history.py | 15 +++++++-------- .../vector_store.py | 2 -- tests/test_chat_message_history.py | 8 ++++---- 4 files changed, 12 insertions(+), 15 deletions(-) diff --git a/src/langchain_google_memorystore_redis/__init__.py b/src/langchain_google_memorystore_redis/__init__.py index da471a5..830a5f1 100644 --- a/src/langchain_google_memorystore_redis/__init__.py +++ b/src/langchain_google_memorystore_redis/__init__.py @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from langchain_google_memorystore_redis.chat_message_history import MemorystoreChatMessageHistory +from .chat_message_history import MemorystoreChatMessageHistory from .vector_store import FLATConfig, HNSWConfig, RedisVectorStore __all__ = ["MemorystoreChatMessageHistory"] diff --git a/src/langchain_google_memorystore_redis/chat_message_history.py b/src/langchain_google_memorystore_redis/chat_message_history.py index 2662230..b2c80cf 100644 --- a/src/langchain_google_memorystore_redis/chat_message_history.py +++ b/src/langchain_google_memorystore_redis/chat_message_history.py @@ -13,15 +13,11 @@ # limitations under the License. import json -import redis from typing import List, Optional +import redis from langchain_core.chat_history import BaseChatMessageHistory -from langchain_core.messages import ( - BaseMessage, - message_to_dict, - messages_from_dict, -) +from langchain_core.messages import BaseMessage, message_to_dict, messages_from_dict class MemorystoreChatMessageHistory(BaseChatMessageHistory): @@ -46,13 +42,16 @@ def __init__( self._redis = client self._key = session_id self._ttl = ttl + self._encoder = client.connection_pool.get_encoder() @property - def messages(self) -> List[BaseMessage]: + def messages(self) -> List[BaseMessage]: # type: ignore """Retrieve all messages chronologically stored in this session.""" all_elements = self._redis.lrange(self._key, 0, -1) + + assert isinstance(all_elements, list) messages = messages_from_dict( - [json.loads(e.decode("utf-8")) for e in all_elements] + [json.loads(self._encoder.decode(e)) for e in all_elements] ) return messages diff --git a/src/langchain_google_memorystore_redis/vector_store.py b/src/langchain_google_memorystore_redis/vector_store.py index 7e5d35a..468b08e 100644 --- a/src/langchain_google_memorystore_redis/vector_store.py +++ b/src/langchain_google_memorystore_redis/vector_store.py @@ -227,7 +227,6 @@ def __init__( class RedisVectorStore(VectorStore): - DEFAULT_CONTENT_FIELD = "page_content" DEFAULT_VECTOR_FIELD = "vector" DEFAULT_DATA_TYPE = "float32" @@ -503,7 +502,6 @@ def delete(self, ids: Optional[List[str]] = None, **kwargs: Any) -> Optional[boo def _similarity_search_by_vector_with_score_and_embeddings( self, query_embedding: List[float], k: int = 4, **kwargs: Any ) -> List[Tuple[Document, float, List[float]]]: - distance_threshold = kwargs.get( "distance_threshold", kwargs.get("score_threshold") ) diff --git a/tests/test_chat_message_history.py b/tests/test_chat_message_history.py index 5094841..7a0448a 100644 --- a/tests/test_chat_message_history.py +++ b/tests/test_chat_message_history.py @@ -15,15 +15,15 @@ import os import uuid + +import redis from langchain_core.messages import AIMessage, BaseMessage, HumanMessage + from langchain_google_memorystore_redis import MemorystoreChatMessageHistory -import redis def test_redis_multiple_sessions() -> None: - client = redis.from_url( - get_env_var("REDIS_URL", "URL of the Redis instance") - ) + client = redis.from_url(get_env_var("REDIS_URL", "URL of the Redis instance")) session_id1 = uuid.uuid4().hex history1 = MemorystoreChatMessageHistory( From 1097c08aff5dda4989672271af64f4dd8c3c634d Mon Sep 17 00:00:00 2001 From: Ping Xie Date: Tue, 13 Feb 2024 12:29:01 -0800 Subject: [PATCH 4/9] feat(vector store): Add usage demo and enhance Redis integration (#17) * feat(vector store): Add usage demo and enhance Redis integration - Add Jupyter notebook in `docs/` for using langchain vector store with Memorystore Redis. - Update `vector_store.py` for improved performance and reliability with Redis. - Modify `__init__.py` to reflect package structure changes and new functionalities. - Include `requirements.txt` and `setup.py` for easy installation and dependency management. - Add `state_of_the_union.txt` in `docs/` as a sample dataset for the notebook demo. * incorporate review feedback * added missing license header * excluded the test file from license check * reformatted source files * incorporated review feedback * fixed lint errors * fixed more lint errors * removed key_prefix argument from vectorstore * incorporated review feedback * fixed formatting errors * fixed a bad merge * suppress mypy errors for setuptools * trying mypy.ini * remove setup.py and requirements * add numpy dependency --- .github/header-checker-lint.yml | 4 +- docs/state_of_the_union.txt | 723 ++++++++++++++++++ docs/vector_store.ipynb | 431 +++++++++-- pyproject.toml | 1 + .../__init__.py | 10 +- .../chat_message_history.py | 4 +- .../vector_store.py | 185 +++-- 7 files changed, 1205 insertions(+), 153 deletions(-) create mode 100644 docs/state_of_the_union.txt diff --git a/.github/header-checker-lint.yml b/.github/header-checker-lint.yml index 6fe78aa..f259a98 100644 --- a/.github/header-checker-lint.yml +++ b/.github/header-checker-lint.yml @@ -1,6 +1,6 @@ {"allowedCopyrightHolders": ["Google LLC"], "allowedLicenses": ["Apache-2.0", "MIT", "BSD-3"], - "ignoreFiles": ["**/requirements.txt", "**/requirements-test.txt", "**/__init__.py", "samples/**/constraints.txt", "samples/**/constraints-test.txt"], + "ignoreFiles": ["**/requirements.txt", "**/requirements-test.txt", "**/__init__.py", "samples/**/constraints.txt", "samples/**/constraints-test.txt", "docs/state_of_the_union.txt"], "sourceFileExtensions": [ "ts", "js", @@ -12,4 +12,4 @@ "html", "txt" ] -} \ No newline at end of file +} diff --git a/docs/state_of_the_union.txt b/docs/state_of_the_union.txt new file mode 100644 index 0000000..b453aac --- /dev/null +++ b/docs/state_of_the_union.txt @@ -0,0 +1,723 @@ +Madam Speaker, Madam Vice President, our First Lady and Second Gentleman. Members of Congress and the Cabinet. Justices of the Supreme Court. My fellow Americans. + +Last year COVID-19 kept us apart. This year we are finally together again. + +Tonight, we meet as Democrats Republicans and Independents. But most importantly as Americans. + +With a duty to one another to the American people to the Constitution. + +And with an unwavering resolve that freedom will always triumph over tyranny. + +Six days ago, Russia’s Vladimir Putin sought to shake the foundations of the free world thinking he could make it bend to his menacing ways. But he badly miscalculated. + +He thought he could roll into Ukraine and the world would roll over. Instead he met a wall of strength he never imagined. + +He met the Ukrainian people. + +From President Zelenskyy to every Ukrainian, their fearlessness, their courage, their determination, inspires the world. + +Groups of citizens blocking tanks with their bodies. Everyone from students to retirees teachers turned soldiers defending their homeland. + +In this struggle as President Zelenskyy said in his speech to the European Parliament “Light will win over darkness.” The Ukrainian Ambassador to the United States is here tonight. + +Let each of us here tonight in this Chamber send an unmistakable signal to Ukraine and to the world. + +Please rise if you are able and show that, Yes, we the United States of America stand with the Ukrainian people. + +Throughout our history we’ve learned this lesson when dictators do not pay a price for their aggression they cause more chaos. + +They keep moving. + +And the costs and the threats to America and the world keep rising. + +That’s why the NATO Alliance was created to secure peace and stability in Europe after World War 2. + +The United States is a member along with 29 other nations. + +It matters. American diplomacy matters. American resolve matters. + +Putin’s latest attack on Ukraine was premeditated and unprovoked. + +He rejected repeated efforts at diplomacy. + +He thought the West and NATO wouldn’t respond. And he thought he could divide us at home. Putin was wrong. We were ready. Here is what we did. + +We prepared extensively and carefully. + +We spent months building a coalition of other freedom-loving nations from Europe and the Americas to Asia and Africa to confront Putin. + +I spent countless hours unifying our European allies. We shared with the world in advance what we knew Putin was planning and precisely how he would try to falsely justify his aggression. + +We countered Russia’s lies with truth. + +And now that he has acted the free world is holding him accountable. + +Along with twenty-seven members of the European Union including France, Germany, Italy, as well as countries like the United Kingdom, Canada, Japan, Korea, Australia, New Zealand, and many others, even Switzerland. + +We are inflicting pain on Russia and supporting the people of Ukraine. Putin is now isolated from the world more than ever. + +Together with our allies –we are right now enforcing powerful economic sanctions. + +We are cutting off Russia’s largest banks from the international financial system. + +Preventing Russia’s central bank from defending the Russian Ruble making Putin’s $630 Billion “war fund” worthless. + +We are choking off Russia’s access to technology that will sap its economic strength and weaken its military for years to come. + +Tonight I say to the Russian oligarchs and corrupt leaders who have bilked billions of dollars off this violent regime no more. + +The U.S. Department of Justice is assembling a dedicated task force to go after the crimes of Russian oligarchs. + +We are joining with our European allies to find and seize your yachts your luxury apartments your private jets. We are coming for your ill-begotten gains. + +And tonight I am announcing that we will join our allies in closing off American air space to all Russian flights – further isolating Russia – and adding an additional squeeze –on their economy. The Ruble has lost 30% of its value. + +The Russian stock market has lost 40% of its value and trading remains suspended. Russia’s economy is reeling and Putin alone is to blame. + +Together with our allies we are providing support to the Ukrainians in their fight for freedom. Military assistance. Economic assistance. Humanitarian assistance. + +We are giving more than $1 Billion in direct assistance to Ukraine. + +And we will continue to aid the Ukrainian people as they defend their country and to help ease their suffering. + +Let me be clear, our forces are not engaged and will not engage in conflict with Russian forces in Ukraine. + +Our forces are not going to Europe to fight in Ukraine, but to defend our NATO Allies – in the event that Putin decides to keep moving west. + +For that purpose we’ve mobilized American ground forces, air squadrons, and ship deployments to protect NATO countries including Poland, Romania, Latvia, Lithuania, and Estonia. + +As I have made crystal clear the United States and our Allies will defend every inch of territory of NATO countries with the full force of our collective power. + +And we remain clear-eyed. The Ukrainians are fighting back with pure courage. But the next few days weeks, months, will be hard on them. + +Putin has unleashed violence and chaos. But while he may make gains on the battlefield – he will pay a continuing high price over the long run. + +And a proud Ukrainian people, who have known 30 years of independence, have repeatedly shown that they will not tolerate anyone who tries to take their country backwards. + +To all Americans, I will be honest with you, as I’ve always promised. A Russian dictator, invading a foreign country, has costs around the world. + +And I’m taking robust action to make sure the pain of our sanctions is targeted at Russia’s economy. And I will use every tool at our disposal to protect American businesses and consumers. + +Tonight, I can announce that the United States has worked with 30 other countries to release 60 Million barrels of oil from reserves around the world. + +America will lead that effort, releasing 30 Million barrels from our own Strategic Petroleum Reserve. And we stand ready to do more if necessary, unified with our allies. + +These steps will help blunt gas prices here at home. And I know the news about what’s happening can seem alarming. + +But I want you to know that we are going to be okay. + +When the history of this era is written Putin’s war on Ukraine will have left Russia weaker and the rest of the world stronger. + +While it shouldn’t have taken something so terrible for people around the world to see what’s at stake now everyone sees it clearly. + +We see the unity among leaders of nations and a more unified Europe a more unified West. And we see unity among the people who are gathering in cities in large crowds around the world even in Russia to demonstrate their support for Ukraine. + +In the battle between democracy and autocracy, democracies are rising to the moment, and the world is clearly choosing the side of peace and security. + +This is a real test. It’s going to take time. So let us continue to draw inspiration from the iron will of the Ukrainian people. + +To our fellow Ukrainian Americans who forge a deep bond that connects our two nations we stand with you. + +Putin may circle Kyiv with tanks, but he will never gain the hearts and souls of the Ukrainian people. + +He will never extinguish their love of freedom. He will never weaken the resolve of the free world. + +We meet tonight in an America that has lived through two of the hardest years this nation has ever faced. + +The pandemic has been punishing. + +And so many families are living paycheck to paycheck, struggling to keep up with the rising cost of food, gas, housing, and so much more. + +I understand. + +I remember when my Dad had to leave our home in Scranton, Pennsylvania to find work. I grew up in a family where if the price of food went up, you felt it. + +That’s why one of the first things I did as President was fight to pass the American Rescue Plan. + +Because people were hurting. We needed to act, and we did. + +Few pieces of legislation have done more in a critical moment in our history to lift us out of crisis. + +It fueled our efforts to vaccinate the nation and combat COVID-19. It delivered immediate economic relief for tens of millions of Americans. + +Helped put food on their table, keep a roof over their heads, and cut the cost of health insurance. + +And as my Dad used to say, it gave people a little breathing room. + +And unlike the $2 Trillion tax cut passed in the previous administration that benefitted the top 1% of Americans, the American Rescue Plan helped working people—and left no one behind. + +And it worked. It created jobs. Lots of jobs. + +In fact—our economy created over 6.5 Million new jobs just last year, more jobs created in one year +than ever before in the history of America. + +Our economy grew at a rate of 5.7% last year, the strongest growth in nearly 40 years, the first step in bringing fundamental change to an economy that hasn’t worked for the working people of this nation for too long. + +For the past 40 years we were told that if we gave tax breaks to those at the very top, the benefits would trickle down to everyone else. + +But that trickle-down theory led to weaker economic growth, lower wages, bigger deficits, and the widest gap between those at the top and everyone else in nearly a century. + +Vice President Harris and I ran for office with a new economic vision for America. + +Invest in America. Educate Americans. Grow the workforce. Build the economy from the bottom up +and the middle out, not from the top down. + +Because we know that when the middle class grows, the poor have a ladder up and the wealthy do very well. + +America used to have the best roads, bridges, and airports on Earth. + +Now our infrastructure is ranked 13th in the world. + +We won’t be able to compete for the jobs of the 21st Century if we don’t fix that. + +That’s why it was so important to pass the Bipartisan Infrastructure Law—the most sweeping investment to rebuild America in history. + +This was a bipartisan effort, and I want to thank the members of both parties who worked to make it happen. + +We’re done talking about infrastructure weeks. + +We’re going to have an infrastructure decade. + +It is going to transform America and put us on a path to win the economic competition of the 21st Century that we face with the rest of the world—particularly with China. + +As I’ve told Xi Jinping, it is never a good bet to bet against the American people. + +We’ll create good jobs for millions of Americans, modernizing roads, airports, ports, and waterways all across America. + +And we’ll do it all to withstand the devastating effects of the climate crisis and promote environmental justice. + +We’ll build a national network of 500,000 electric vehicle charging stations, begin to replace poisonous lead pipes—so every child—and every American—has clean water to drink at home and at school, provide affordable high-speed internet for every American—urban, suburban, rural, and tribal communities. + +4,000 projects have already been announced. + +And tonight, I’m announcing that this year we will start fixing over 65,000 miles of highway and 1,500 bridges in disrepair. + +When we use taxpayer dollars to rebuild America – we are going to Buy American: buy American products to support American jobs. + +The federal government spends about $600 Billion a year to keep the country safe and secure. + +There’s been a law on the books for almost a century +to make sure taxpayers’ dollars support American jobs and businesses. + +Every Administration says they’ll do it, but we are actually doing it. + +We will buy American to make sure everything from the deck of an aircraft carrier to the steel on highway guardrails are made in America. + +But to compete for the best jobs of the future, we also need to level the playing field with China and other competitors. + +That’s why it is so important to pass the Bipartisan Innovation Act sitting in Congress that will make record investments in emerging technologies and American manufacturing. + +Let me give you one example of why it’s so important to pass it. + +If you travel 20 miles east of Columbus, Ohio, you’ll find 1,000 empty acres of land. + +It won’t look like much, but if you stop and look closely, you’ll see a “Field of dreams,” the ground on which America’s future will be built. + +This is where Intel, the American company that helped build Silicon Valley, is going to build its $20 billion semiconductor “mega site”. + +Up to eight state-of-the-art factories in one place. 10,000 new good-paying jobs. + +Some of the most sophisticated manufacturing in the world to make computer chips the size of a fingertip that power the world and our everyday lives. + +Smartphones. The Internet. Technology we have yet to invent. + +But that’s just the beginning. + +Intel’s CEO, Pat Gelsinger, who is here tonight, told me they are ready to increase their investment from +$20 billion to $100 billion. + +That would be one of the biggest investments in manufacturing in American history. + +And all they’re waiting for is for you to pass this bill. + +So let’s not wait any longer. Send it to my desk. I’ll sign it. + +And we will really take off. + +And Intel is not alone. + +There’s something happening in America. + +Just look around and you’ll see an amazing story. + +The rebirth of the pride that comes from stamping products “Made In America.” The revitalization of American manufacturing. + +Companies are choosing to build new factories here, when just a few years ago, they would have built them overseas. + +That’s what is happening. Ford is investing $11 billion to build electric vehicles, creating 11,000 jobs across the country. + +GM is making the largest investment in its history—$7 billion to build electric vehicles, creating 4,000 jobs in Michigan. + +All told, we created 369,000 new manufacturing jobs in America just last year. + +Powered by people I’ve met like JoJo Burgess, from generations of union steelworkers from Pittsburgh, who’s here with us tonight. + +As Ohio Senator Sherrod Brown says, “It’s time to bury the label “Rust Belt.” + +It’s time. + +But with all the bright spots in our economy, record job growth and higher wages, too many families are struggling to keep up with the bills. + +Inflation is robbing them of the gains they might otherwise feel. + +I get it. That’s why my top priority is getting prices under control. + +Look, our economy roared back faster than most predicted, but the pandemic meant that businesses had a hard time hiring enough workers to keep up production in their factories. + +The pandemic also disrupted global supply chains. + +When factories close, it takes longer to make goods and get them from the warehouse to the store, and prices go up. + +Look at cars. + +Last year, there weren’t enough semiconductors to make all the cars that people wanted to buy. + +And guess what, prices of automobiles went up. + +So—we have a choice. + +One way to fight inflation is to drive down wages and make Americans poorer. + +I have a better plan to fight inflation. + +Lower your costs, not your wages. + +Make more cars and semiconductors in America. + +More infrastructure and innovation in America. + +More goods moving faster and cheaper in America. + +More jobs where you can earn a good living in America. + +And instead of relying on foreign supply chains, let’s make it in America. + +Economists call it “increasing the productive capacity of our economy.” + +I call it building a better America. + +My plan to fight inflation will lower your costs and lower the deficit. + +17 Nobel laureates in economics say my plan will ease long-term inflationary pressures. Top business leaders and most Americans support my plan. And here’s the plan: + +First – cut the cost of prescription drugs. Just look at insulin. One in ten Americans has diabetes. In Virginia, I met a 13-year-old boy named Joshua Davis. + +He and his Dad both have Type 1 diabetes, which means they need insulin every day. Insulin costs about $10 a vial to make. + +But drug companies charge families like Joshua and his Dad up to 30 times more. I spoke with Joshua’s mom. + +Imagine what it’s like to look at your child who needs insulin and have no idea how you’re going to pay for it. + +What it does to your dignity, your ability to look your child in the eye, to be the parent you expect to be. + +Joshua is here with us tonight. Yesterday was his birthday. Happy birthday, buddy. + +For Joshua, and for the 200,000 other young people with Type 1 diabetes, let’s cap the cost of insulin at $35 a month so everyone can afford it. + +Drug companies will still do very well. And while we’re at it let Medicare negotiate lower prices for prescription drugs, like the VA already does. + +Look, the American Rescue Plan is helping millions of families on Affordable Care Act plans save $2,400 a year on their health care premiums. Let’s close the coverage gap and make those savings permanent. + +Second – cut energy costs for families an average of $500 a year by combatting climate change. + +Let’s provide investments and tax credits to weatherize your homes and businesses to be energy efficient and you get a tax credit; double America’s clean energy production in solar, wind, and so much more; lower the price of electric vehicles, saving you another $80 a month because you’ll never have to pay at the gas pump again. + +Third – cut the cost of child care. Many families pay up to $14,000 a year for child care per child. + +Middle-class and working families shouldn’t have to pay more than 7% of their income for care of young children. + +My plan will cut the cost in half for most families and help parents, including millions of women, who left the workforce during the pandemic because they couldn’t afford child care, to be able to get back to work. + +My plan doesn’t stop there. It also includes home and long-term care. More affordable housing. And Pre-K for every 3- and 4-year-old. + +All of these will lower costs. + +And under my plan, nobody earning less than $400,000 a year will pay an additional penny in new taxes. Nobody. + +The one thing all Americans agree on is that the tax system is not fair. We have to fix it. + +I’m not looking to punish anyone. But let’s make sure corporations and the wealthiest Americans start paying their fair share. + +Just last year, 55 Fortune 500 corporations earned $40 billion in profits and paid zero dollars in federal income tax. + +That’s simply not fair. That’s why I’ve proposed a 15% minimum tax rate for corporations. + +We got more than 130 countries to agree on a global minimum tax rate so companies can’t get out of paying their taxes at home by shipping jobs and factories overseas. + +That’s why I’ve proposed closing loopholes so the very wealthy don’t pay a lower tax rate than a teacher or a firefighter. + +So that’s my plan. It will grow the economy and lower costs for families. + +So what are we waiting for? Let’s get this done. And while you’re at it, confirm my nominees to the Federal Reserve, which plays a critical role in fighting inflation. + +My plan will not only lower costs to give families a fair shot, it will lower the deficit. + +The previous Administration not only ballooned the deficit with tax cuts for the very wealthy and corporations, it undermined the watchdogs whose job was to keep pandemic relief funds from being wasted. + +But in my administration, the watchdogs have been welcomed back. + +We’re going after the criminals who stole billions in relief money meant for small businesses and millions of Americans. + +And tonight, I’m announcing that the Justice Department will name a chief prosecutor for pandemic fraud. + +By the end of this year, the deficit will be down to less than half what it was before I took office. + +The only president ever to cut the deficit by more than one trillion dollars in a single year. + +Lowering your costs also means demanding more competition. + +I’m a capitalist, but capitalism without competition isn’t capitalism. + +It’s exploitation—and it drives up prices. + +When corporations don’t have to compete, their profits go up, your prices go up, and small businesses and family farmers and ranchers go under. + +We see it happening with ocean carriers moving goods in and out of America. + +During the pandemic, these foreign-owned companies raised prices by as much as 1,000% and made record profits. + +Tonight, I’m announcing a crackdown on these companies overcharging American businesses and consumers. + +And as Wall Street firms take over more nursing homes, quality in those homes has gone down and costs have gone up. + +That ends on my watch. + +Medicare is going to set higher standards for nursing homes and make sure your loved ones get the care they deserve and expect. + +We’ll also cut costs and keep the economy going strong by giving workers a fair shot, provide more training and apprenticeships, hire them based on their skills not degrees. + +Let’s pass the Paycheck Fairness Act and paid leave. + +Raise the minimum wage to $15 an hour and extend the Child Tax Credit, so no one has to raise a family in poverty. + +Let’s increase Pell Grants and increase our historic support of HBCUs, and invest in what Jill—our First Lady who teaches full-time—calls America’s best-kept secret: community colleges. + +And let’s pass the PRO Act when a majority of workers want to form a union—they shouldn’t be stopped. + +When we invest in our workers, when we build the economy from the bottom up and the middle out together, we can do something we haven’t done in a long time: build a better America. + +For more than two years, COVID-19 has impacted every decision in our lives and the life of the nation. + +And I know you’re tired, frustrated, and exhausted. + +But I also know this. + +Because of the progress we’ve made, because of your resilience and the tools we have, tonight I can say +we are moving forward safely, back to more normal routines. + +We’ve reached a new moment in the fight against COVID-19, with severe cases down to a level not seen since last July. + +Just a few days ago, the Centers for Disease Control and Prevention—the CDC—issued new mask guidelines. + +Under these new guidelines, most Americans in most of the country can now be mask free. + +And based on the projections, more of the country will reach that point across the next couple of weeks. + +Thanks to the progress we have made this past year, COVID-19 need no longer control our lives. + +I know some are talking about “living with COVID-19”. Tonight – I say that we will never just accept living with COVID-19. + +We will continue to combat the virus as we do other diseases. And because this is a virus that mutates and spreads, we will stay on guard. + +Here are four common sense steps as we move forward safely. + +First, stay protected with vaccines and treatments. We know how incredibly effective vaccines are. If you’re vaccinated and boosted you have the highest degree of protection. + +We will never give up on vaccinating more Americans. Now, I know parents with kids under 5 are eager to see a vaccine authorized for their children. + +The scientists are working hard to get that done and we’ll be ready with plenty of vaccines when they do. + +We’re also ready with anti-viral treatments. If you get COVID-19, the Pfizer pill reduces your chances of ending up in the hospital by 90%. + +We’ve ordered more of these pills than anyone in the world. And Pfizer is working overtime to get us 1 Million pills this month and more than double that next month. + +And we’re launching the “Test to Treat” initiative so people can get tested at a pharmacy, and if they’re positive, receive antiviral pills on the spot at no cost. + +If you’re immunocompromised or have some other vulnerability, we have treatments and free high-quality masks. + +We’re leaving no one behind or ignoring anyone’s needs as we move forward. + +And on testing, we have made hundreds of millions of tests available for you to order for free. + +Even if you already ordered free tests tonight, I am announcing that you can order more from covidtests.gov starting next week. + +Second – we must prepare for new variants. Over the past year, we’ve gotten much better at detecting new variants. + +If necessary, we’ll be able to deploy new vaccines within 100 days instead of many more months or years. + +And, if Congress provides the funds we need, we’ll have new stockpiles of tests, masks, and pills ready if needed. + +I cannot promise a new variant won’t come. But I can promise you we’ll do everything within our power to be ready if it does. + +Third – we can end the shutdown of schools and businesses. We have the tools we need. + +It’s time for Americans to get back to work and fill our great downtowns again. People working from home can feel safe to begin to return to the office. + +We’re doing that here in the federal government. The vast majority of federal workers will once again work in person. + +Our schools are open. Let’s keep it that way. Our kids need to be in school. + +And with 75% of adult Americans fully vaccinated and hospitalizations down by 77%, most Americans can remove their masks, return to work, stay in the classroom, and move forward safely. + +We achieved this because we provided free vaccines, treatments, tests, and masks. + +Of course, continuing this costs money. + +I will soon send Congress a request. + +The vast majority of Americans have used these tools and may want to again, so I expect Congress to pass it quickly. + +Fourth, we will continue vaccinating the world. + +We’ve sent 475 Million vaccine doses to 112 countries, more than any other nation. + +And we won’t stop. + +We have lost so much to COVID-19. Time with one another. And worst of all, so much loss of life. + +Let’s use this moment to reset. Let’s stop looking at COVID-19 as a partisan dividing line and see it for what it is: A God-awful disease. + +Let’s stop seeing each other as enemies, and start seeing each other for who we really are: Fellow Americans. + +We can’t change how divided we’ve been. But we can change how we move forward—on COVID-19 and other issues we must face together. + +I recently visited the New York City Police Department days after the funerals of Officer Wilbert Mora and his partner, Officer Jason Rivera. + +They were responding to a 9-1-1 call when a man shot and killed them with a stolen gun. + +Officer Mora was 27 years old. + +Officer Rivera was 22. + +Both Dominican Americans who’d grown up on the same streets they later chose to patrol as police officers. + +I spoke with their families and told them that we are forever in debt for their sacrifice, and we will carry on their mission to restore the trust and safety every community deserves. + +I’ve worked on these issues a long time. + +I know what works: Investing in crime prevention and community police officers who’ll walk the beat, who’ll know the neighborhood, and who can restore trust and safety. + +So let’s not abandon our streets. Or choose between safety and equal justice. + +Let’s come together to protect our communities, restore trust, and hold law enforcement accountable. + +That’s why the Justice Department required body cameras, banned chokeholds, and restricted no-knock warrants for its officers. + +That’s why the American Rescue Plan provided $350 Billion that cities, states, and counties can use to hire more police and invest in proven strategies like community violence interruption—trusted messengers breaking the cycle of violence and trauma and giving young people hope. + +We should all agree: The answer is not to Defund the police. The answer is to FUND the police with the resources and training they need to protect our communities. + +I ask Democrats and Republicans alike: Pass my budget and keep our neighborhoods safe. + +And I will keep doing everything in my power to crack down on gun trafficking and ghost guns you can buy online and make at home—they have no serial numbers and can’t be traced. + +And I ask Congress to pass proven measures to reduce gun violence. Pass universal background checks. Why should anyone on a terrorist list be able to purchase a weapon? + +Ban assault weapons and high-capacity magazines. + +Repeal the liability shield that makes gun manufacturers the only industry in America that can’t be sued. + +These laws don’t infringe on the Second Amendment. They save lives. + +The most fundamental right in America is the right to vote – and to have it counted. And it’s under assault. + +In state after state, new laws have been passed, not only to suppress the vote, but to subvert entire elections. + +We cannot let this happen. + +Tonight. I call on the Senate to: Pass the Freedom to Vote Act. Pass the John Lewis Voting Rights Act. And while you’re at it, pass the Disclose Act so Americans can know who is funding our elections. + +Tonight, I’d like to honor someone who has dedicated his life to serve this country: Justice Stephen Breyer—an Army veteran, Constitutional scholar, and retiring Justice of the United States Supreme Court. Justice Breyer, thank you for your service. + +One of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court. + +And I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nation’s top legal minds, who will continue Justice Breyer’s legacy of excellence. + +A former top litigator in private practice. A former federal public defender. And from a family of public school educators and police officers. A consensus builder. Since she’s been nominated, she’s received a broad range of support—from the Fraternal Order of Police to former judges appointed by Democrats and Republicans. + +And if we are to advance liberty and justice, we need to secure the Border and fix the immigration system. + +We can do both. At our border, we’ve installed new technology like cutting-edge scanners to better detect drug smuggling. + +We’ve set up joint patrols with Mexico and Guatemala to catch more human traffickers. + +We’re putting in place dedicated immigration judges so families fleeing persecution and violence can have their cases heard faster. + +We’re securing commitments and supporting partners in South and Central America to host more refugees and secure their own borders. + +We can do all this while keeping lit the torch of liberty that has led generations of immigrants to this land—my forefathers and so many of yours. + +Provide a pathway to citizenship for Dreamers, those on temporary status, farm workers, and essential workers. + +Revise our laws so businesses have the workers they need and families don’t wait decades to reunite. + +It’s not only the right thing to do—it’s the economically smart thing to do. + +That’s why immigration reform is supported by everyone from labor unions to religious leaders to the U.S. Chamber of Commerce. + +Let’s get it done once and for all. + +Advancing liberty and justice also requires protecting the rights of women. + +The constitutional right affirmed in Roe v. Wade—standing precedent for half a century—is under attack as never before. + +If we want to go forward—not backward—we must protect access to health care. Preserve a woman’s right to choose. And let’s continue to advance maternal health care in America. + +And for our LGBTQ+ Americans, let’s finally get the bipartisan Equality Act to my desk. The onslaught of state laws targeting transgender Americans and their families is wrong. + +As I said last year, especially to our younger transgender Americans, I will always have your back as your President, so you can be yourself and reach your God-given potential. + +While it often appears that we never agree, that isn’t true. I signed 80 bipartisan bills into law last year. From preventing government shutdowns to protecting Asian-Americans from still-too-common hate crimes to reforming military justice. + +And soon, we’ll strengthen the Violence Against Women Act that I first wrote three decades ago. It is important for us to show the nation that we can come together and do big things. + +So tonight I’m offering a Unity Agenda for the Nation. Four big things we can do together. + +First, beat the opioid epidemic. + +There is so much we can do. Increase funding for prevention, treatment, harm reduction, and recovery. + +Get rid of outdated rules that stop doctors from prescribing treatments. And stop the flow of illicit drugs by working with state and local law enforcement to go after traffickers. + +If you’re suffering from addiction, know you are not alone. I believe in recovery, and I celebrate the 23 million Americans in recovery. + +Second, let’s take on mental health. Especially among our children, whose lives and education have been turned upside down. + +The American Rescue Plan gave schools money to hire teachers and help students make up for lost learning. + +I urge every parent to make sure your school does just that. And we can all play a part—sign up to be a tutor or a mentor. + +Children were also struggling before the pandemic. Bullying, violence, trauma, and the harms of social media. + +As Frances Haugen, who is here with us tonight, has shown, we must hold social media platforms accountable for the national experiment they’re conducting on our children for profit. + +It’s time to strengthen privacy protections, ban targeted advertising to children, demand tech companies stop collecting personal data on our children. + +And let’s get all Americans the mental health services they need. More people they can turn to for help, and full parity between physical and mental health care. + +Third, support our veterans. + +Veterans are the best of us. + +I’ve always believed that we have a sacred obligation to equip all those we send to war and care for them and their families when they come home. + +My administration is providing assistance with job training and housing, and now helping lower-income veterans get VA care debt-free. + +Our troops in Iraq and Afghanistan faced many dangers. + +One was stationed at bases and breathing in toxic smoke from “burn pits” that incinerated wastes of war—medical and hazard material, jet fuel, and more. + +When they came home, many of the world’s fittest and best trained warriors were never the same. + +Headaches. Numbness. Dizziness. + +A cancer that would put them in a flag-draped coffin. + +I know. + +One of those soldiers was my son Major Beau Biden. + +We don’t know for sure if a burn pit was the cause of his brain cancer, or the diseases of so many of our troops. + +But I’m committed to finding out everything we can. + +Committed to military families like Danielle Robinson from Ohio. + +The widow of Sergeant First Class Heath Robinson. + +He was born a soldier. Army National Guard. Combat medic in Kosovo and Iraq. + +Stationed near Baghdad, just yards from burn pits the size of football fields. + +Heath’s widow Danielle is here with us tonight. They loved going to Ohio State football games. He loved building Legos with their daughter. + +But cancer from prolonged exposure to burn pits ravaged Heath’s lungs and body. + +Danielle says Heath was a fighter to the very end. + +He didn’t know how to stop fighting, and neither did she. + +Through her pain she found purpose to demand we do better. + +Tonight, Danielle—we are. + +The VA is pioneering new ways of linking toxic exposures to diseases, already helping more veterans get benefits. + +And tonight, I’m announcing we’re expanding eligibility to veterans suffering from nine respiratory cancers. + +I’m also calling on Congress: pass a law to make sure veterans devastated by toxic exposures in Iraq and Afghanistan finally get the benefits and comprehensive health care they deserve. + +And fourth, let’s end cancer as we know it. + +This is personal to me and Jill, to Kamala, and to so many of you. + +Cancer is the #2 cause of death in America–second only to heart disease. + +Last month, I announced our plan to supercharge +the Cancer Moonshot that President Obama asked me to lead six years ago. + +Our goal is to cut the cancer death rate by at least 50% over the next 25 years, turn more cancers from death sentences into treatable diseases. + +More support for patients and families. + +To get there, I call on Congress to fund ARPA-H, the Advanced Research Projects Agency for Health. + +It’s based on DARPA—the Defense Department project that led to the Internet, GPS, and so much more. + +ARPA-H will have a singular purpose—to drive breakthroughs in cancer, Alzheimer’s, diabetes, and more. + +A unity agenda for the nation. + +We can do this. + +My fellow Americans—tonight , we have gathered in a sacred space—the citadel of our democracy. + +In this Capitol, generation after generation, Americans have debated great questions amid great strife, and have done great things. + +We have fought for freedom, expanded liberty, defeated totalitarianism and terror. + +And built the strongest, freest, and most prosperous nation the world has ever known. + +Now is the hour. + +Our moment of responsibility. + +Our test of resolve and conscience, of history itself. + +It is in this moment that our character is formed. Our purpose is found. Our future is forged. + +Well I know this nation. + +We will meet the test. + +To protect freedom and liberty, to expand fairness and opportunity. + +We will save democracy. + +As hard as these times have been, I am more optimistic about America today than I have been my whole life. + +Because I see the future that is within our grasp. + +Because I know there is simply nothing beyond our capacity. + +We are the only nation on Earth that has always turned every crisis we have faced into an opportunity. + +The only nation that can be defined by a single word: possibilities. + +So on this night, in our 245th year as a nation, I have come to report on the State of the Union. + +And my report is this: the State of the Union is strong—because you, the American people, are strong. + +We are stronger today than we were a year ago. + +And we will be stronger a year from now than we are today. + +Now is our moment to meet and overcome the challenges of our time. + +And we will, as one people. + +One America. + +The United States of America. + +May God bless you all. May God protect our troops. \ No newline at end of file diff --git a/docs/vector_store.ipynb b/docs/vector_store.ipynb index 8b1a4cf..20b3173 100644 --- a/docs/vector_store.ipynb +++ b/docs/vector_store.ipynb @@ -1,79 +1,356 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Google DATABASE\n", - "\n", - "[Google DATABASE](https://cloud.google.com/DATABASE).\n", - "\n", - "Save chat messages into `DATABASE`." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Pre-reqs" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "%pip install PACKAGE_NAME" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "from PACKAGE import LOADER" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Basic Usage" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.6" - } - }, - "nbformat": 4, - "nbformat_minor": 4 + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Google Database\n", + "\n", + "Use [Google Memorystore for Redis](https://cloud.google.com/memorystore/docs/redis/memorystore-for-redis-overview) as a vector store for LangChain." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Pre-reqs" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Setting Up a Memorystore for Redis Instance\n", + "\n", + "Before proceeding, an active Memorystore for Redis instance is needed to store vectors:\n", + "\n", + "* Create a Memorystore for Reids Instance (v7.2): If an instance doesn't exist, follow the instructions at https://cloud.google.com/memorystore/docs/redis/create-instance-console to create a new one. Ensure version 7.2 is selected.\n", + "* Obtain Endpoint: Note the endpoint associated with the instance." + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Installing the LangChain Memorystore for Redis Module\n", + "\n", + "Interaction with the Memorystore for Redis instance from LangChain requires installing the necessary module:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# Install Memorystore for Redis for LangChain module\n", + "%pip install langchainlangchain_google_memorystore_redis" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Basic Usage" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Initialize a Vector Index" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "import redis\n", + "from langchain_google_memorystore_redis import (\n", + " DistanceStrategy,\n", + " HNSWConfig,\n", + " RedisVectorStore,\n", + ")\n", + "\n", + "# Connect to a Memorystore for Redis instance\n", + "redis_client = redis.from_url(\"redis://127.0.0.1:6379\")\n", + "\n", + "# Configure HNSW index with descriptive parameters\n", + "index_config = HNSWConfig(\n", + " name=\"my_vector_index\", distance_strategy=DistanceStrategy.COSINE, vector_size=128\n", + ")\n", + "\n", + "# Initialize/create the vector store index\n", + "RedisVectorStore.init_index(client=redis_client, index_config=index_config)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Prepare Documents\n", + "\n", + "Text needs processing and numerical representation before interacting with a vector store. This involves:\n", + "\n", + "* Loading Text: The TextLoader obtains text data from a file (e.g., \"state_of_the_union.txt\").\n", + "* Text Splitting: The CharacterTextSplitter breaks the text into smaller chunks for embedding models." + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.text_splitter import CharacterTextSplitter\n", + "from langchain_community.document_loaders import TextLoader\n", + "\n", + "loader = TextLoader(\"./state_of_the_union.txt\")\n", + "documents = loader.load()\n", + "text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)\n", + "docs = text_splitter.split_documents(documents)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Add Documents to the Vector Store\n", + "\n", + "After text preparation and embedding generation, the following methods insert them into the Redis vector store." + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Method 1: Classmethod for Direct Insertion\n", + "\n", + "This approach combines embedding creation and insertion into a single step using the from_documents classmethod:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from langchain_community.embeddings.fake import FakeEmbeddings\n", + "\n", + "embeddings = FakeEmbeddings(size=128)\n", + "redis_client = redis.from_url(\"redis://127.0.0.1:6379\")\n", + "rvs = RedisVectorStore.from_documents(\n", + " docs, embedding=embeddings, client=redis_client, index_name=\"my_vector_index\"\n", + ")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Method 2: Instance-Based Insertion\n", + "This approach offers flexibility when working with a new or existing RedisVectorStore:\n", + "\n", + "* [Optional] Create a RedisVectorStore Instance: Instantiate a RedisVectorStore object for customization. If you already have an instance, proceed to the next step.\n", + "* Add Text with Metadata: Provide raw text and metadata to the instance. Embedding generation and insertion into the vector store are handled automatically." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "rvs = RedisVectorStore(\n", + " client=redis_client, index_name=\"my_vector_index\", embedding_service=embeddings\n", + ")\n", + "ids = rvs.add_texts(\n", + " texts=[d.page_content for d in docs], metadatas=[d.metadata for d in docs]\n", + ")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Perform a Similarity Search (KNN)\n", + "\n", + "With the vector store populated, it's possible to search for text semantically similar to a query. Here's how to use KNN (K-Nearest Neighbors) with default settings:\n", + "\n", + "* Formulate the Query: A natural language question expresses the search intent (e.g., \"What did the president say about Ketanji Brown Jackson\").\n", + "* Retrieve Similar Results: The `similarity_search` method finds items in the vector store closest to the query in meaning." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import pprint\n", + "\n", + "query = \"What did the president say about Ketanji Brown Jackson\"\n", + "knn_results = rvs.similarity_search(query=query)\n", + "pprint.pprint(knn_results)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Perform a Range-Based Similarity Search\n", + "\n", + "Range queries provide more control by specifying a desired similarity threshold along with the query text:\n", + "\n", + "* Formulate the Query: A natural language question defines the search intent.\n", + "* Set Similarity Threshold: The distance_threshold parameter determines how close a match must be considered relevant.\n", + "* Retrieve Results: The `similarity_search_with_score` method finds items from the vector store that fall within the specified similarity threshold." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "rq_results = rvs.similarity_search_with_score(query=query, distance_threshold=0.8)\n", + "pprint.pprint(rq_results)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Perform a Maximal Marginal Relevance (MMR) Search\n", + "\n", + "MMR queries aim to find results that are both relevant to the query and diverse from each other, reducing redundancy in search results.\n", + "\n", + "* Formulate the Query: A natural language question defines the search intent.\n", + "* Balance Relevance and Diversity: The lambda_mult parameter controls the trade-off between strict relevance and promoting variety in the results.\n", + "* Retrieve MMR Results: The `max_marginal_relevance_search` method returns items that optimize the combination of relevance and diversity based on the lambda setting." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "mmr_results = rvs.max_marginal_relevance_search(query=query, lambda_mult=0.90)\n", + "pprint.pprint(mmr_results)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Use the Vector Store as a Retriever\n", + "\n", + "For seamless integration with other LangChain components, a vector store can be converted into a Retriever. This offers several advantages:\n", + "\n", + "* LangChain Compatibility: Many LangChain tools and methods are designed to directly interact with retrievers.\n", + "* Ease of Use: The `as_retriever()` method converts the vector store into a format that simplifies querying." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "retriever = rvs.as_retriever()\n", + "results = retriever.invoke(query)\n", + "pprint.pprint(results)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Clean up" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Delete Documents from the Vector Store\n", + "\n", + "Occasionally, it's necessary to remove documents (and their associated vectors) from the vector store. The `delete` method provides this functionality." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "rvs.delete(ids)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Delete a Vector Index\n", + "\n", + "There might be circumstances where the deletion of an existing vector index is necessary. Common reasons include:\n", + "\n", + "* Index Configuration Changes: If index parameters need modification, it's often required to delete and recreate the index.\n", + "* Storage Management: Removing unused indices can help free up space within the Redis instance.\n", + "\n", + "Caution: Vector index deletion is an irreversible operation. Be certain that the stored vectors and search functionality are no longer required before proceeding." + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [], + "source": [ + "# Delete the vector index\n", + "RedisVectorStore.drop_index(client=redis_client, index_name=\"my_vector_index\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.6" + } + }, + "nbformat": 4, + "nbformat_minor": 4 } diff --git a/pyproject.toml b/pyproject.toml index 2017123..d28e669 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -8,6 +8,7 @@ requires-python = ">=3.8" dependencies = [ "langchain==0.1.1", "redis>=5.0.0", + "numpy>=1.21.0", ] [project.urls] diff --git a/src/langchain_google_memorystore_redis/__init__.py b/src/langchain_google_memorystore_redis/__init__.py index 830a5f1..dd49815 100644 --- a/src/langchain_google_memorystore_redis/__init__.py +++ b/src/langchain_google_memorystore_redis/__init__.py @@ -12,6 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. from .chat_message_history import MemorystoreChatMessageHistory -from .vector_store import FLATConfig, HNSWConfig, RedisVectorStore +from .vector_store import DistanceStrategy, FLATConfig, HNSWConfig, RedisVectorStore -__all__ = ["MemorystoreChatMessageHistory"] +__all__ = [ + "MemorystoreChatMessageHistory", + "DistanceStrategy", + "FLATConfig", + "HNSWConfig", + "RedisVectorStore", +] diff --git a/src/langchain_google_memorystore_redis/chat_message_history.py b/src/langchain_google_memorystore_redis/chat_message_history.py index b2c80cf..e04ae79 100644 --- a/src/langchain_google_memorystore_redis/chat_message_history.py +++ b/src/langchain_google_memorystore_redis/chat_message_history.py @@ -50,10 +50,10 @@ def messages(self) -> List[BaseMessage]: # type: ignore all_elements = self._redis.lrange(self._key, 0, -1) assert isinstance(all_elements, list) - messages = messages_from_dict( + loaded_messages = messages_from_dict( [json.loads(self._encoder.decode(e)) for e in all_elements] ) - return messages + return loaded_messages def add_message(self, message: BaseMessage) -> None: """Append one message to this session.""" diff --git a/src/langchain_google_memorystore_redis/vector_store.py b/src/langchain_google_memorystore_redis/vector_store.py index 468b08e..f244051 100644 --- a/src/langchain_google_memorystore_redis/vector_store.py +++ b/src/langchain_google_memorystore_redis/vector_store.py @@ -15,6 +15,7 @@ import json import logging import operator +import pprint import re import uuid from abc import ABC @@ -37,12 +38,17 @@ # Setting up a basic logger logger = logging.getLogger(__name__) -logger.setLevel(logging.INFO) +logger.setLevel(logging.WARNING) handler = logging.StreamHandler() formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s") handler.setFormatter(formatter) logger.addHandler(handler) +DEFAULT_CONTENT_FIELD = "page_content" +DEFAULT_VECTOR_FIELD = "vector" +DEFAULT_DATA_TYPE = "float32" +DEFAULT_DISTANCE_STRATEGY = DistanceStrategy.COSINE + class IndexConfig(ABC): """ @@ -72,7 +78,7 @@ def __init__( data_type (str, optional): Defines the data type of the elements within the vector being indexed, such as "FLOAT32" for 32-bit floating-point numbers. This parameter is crucial for ensuring that the index - accommodates the vector data appropriately. Defaults to "FLOAT32". + accommodates the vector data appropriately. """ self.name = name @@ -94,7 +100,7 @@ def __init__( type: str, distance_strategy: DistanceStrategy, vector_size: int, - data_type: str, + data_type: str = DEFAULT_DATA_TYPE, ): """ Initializes the VectorIndexConfig object. @@ -112,9 +118,9 @@ def __init__( influencing how search results are ranked and returned. vector_size (int): The dimensionality of the vectors that will be stored and indexed. All vectors must conform to this specified size. - data_type (str, optional): The data type of the vector elements (e.g., "float32"). + data_type (str, optional): The data type of the vector elements (e.g., "FLOAT32"). This specifies the precision and format of the vector data, affecting storage - requirements and possibly search performance. Defaults to "float32". + requirements and possibly search performance. """ if distance_strategy not in self.SUPPORTED_DISTANCE_STRATEGIES: supported_strategies = ", ".join( @@ -149,9 +155,9 @@ class HNSWConfig(VectorIndexConfig): def __init__( self, name: str, - field_name: str, - vector_size: int, - distance_strategy: DistanceStrategy = DistanceStrategy.COSINE, + field_name=None, + vector_size: int = 128, + distance_strategy: DistanceStrategy = DEFAULT_DISTANCE_STRATEGY, initial_cap: int = 10000, m: int = 16, ef_construction: int = 200, @@ -170,7 +176,7 @@ def __init__( accommodate. All vectors must match this specified size. distance_strategy (DistanceStrategy): The metric used for calculating distances or similarities between vectors, influencing how search results - are ranked. Defaults to `DistanceStrategy.COSINE`. + are ranked. initial_cap (int): Specifies the initial capacity of the index in terms of the number of vectors it can hold, impacting the initial memory allocation. Defaults to 10000. @@ -184,6 +190,8 @@ def __init__( queries, balancing between search speed and accuracy. Defaults to 10. """ + if field_name is None: + field_name = DEFAULT_VECTOR_FIELD super().__init__( name, field_name, "HNSW", distance_strategy, vector_size, "FLOAT32" ) @@ -201,9 +209,9 @@ class FLATConfig(VectorIndexConfig): def __init__( self, name: str, - field_name: str, - vector_size: int, - distance_strategy: DistanceStrategy = DistanceStrategy.COSINE, + field_name=None, + vector_size: int = 128, + distance_strategy: DistanceStrategy = DEFAULT_DISTANCE_STRATEGY, ): """ Initializes the FLATConfig object. @@ -218,25 +226,21 @@ def __init__( indexed. All vectors added to this index must conform to this size. distance_strategy (DistanceStrategy, optional): Determines the metric used to calculate the distance or similarity between vectors during - search operations. Defaults to `DistanceStrategy.COSINE`, which - measures the cosine similarity between vectors. + search operations. """ + if field_name is None: + field_name = DEFAULT_VECTOR_FIELD super().__init__( name, field_name, "FLAT", distance_strategy, vector_size, "FLOAT32" ) class RedisVectorStore(VectorStore): - DEFAULT_CONTENT_FIELD = "page_content" - DEFAULT_VECTOR_FIELD = "vector" - DEFAULT_DATA_TYPE = "float32" - def __init__( self, client: redis.Redis, index_name: str, embedding_service: Embeddings, - key_prefix: Optional[str] = None, content_field: str = DEFAULT_CONTENT_FIELD, vector_field: str = DEFAULT_VECTOR_FIELD, ): @@ -254,10 +258,6 @@ def __init__( capable of generating vector embeddings from document content. This service is utilized to convert text documents into vector representations for storage and search. - key_prefix (Optional[str], optional): An optional prefix for Redis HASH keys - that are to be included in the vector index. This allows for selective - indexing of documents based on their keys. If None, all HASH keys in the - Redis database are considered for indexing. Defaults to None. content_field (str, optional): The field within the Redis HASH where document content is stored. This field is read to obtain document text for embedding during indexing operations. Defaults to 'page_content', which @@ -267,14 +267,33 @@ def __init__( when adding new documents to the store and when retrieving or searching documents based on their vector embeddings. Defaults to 'vector'. """ - self.client = client + if client == None: + raise ValueError( + "A Redis 'client' must be provided to initialize RedisVectorStore" + ) + + if index_name == None: + raise ValueError( + "A 'index_name' must be provided to initialize RedisVectorStore" + ) + + if embedding_service == None: + raise ValueError( + "An 'embedding_service' must be provided to initialize RedisVectorStore" + ) + + self._client = client self.index_name = index_name self.embedding_service = embedding_service - self.key_prefix = key_prefix + ":" if key_prefix is not None else "" + self.key_prefix = self.get_key_prefix(index_name) self.content_field = content_field self.vector_field = vector_field + self.encoding = client.get_encoder().encoding + + @staticmethod + def get_key_prefix(index_name: str, key_prefix: Optional[str] = None): + return key_prefix if key_prefix is not None else index_name - # Helper function to check if a string is JSON parsable @staticmethod def _is_json_parsable(s: str) -> bool: try: @@ -284,21 +303,16 @@ def _is_json_parsable(s: str) -> bool: return False @staticmethod - def init_index( - client: redis.Redis, index_config: IndexConfig, key_prefix: Optional[str] = None - ): + def init_index(client: redis.Redis, index_config: IndexConfig): """ Initializes a named VectorStore index in Redis with specified configurations. """ if not isinstance(index_config, HNSWConfig): raise ValueError("index_config must be an instance of HNSWConfig") - # Use the index name if no key_prefix is provided - key_prefix = key_prefix + ":" if key_prefix is not None else index_config.name - # Preparing the command string to avoid long lines command = ( - f"FT.CREATE {index_config.name} ON HASH PREFIX 1 {key_prefix} " + f"FT.CREATE {index_config.name} ON HASH PREFIX 1 {RedisVectorStore.get_key_prefix(index_config.name)} " f"SCHEMA {index_config.field_name} VECTOR {index_config.type} " f"6 TYPE {index_config.data_type} DIM {index_config.vector_size} " f"DISTANCE_METRIC {index_config.distance_metric}" @@ -318,7 +332,7 @@ def init_index( # for FT.INFO in the client library. @staticmethod - def drop_index(client: redis.Redis, index_name: str, index_only: bool = False): + def drop_index(client: redis.Redis, index_name: str, index_only: bool = True): """ Drops an index from the Redis database. Optionally, it can also delete the documents associated with the index. @@ -330,7 +344,7 @@ def drop_index(client: redis.Redis, index_name: str, index_only: bool = False): match the name of the existing index in the Redis database. index_only (bool, optional): A flag indicating whether to drop only the index structure (True) or to also delete the documents associated with the index (False). - Defaults to False, implying that both the index and its documents will be deleted. + Defaults to True, implying that only the index will be deleted. Raises: redis.RedisError: If any Redis-specific error occurs during the operation. This @@ -338,9 +352,10 @@ def drop_index(client: redis.Redis, index_name: str, index_only: bool = False): the command to drop the index. Callers should handle these exceptions to manage error scenarios gracefully. """ - command = ( - f"FT.DROPINDEX {index_name} {'KEEPDOCS' if index_only else ''}".strip() - ) + if index_only == False: + raise ValueError("Not supported") + + command = f"FT.DROPINDEX {index_name} {'DD' if not index_only else ''}".strip() client.execute_command(command) def add_texts( @@ -388,7 +403,7 @@ def add_texts( embeddings = self.embedding_service.embed_documents(list(texts)) ids = [] - pipeline = self.client.pipeline(transaction=False) + pipeline = self._client.pipeline(transaction=False) for i, bundle in enumerate( zip_longest(keys_or_ids, texts, embeddings, metadatas), start=1 ): @@ -399,7 +414,7 @@ def add_texts( mapping = { self.content_field: text, self.vector_field: np.array(embedding) - .astype(self.DEFAULT_DATA_TYPE) + .astype(DEFAULT_DATA_TYPE) .tobytes(), } @@ -417,12 +432,14 @@ def add_texts( ids.append(key) # Ensure to execute any remaining commands in the pipeline after the loop - if i % batch_size != 0: + if i % batch_size == 0: pipeline.execute() # Final execution to catch any remaining items in the pipeline pipeline.execute() + logger.info(f"{len(ids)} documents ingested into Redis.") + return ids @classmethod @@ -431,6 +448,8 @@ def from_texts( texts: List[str], embedding: Embeddings, metadatas: Optional[List[dict]] = None, + client=None, + index_name=None, **kwargs: Any, ) -> "RedisVectorStore": """ @@ -457,28 +476,22 @@ def from_texts( that the method cannot proceed without a connection to a Redis database. """ - if "client" not in kwargs: + if "client" == None: raise ValueError( "A 'client' must be provided to initialize RedisVectorStore" ) - if "index_name" not in kwargs: + if "index_name" == None: raise ValueError( "A 'index_name' must be provided to initialize RedisVectorStore" ) - kwargs_copy = kwargs.copy() - - # Extract 'client' and remove it from kwargs to prevent passing it twice - client = kwargs_copy.pop("client") - index_name = kwargs_copy.pop("index_name") - # Initialize RedisVectorStore instance instance = cls( client, index_name, embedding, - **kwargs_copy, + **kwargs, ) # Add texts and their corresponding metadata to the instance @@ -489,41 +502,67 @@ def from_texts( def delete(self, ids: Optional[List[str]] = None, **kwargs: Any) -> Optional[bool]: if not ids: # Check if ids list is empty or None logger.info("No IDs provided for deletion.") - return None # Or False, depending on intended behavior when ids is empty or None + return False try: - self.client.delete(*ids) + self._client.delete(*ids) logger.info("Entries deleted.") return True - except Exception as e: # It's better to catch specific exceptions + except Exception as e: logger.error(f"Failed to delete entries: {e}") return False def _similarity_search_by_vector_with_score_and_embeddings( self, query_embedding: List[float], k: int = 4, **kwargs: Any ) -> List[Tuple[Document, float, List[float]]]: + """ + Performs a similarity search by a vector with score and embeddings, offering + various customization options via keyword arguments. + + Args: + query_embedding (List[float]): A list of floats representing the embedding + vector of the query for similarity search. + k (int, optional): The number of nearest neighbors to retrieve. Defaults to 4. + **kwargs (Any): Additional keyword arguments allowing for customization of + the search operation. Key options include: + - 'distance_threshold' (float, optional): A threshold value for filtering + results based on their distance or score. If not specified directly, + it may use 'score_threshold' if provided. + - 'distance_strategy' (str, optional): Strategy to apply when comparing + distances or scores. Uses a default strategy if not specified. + + Returns: + List[Tuple[Document, float, List[float]]]: A list of tuples, each containing + a Document object, its distance (score) from the query embedding, and its own + embedding vector. The Document object includes content and metadata. + + Note: + - The function dynamically adjusts its behavior based on the presence and values + of keyword arguments. For instance, if a 'distance_threshold' is provided, + only results meeting this threshold are returned. + """ + distance_threshold = kwargs.get( "distance_threshold", kwargs.get("score_threshold") ) - query_k = k - if distance_threshold is not None: - distance_strategy = kwargs.get("distance_strategy", DistanceStrategy.COSINE) - query_k *= 4 # Quadruple k if a distance threshold is specified + distance_strategy = kwargs.get("distance_strategy", DEFAULT_DISTANCE_STRATEGY) query_args = [ "FT.SEARCH", self.index_name, - f"*=>[KNN {query_k} @{self.vector_field} $query_vector AS distance]", + f"*=>[KNN {k} @{self.vector_field} $query_vector AS distance]", "PARAMS", 2, "query_vector", - np.array([query_embedding]).astype(self.DEFAULT_DATA_TYPE).tobytes(), + np.array([query_embedding]).astype(DEFAULT_DATA_TYPE).tobytes(), "DIALECT", 2, ] - initial_results = self.client.execute_command(*query_args) + initial_results = self._client.execute_command(*query_args) + + logger.info(f"{int((len(initial_results)-1)/2)} documents returned by Redis") # Process the results final_results: List[Tuple[Document, float, List[float]]] = [] @@ -537,23 +576,21 @@ def _similarity_search_by_vector_with_score_and_embeddings( distance = 0.0 embedding: List[float] = [] for j in range(0, len(initial_results[i]), 2): - key = initial_results[i][j].decode() + key = initial_results[i][j].decode(self.encoding) value = initial_results[i][j + 1] if key == self.content_field: - page_content = value.decode() + page_content = value.decode(self.encoding) elif key == self.vector_field: - embedding = np.frombuffer( - value, dtype=self.DEFAULT_DATA_TYPE - ).tolist() + embedding = np.frombuffer(value, dtype=DEFAULT_DATA_TYPE).tolist() elif key == "distance": - distance = float(value.decode()) + distance = float(value.decode(self.encoding)) else: if isinstance(value, bytes) and self._is_json_parsable( - value.decode() + value.decode(self.encoding) ): - metadata[key] = json.loads(value.decode()) + metadata[key] = json.loads(value.decode(self.encoding)) else: - metadata[key] = value.decode() + metadata[key] = value.decode(self.encoding) final_results.append( ( @@ -574,7 +611,15 @@ def _similarity_search_by_vector_with_score_and_embeddings( for doc, distance, embedding in final_results if cmp(distance, distance_threshold) ] - return final_results[:k] + + # Directly sort final_results based on distance, applying the determined sort order + final_results = sorted( + final_results, + key=lambda d: d[1], + reverse=distance_strategy == DistanceStrategy.MAX_INNER_PRODUCT, + ) + + return final_results def _similarity_search_by_vector_with_score( self, query_embedding: List[float], k: int = 4, **kwargs: Any From 3e5ced31214a8c10e211b399753a4676e76aa0e0 Mon Sep 17 00:00:00 2001 From: Averi Kitsch Date: Wed, 14 Feb 2024 09:18:10 -0800 Subject: [PATCH 5/9] Update integration.cloudbuild.yaml --- integration.cloudbuild.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/integration.cloudbuild.yaml b/integration.cloudbuild.yaml index 3edce93..f4bcfd9 100644 --- a/integration.cloudbuild.yaml +++ b/integration.cloudbuild.yaml @@ -27,4 +27,4 @@ steps: options: pool: - name: 'projects/$PROJECT_ID/locations/$LOCATION/workerPools/redis' + name: 'projects/$PROJECT_ID/locations/us-central1/workerPools/redis' From 5bd6b26220b1b55f52f514327de15a4982c85b35 Mon Sep 17 00:00:00 2001 From: Craig Chi Date: Wed, 14 Feb 2024 13:31:11 -0800 Subject: [PATCH 6/9] feat: add MemorystoreDocumentSaver class (#18) * feat: add MemorystoreDocumentSaver class * fix: use variable in the worker pool of the Cloud Build integration * fix: update doc saver for json encoding * fix: align key_prefix behavior * fix: add back key_prefix if keys_or_ids is given * fix: fix indent of the comments * fix: get encoder directly from the Redis client * fix: add pipeline support for Redis client * fix: reorder prefix and content_field in constructor * fix: insert prefix to all given document IDs * fix: fix tests prefix and chat encoding --- integration.cloudbuild.yaml | 2 +- .../__init__.py | 2 + .../chat_message_history.py | 4 +- .../doc_saver.py | 113 +++++++++++++++++ .../vector_store.py | 9 +- tests/test_doc_saver.py | 118 ++++++++++++++++++ 6 files changed, 242 insertions(+), 6 deletions(-) create mode 100644 src/langchain_google_memorystore_redis/doc_saver.py create mode 100644 tests/test_doc_saver.py diff --git a/integration.cloudbuild.yaml b/integration.cloudbuild.yaml index f4bcfd9..62d8d22 100644 --- a/integration.cloudbuild.yaml +++ b/integration.cloudbuild.yaml @@ -27,4 +27,4 @@ steps: options: pool: - name: 'projects/$PROJECT_ID/locations/us-central1/workerPools/redis' + name: '$_WORKER_POOL' diff --git a/src/langchain_google_memorystore_redis/__init__.py b/src/langchain_google_memorystore_redis/__init__.py index dd49815..94cb66f 100644 --- a/src/langchain_google_memorystore_redis/__init__.py +++ b/src/langchain_google_memorystore_redis/__init__.py @@ -12,10 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. from .chat_message_history import MemorystoreChatMessageHistory +from .doc_saver import MemorystoreDocumentSaver from .vector_store import DistanceStrategy, FLATConfig, HNSWConfig, RedisVectorStore __all__ = [ "MemorystoreChatMessageHistory", + "MemorystoreDocumentSaver", "DistanceStrategy", "FLATConfig", "HNSWConfig", diff --git a/src/langchain_google_memorystore_redis/chat_message_history.py b/src/langchain_google_memorystore_redis/chat_message_history.py index e04ae79..90c32fe 100644 --- a/src/langchain_google_memorystore_redis/chat_message_history.py +++ b/src/langchain_google_memorystore_redis/chat_message_history.py @@ -42,7 +42,7 @@ def __init__( self._redis = client self._key = session_id self._ttl = ttl - self._encoder = client.connection_pool.get_encoder() + self._encoding = client.get_encoder().encoding @property def messages(self) -> List[BaseMessage]: # type: ignore @@ -51,7 +51,7 @@ def messages(self) -> List[BaseMessage]: # type: ignore assert isinstance(all_elements, list) loaded_messages = messages_from_dict( - [json.loads(self._encoder.decode(e)) for e in all_elements] + [json.loads(e.decode(self._encoding)) for e in all_elements] ) return loaded_messages diff --git a/src/langchain_google_memorystore_redis/doc_saver.py b/src/langchain_google_memorystore_redis/doc_saver.py new file mode 100644 index 0000000..9b8c31c --- /dev/null +++ b/src/langchain_google_memorystore_redis/doc_saver.py @@ -0,0 +1,113 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import json +import uuid +from typing import Optional, Sequence, Set, Union + +import redis +from langchain_core.documents.base import Document + + +class MemorystoreDocumentSaver: + """Document Saver for Cloud Memorystore for Redis database.""" + + def __init__( + self, + client: redis.Redis, + key_prefix: str, + content_field: str, + metadata_fields: Optional[Set[str]] = None, + ): + """Initializes the Document Saver for Memorystore for Redis. + + Args: + client: A redis.Redis client object. + key_prefix: A prefix for the keys to store Documents in Redis. + content_field: The field of the hash that Redis uses to store the + page_content of the Document. + metadata_fields: The metadata fields of the Document that will be + stored in the Redis. If None, Redis stores all metadata fields. + """ + + self._redis = client + if not key_prefix: + raise ValueError("key_prefix must not be empty") + self._key_prefix = key_prefix + self._content_field = content_field + self._metadata_fields = metadata_fields + + def add_documents( + self, + documents: Sequence[Document], + ids: Optional[Sequence[str]] = None, + batch_size: int = 1000, + ) -> None: + """Save a list of Documents to Redis. + + Args: + documents: A List of Documents. + ids: The list of suffixes for keys that Redis uses to store the + Documents. If specified, the length of the IDs must be the same + as Documents. If not specified, random UUIDs appended after + prefix are used to store each Document. + batch_size: The number of documents to process in a single batch + operation. This parameter helps manage memory and performance + when adding a large number of documents. Defaults to 1000. + """ + if ids and len(documents) != len(ids): + raise ValueError("The length of documents must match the length of the IDs") + if batch_size <= 0: + raise ValueError("batch_size must be greater than 0") + + doc_ids = ids if ids else [str(uuid.uuid4()) for _ in documents] + doc_ids = [self._key_prefix + doc_id for doc_id in doc_ids] + + pipeline = self._redis.pipeline(transaction=False) + for i, doc in enumerate(documents): + mapping = self._filter_metadata_by_fields(doc.metadata) + mapping.update({self._content_field: doc.page_content}) + + # Remove existing key in Redis to avoid reusing the doc ID. + pipeline.delete(doc_ids[i]) + pipeline.hset(doc_ids[i], mapping=mapping) + if (i + 1) % batch_size == 0 or i == len(documents) - 1: + pipeline.execute() + + def _filter_metadata_by_fields(self, metadata: Optional[dict]) -> dict: + """Filter metadata fields to be stored in Redis. + + Args: + metadata: The metadata field of a Document object. + + Returns: + dict: A subset dict of the metadata that only contains the fields + specified in the initialization of the saver. The value of each + metadata key is serialized by JSON if it is a dict. + """ + if not metadata: + return {} + filtered_fields = ( + self._metadata_fields & metadata.keys() + if self._metadata_fields + else metadata.keys() + ) + filtered_metadata = { + k: self._jsonify_if_dict(metadata[k]) for k in filtered_fields + } + return filtered_metadata + + @staticmethod + def _jsonify_if_dict(s: Union[str, dict]) -> str: + return s if isinstance(s, str) else json.dumps(s) diff --git a/src/langchain_google_memorystore_redis/vector_store.py b/src/langchain_google_memorystore_redis/vector_store.py index f244051..0fc9ac9 100644 --- a/src/langchain_google_memorystore_redis/vector_store.py +++ b/src/langchain_google_memorystore_redis/vector_store.py @@ -393,10 +393,13 @@ def add_texts( """ # Generate or extend keys/IDs for the documents keys_or_ids = kwargs.get("keys", kwargs.get("ids", [])) + if keys_or_ids and len(keys_or_ids) != len(list(texts)): + raise ValueError( + "The length of keys or ids must match the length of the texts" + ) + if not keys_or_ids: + keys_or_ids = [str(uuid.uuid4()) for _ in texts] # Ensure there's a unique ID for each text document - keys_or_ids = (keys_or_ids + [str(uuid.uuid4()) for _ in texts])[ - len(keys_or_ids) : - ] # Fallback for empty metadata metadatas = metadatas if metadatas is not None else [{} for _ in texts] # Generate embeddings for all documents diff --git a/tests/test_doc_saver.py b/tests/test_doc_saver.py new file mode 100644 index 0000000..0320a28 --- /dev/null +++ b/tests/test_doc_saver.py @@ -0,0 +1,118 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import json +import os + +import pytest +import redis +from langchain_core.documents.base import Document + +from langchain_google_memorystore_redis.doc_saver import MemorystoreDocumentSaver + + +@pytest.mark.parametrize( + "page_content,metadata,content_field,metadata_fields", + [ + ( + '"content1"', + {"key1": "doc1_value1", "key2": "doc1_value2"}, + "page_content", + None, + ), + ( + '"content2"', + {"key1": {'"nested_key"': {'"double_nested"': '"doc2_value1"'}}}, + "special_page_content", + None, + ), + ( + '"content3"', + {"key1": {"k": "not_in_filter"}, "key2": {'"key"': "in_filter"}}, + "page_content", + set(["key2"]), + ), + ], +) +def test_doc_saver_add_documents_one_doc( + page_content, metadata, content_field, metadata_fields +): + client = redis.from_url(get_env_var("REDIS_URL", "URL of the Redis instance")) + prefix = "prefix:" + + saver = MemorystoreDocumentSaver( + client=client, + key_prefix=prefix, + content_field=content_field, + metadata_fields=metadata_fields, + ) + + doc = Document.construct(page_content=page_content, metadata=metadata) + doc_id = "doc" + saver.add_documents([doc], [doc_id]) + + # Only verify the metadata keys given in the metadata_fields + metadata_to_verify = {} + for k, v in metadata.items(): + if not metadata_fields or k in metadata_fields: + metadata_to_verify[k] = v + + verify_stored_values( + client, + prefix + doc_id, + page_content, + content_field, + metadata_to_verify, + ) + + client.delete(prefix + doc_id) + + +def verify_stored_values( + client: redis.Redis, + key: str, + page_content: str, + content_field: str, + metadata_to_verify: dict, +): + stored_value = client.hgetall(key) + assert isinstance(stored_value, dict) + assert len(stored_value) == 1 + len(metadata_to_verify) + + for k, v in stored_value.items(): + decoded_value = v.decode() + if k == content_field.encode(): + assert page_content == decoded_value + else: + assert ( + metadata_to_verify[k.decode()] == json.loads(decoded_value) + if is_json_parsable(decoded_value) + else decoded_value + ) + + +def is_json_parsable(s: str) -> bool: + try: + json.loads(s) + return True + except ValueError: + return False + + +def get_env_var(key: str, desc: str) -> str: + v = os.environ.get(key) + if v is None: + raise ValueError(f"Must set env var {key} to: {desc}") + return v From aa887a0eff4a9793950c93182976cc85551b1486 Mon Sep 17 00:00:00 2001 From: Craig Chi Date: Thu, 15 Feb 2024 09:16:06 -0800 Subject: [PATCH 7/9] feat: add MemorystoreDocumentLoader class (#24) --- .../__init__.py | 2 + .../doc_loader.py | 99 ++++++++++++++ tests/test_doc_loader.py | 123 ++++++++++++++++++ 3 files changed, 224 insertions(+) create mode 100644 src/langchain_google_memorystore_redis/doc_loader.py create mode 100644 tests/test_doc_loader.py diff --git a/src/langchain_google_memorystore_redis/__init__.py b/src/langchain_google_memorystore_redis/__init__.py index 94cb66f..c85c5b3 100644 --- a/src/langchain_google_memorystore_redis/__init__.py +++ b/src/langchain_google_memorystore_redis/__init__.py @@ -12,11 +12,13 @@ # See the License for the specific language governing permissions and # limitations under the License. from .chat_message_history import MemorystoreChatMessageHistory +from .doc_loader import MemorystoreDocumentLoader from .doc_saver import MemorystoreDocumentSaver from .vector_store import DistanceStrategy, FLATConfig, HNSWConfig, RedisVectorStore __all__ = [ "MemorystoreChatMessageHistory", + "MemorystoreDocumentLoader", "MemorystoreDocumentSaver", "DistanceStrategy", "FLATConfig", diff --git a/src/langchain_google_memorystore_redis/doc_loader.py b/src/langchain_google_memorystore_redis/doc_loader.py new file mode 100644 index 0000000..6097978 --- /dev/null +++ b/src/langchain_google_memorystore_redis/doc_loader.py @@ -0,0 +1,99 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import json +from typing import Iterator, List, Optional, Sequence, Set, Union + +import redis +from langchain_community.document_loaders.base import BaseLoader +from langchain_core.documents.base import Document + + +class MemorystoreDocumentLoader(BaseLoader): + """Document Loader for Cloud Memorystore for Redis database.""" + + def __init__( + self, + client: redis.Redis, + key_prefix: str, + content_fields: Set[str], + metadata_fields: Optional[Set[str]] = None, + ): + """Initializes the Document Loader for Memorystore for Redis. + + Args: + client: A redis.Redis client object. + key_prefix: A prefix for the keys to store Documents in Redis. + content_fields: The set of fields of the hash that Redis uses to + store the page_content of the Document. If more than one field + are specified, a JSON encoded dict containing the fields as top + level keys will be filled in the page_content of the Documents. + metadata_fields: The metadata fields of the Document that will be + stored in the Redis. If None, Redis stores all metadata fields. + """ + + self._redis = client + self._content_fields = content_fields + self._metadata_fields = metadata_fields + if metadata_fields and len(content_fields & metadata_fields): + raise ValueError( + "Fields {} are specified in both content_fields and" + " metadata_fields.".format(content_fields & metadata_fields) + ) + self._key_prefix = key_prefix if key_prefix else "" + self._encoding = client.get_encoder().encoding + + def lazy_load(self) -> Iterator[Document]: + """Lazy load the Documents and yield them one by one.""" + for key in self._redis.scan_iter(match=f"{self._key_prefix}*", _type="HASH"): + doc = {} + stored_value = self._redis.hgetall(key) + if not isinstance(stored_value, dict): + raise RuntimeError(f"{key} returns unexpected {stored_value}") + decoded_value = { + k.decode(self._encoding): v.decode(self._encoding) + for k, v in stored_value.items() + } + + if len(self._content_fields) == 1: + doc["page_content"] = decoded_value[next(iter(self._content_fields))] + else: + doc["page_content"] = json.dumps( + {k: decoded_value[k] for k in self._content_fields} + ) + + filtered_fields = ( + self._metadata_fields if self._metadata_fields else decoded_value.keys() + ) + filtered_fields = filtered_fields - self._content_fields + doc["metadata"] = { + k: self._decode_if_json_parsable(decoded_value[k]) + for k in filtered_fields + } + + yield Document.construct(**doc) + + def load(self) -> List[Document]: + """Load all Documents at once.""" + return list(self.lazy_load()) + + @staticmethod + def _decode_if_json_parsable(s: str) -> Union[str, dict]: + """Decode a JSON string to a dict if it is JSON.""" + try: + decoded = json.loads(s) + return decoded + except ValueError: + pass + return s diff --git a/tests/test_doc_loader.py b/tests/test_doc_loader.py new file mode 100644 index 0000000..b0431a0 --- /dev/null +++ b/tests/test_doc_loader.py @@ -0,0 +1,123 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import json +import os + +import pytest +import redis +from langchain_core.documents.base import Document + +from langchain_google_memorystore_redis.doc_loader import MemorystoreDocumentLoader +from langchain_google_memorystore_redis.doc_saver import MemorystoreDocumentSaver + + +@pytest.mark.parametrize( + "page_content,metadata,content_field,metadata_fields", + [ + ( + '"content1"', + {"key1": "doc1_value1", "key2": "doc1_value2"}, + "page_content", + None, + ), + ( + '"content2"', + {"key1": {'"nested_key"': {'"double_nested"': '"doc2_value1"'}}}, + "special_page_content", + None, + ), + ( + '"content3"', + {"key1": {"k": "not_in_filter"}, "key2": {'"key"': "in_filter"}}, + "page_content", + set(["key2"]), + ), + ], +) +def test_doc_loader_one_doc(page_content, metadata, content_field, metadata_fields): + client = redis.from_url(get_env_var("REDIS_URL", "URL of the Redis instance")) + + prefix = "prefix:" + saver = MemorystoreDocumentSaver( + client=client, + key_prefix=prefix, + content_field=content_field, + metadata_fields=metadata_fields, + ) + doc = Document.construct(page_content=page_content, metadata=metadata) + doc_id = "saved_doc" + saver.add_documents([doc], [doc_id]) + + loader = MemorystoreDocumentLoader( + client=client, + key_prefix=prefix, + content_fields=set([content_field]), + metadata_fields=metadata_fields, + ) + loaded_docs = loader.load() + expected_doc = ( + doc + if not metadata_fields + else Document.construct( + page_content=page_content, + metadata={k: metadata[k] for k in metadata_fields}, + ) + ) + assert loaded_docs == [expected_doc] + client.delete(prefix + doc_id) + + +def test_doc_loader_multiple_docs(): + client = redis.from_url(get_env_var("REDIS_URL", "URL of the Redis instance")) + + prefix = "multidocs:" + # Clean up stored documents with the same prefix + for key in client.keys(f"{prefix}*"): + client.delete(key) + + content_field = "page_content" + saver = MemorystoreDocumentSaver( + client=client, + key_prefix=prefix, + content_field=content_field, + ) + docs = [] + for content in range(10): + docs.append( + Document.construct( + page_content=f"{content}", + metadata={"metadata": f"meta: {content}"}, + ) + ) + + saver.add_documents(docs) + + loader = MemorystoreDocumentLoader( + client=client, + key_prefix=prefix, + content_fields=set([content_field]), + ) + loaded_docs = [] + for doc in loader.lazy_load(): + loaded_docs.append(doc) + assert sorted(loaded_docs, key=lambda d: d.page_content) == docs + + +def get_env_var(key: str, desc: str) -> str: + v = os.environ.get(key) + if v is None: + raise ValueError(f"Must set env var {key} to: {desc}") + return v From c1cf6b602554d39f47d8fd988ff57f246598a186 Mon Sep 17 00:00:00 2001 From: Ping Xie Date: Thu, 15 Feb 2024 09:40:43 -0800 Subject: [PATCH 8/9] refactor(vector-store): enhance validation and initialization processes (#22) * feat(vector store): Add usage demo and enhance Redis integration - Add Jupyter notebook in `docs/` for using langchain vector store with Memorystore Redis. - Update `vector_store.py` for improved performance and reliability with Redis. - Modify `__init__.py` to reflect package structure changes and new functionalities. - Include `requirements.txt` and `setup.py` for easy installation and dependency management. - Add `state_of_the_union.txt` in `docs/` as a sample dataset for the notebook demo. * incorporate review feedback * added missing license header * excluded the test file from license check * reformatted source files * incorporated review feedback * fixed lint errors * fixed more lint errors * removed key_prefix argument from vectorstore * incorporated review feedback * fixed formatting errors * fixed a bad merge * suppress mypy errors for setuptools * trying mypy.ini * remove setup.py and requirements * add numpy dependency * refactor(vector-store): enhance validation and initialization processes - Add validation for unsupported data types and negative vector sizes in VectorIndexConfig. - Refactor field_name parameters in HNSWConfig and FLATConfig to support type hinting with optional strings. - Enhance RedisVectorStore initialization by enforcing type checks for client, index_name, and embedding_service. - Introduce an optional 'ids' parameter in methods to allow explicit document identifiers, improving document management. - Adjust method documentation and error messages for clarity and consistency. * Incorporated review feedback * fixed formatting * incorporated review feedback * formatting * fixed bugs * fix formatting * added vector store unittests * actualy added vector store tests * fix formatting * fix: fix styles and formats --------- Co-authored-by: Craig Chi --- docs/vector_store.ipynb | 2 +- .../__init__.py | 9 +- .../chat_message_history.py | 4 +- .../vector_store.py | 172 ++++++------ tests/test_vector_store.py | 253 ++++++++++++++++++ 5 files changed, 362 insertions(+), 78 deletions(-) create mode 100644 tests/test_vector_store.py diff --git a/docs/vector_store.ipynb b/docs/vector_store.ipynb index 20b3173..564e338 100644 --- a/docs/vector_store.ipynb +++ b/docs/vector_store.ipynb @@ -174,7 +174,7 @@ "outputs": [], "source": [ "rvs = RedisVectorStore(\n", - " client=redis_client, index_name=\"my_vector_index\", embedding_service=embeddings\n", + " client=redis_client, index_name=\"my_vector_index\", embeddings=embeddings\n", ")\n", "ids = rvs.add_texts(\n", " texts=[d.page_content for d in docs], metadatas=[d.metadata for d in docs]\n", diff --git a/src/langchain_google_memorystore_redis/__init__.py b/src/langchain_google_memorystore_redis/__init__.py index c85c5b3..0231dc7 100644 --- a/src/langchain_google_memorystore_redis/__init__.py +++ b/src/langchain_google_memorystore_redis/__init__.py @@ -14,13 +14,20 @@ from .chat_message_history import MemorystoreChatMessageHistory from .doc_loader import MemorystoreDocumentLoader from .doc_saver import MemorystoreDocumentSaver -from .vector_store import DistanceStrategy, FLATConfig, HNSWConfig, RedisVectorStore +from .vector_store import ( + DistanceStrategy, + FLATConfig, + HNSWConfig, + RedisVectorStore, + VectorIndexConfig, +) __all__ = [ "MemorystoreChatMessageHistory", "MemorystoreDocumentLoader", "MemorystoreDocumentSaver", "DistanceStrategy", + "VectorIndexConfig", "FLATConfig", "HNSWConfig", "RedisVectorStore", diff --git a/src/langchain_google_memorystore_redis/chat_message_history.py b/src/langchain_google_memorystore_redis/chat_message_history.py index 90c32fe..bad05a3 100644 --- a/src/langchain_google_memorystore_redis/chat_message_history.py +++ b/src/langchain_google_memorystore_redis/chat_message_history.py @@ -49,7 +49,9 @@ def messages(self) -> List[BaseMessage]: # type: ignore """Retrieve all messages chronologically stored in this session.""" all_elements = self._redis.lrange(self._key, 0, -1) - assert isinstance(all_elements, list) + if not isinstance(all_elements, list): + raise TypeError("Expected a list from `lrange` but got a different type.") + loaded_messages = messages_from_dict( [json.loads(e.decode(self._encoding)) for e in all_elements] ) diff --git a/src/langchain_google_memorystore_redis/vector_store.py b/src/langchain_google_memorystore_redis/vector_store.py index 0fc9ac9..e40cac0 100644 --- a/src/langchain_google_memorystore_redis/vector_store.py +++ b/src/langchain_google_memorystore_redis/vector_store.py @@ -15,11 +15,9 @@ import json import logging import operator -import pprint import re import uuid from abc import ABC -from enum import Enum, auto from itertools import zip_longest from typing import Any, Iterable, List, Optional, Tuple @@ -29,12 +27,9 @@ DistanceStrategy, maximal_marginal_relevance, ) -from langchain_core._api import deprecated -from langchain_core.callbacks import CallbackManagerForRetrieverRun from langchain_core.documents import Document from langchain_core.embeddings import Embeddings -from langchain_core.utils import get_from_dict_or_env -from langchain_core.vectorstores import VectorStore, VectorStoreRetriever +from langchain_core.vectorstores import VectorStore # Setting up a basic logger logger = logging.getLogger(__name__) @@ -46,7 +41,7 @@ DEFAULT_CONTENT_FIELD = "page_content" DEFAULT_VECTOR_FIELD = "vector" -DEFAULT_DATA_TYPE = "float32" +DEFAULT_DATA_TYPE = "FLOAT32" DEFAULT_DISTANCE_STRATEGY = DistanceStrategy.COSINE @@ -131,6 +126,12 @@ def __init__( f"Supported strategies are: {supported_strategies}." ) + if data_type.upper() != DEFAULT_DATA_TYPE: + raise ValueError(f"Unsupported data type: {data_type}") + + if vector_size < 0: + raise ValueError(f"Unsupported vector size: {vector_size}") + super().__init__(name, field_name, type) self.distance_strategy = distance_strategy self.vector_size = vector_size @@ -148,6 +149,12 @@ def distance_metric(self): class HNSWConfig(VectorIndexConfig): + DEFAULT_VECTOR_SIZE = 128 + DEFAULT_INITIAL_CAP = 10000 + DEFAULT_M = 16 + DEFAULT_EF_CONSTRUCTION = 200 + DEFAULT_EF_RUNTIME = 10 + """ Configuration class for HNSW (Hierarchical Navigable Small World) vector indexes. """ @@ -155,13 +162,13 @@ class HNSWConfig(VectorIndexConfig): def __init__( self, name: str, - field_name=None, - vector_size: int = 128, + field_name: Optional[str] = None, + vector_size: int = DEFAULT_VECTOR_SIZE, distance_strategy: DistanceStrategy = DEFAULT_DISTANCE_STRATEGY, - initial_cap: int = 10000, - m: int = 16, - ef_construction: int = 200, - ef_runtime: int = 10, + initial_cap: int = DEFAULT_INITIAL_CAP, + m: int = DEFAULT_M, + ef_construction: int = DEFAULT_EF_CONSTRUCTION, + ef_runtime: int = DEFAULT_EF_RUNTIME, ): """ Initializes the HNSWConfig object. @@ -179,21 +186,19 @@ def __init__( are ranked. initial_cap (int): Specifies the initial capacity of the index in terms of the number of vectors it can hold, impacting the initial memory allocation. - Defaults to 10000. m (int): Determines the maximum number of outgoing edges each node in the index graph can have, directly affecting the graph's connectivity and - search performance. Defaults to 16. + search performance. ef_construction (int): Controls the size of the dynamic candidate list during the construction of the index, influencing the index build time and quality. - Defaults to 200. ef_runtime (int): Sets the size of the dynamic candidate list during search - queries, balancing between search speed and accuracy. Defaults to 10. + queries, balancing between search speed and accuracy. """ if field_name is None: field_name = DEFAULT_VECTOR_FIELD super().__init__( - name, field_name, "HNSW", distance_strategy, vector_size, "FLOAT32" + name, field_name, "HNSW", distance_strategy, vector_size, DEFAULT_DATA_TYPE ) self.initial_cap = initial_cap self.m = m @@ -206,11 +211,13 @@ class FLATConfig(VectorIndexConfig): Configuration class for FLAT vector indexes, utilizing brute-force search. """ + DEFAULT_VECTOR_SIZE = 128 + def __init__( self, name: str, - field_name=None, - vector_size: int = 128, + field_name: Optional[str] = None, + vector_size: int = DEFAULT_VECTOR_SIZE, distance_strategy: DistanceStrategy = DEFAULT_DISTANCE_STRATEGY, ): """ @@ -231,7 +238,7 @@ def __init__( if field_name is None: field_name = DEFAULT_VECTOR_FIELD super().__init__( - name, field_name, "FLAT", distance_strategy, vector_size, "FLOAT32" + name, field_name, "FLAT", distance_strategy, vector_size, DEFAULT_DATA_TYPE ) @@ -240,7 +247,7 @@ def __init__( self, client: redis.Redis, index_name: str, - embedding_service: Embeddings, + embeddings: Embeddings, content_field: str = DEFAULT_CONTENT_FIELD, vector_field: str = DEFAULT_VECTOR_FIELD, ): @@ -254,7 +261,7 @@ def __init__( index_name (str): The name assigned to the vector index within Redis. This name is used to identify the index for operations such as searching and indexing. - embedding_service (Embeddings): An instance of an embedding service or model + embeddings (Embeddings): An instance of an embedding service or model capable of generating vector embeddings from document content. This service is utilized to convert text documents into vector representations for storage and search. @@ -267,24 +274,24 @@ def __init__( when adding new documents to the store and when retrieving or searching documents based on their vector embeddings. Defaults to 'vector'. """ - if client == None: + if not isinstance(client, redis.Redis): raise ValueError( "A Redis 'client' must be provided to initialize RedisVectorStore" ) - if index_name == None: + if not isinstance(index_name, str): raise ValueError( "A 'index_name' must be provided to initialize RedisVectorStore" ) - if embedding_service == None: + if not isinstance(embeddings, Embeddings): raise ValueError( - "An 'embedding_service' must be provided to initialize RedisVectorStore" + "An 'embeddings' must be provided to initialize RedisVectorStore" ) self._client = client self.index_name = index_name - self.embedding_service = embedding_service + self.embeddings_service = embeddings self.key_prefix = self.get_key_prefix(index_name) self.content_field = content_field self.vector_field = vector_field @@ -362,7 +369,8 @@ def add_texts( self, texts: Iterable[str], metadatas: Optional[List[dict]] = None, - batch_size: int = 1000, + ids: Optional[List[str]] = None, + batch_size: Optional[int] = 1000, **kwargs: Any, ) -> List[str]: """ @@ -375,49 +383,57 @@ def add_texts( where each dictionary corresponds to a text document in the same order as the `texts` iterable. Each metadata dictionary should contain key-value pairs representing the metadata attributes for the associated text document. + ids (Optional[List[str]], optional): An optional list of unique identifiers for each text document. + If not provided, the system will generate unique identifiers for each document. If provided, + the length of this list should match the length of `texts`. batch_size (int, optional): The number of documents to process in a single batch operation. This parameter helps manage memory and performance when adding a large number of documents. Defaults to 1000. - **kwargs (Any): Additional keyword arguments for extended functionality. This includes: - - 'keys' or 'ids' (List[str], optional): Custom identifiers for each document. If provided, - the length of this list should match the length of `texts`. If not provided, the system - will generate unique identifiers. Returns: List[str]: A list containing the unique keys or identifiers for each added document. These keys can be used to retrieve or reference the documents within the vector store. Note: - If both 'keys' (or 'ids') and 'metadatas' are provided, they must be of the same length as the - `texts` iterable to ensure each document is correctly associated with its metadata and identifier. + If both 'ids' and 'metadatas' are provided, they must be of the same length as the `texts` + iterable to ensure each document is correctly associated with its metadata and identifier. """ - # Generate or extend keys/IDs for the documents - keys_or_ids = kwargs.get("keys", kwargs.get("ids", [])) - if keys_or_ids and len(keys_or_ids) != len(list(texts)): - raise ValueError( - "The length of keys or ids must match the length of the texts" - ) - if not keys_or_ids: - keys_or_ids = [str(uuid.uuid4()) for _ in texts] - # Ensure there's a unique ID for each text document - # Fallback for empty metadata - metadatas = metadatas if metadatas is not None else [{} for _ in texts] + + # Generate ids if not provided + if ids is None: + ids = [str(uuid.uuid4()) for _ in texts] + + # Check if both ids and metadatas are provided and have the same length + if ids is not None: + if len(ids) != len(list(texts)): + raise ValueError("The length of 'ids' and 'texts' must be the same.") + + if not metadatas: + metadatas = [{} for _ in texts] + + if metadatas is not None: + if len(metadatas) != len(list(texts)): + raise ValueError( + "The length of 'metadatas' and 'texts' must be the same." + ) + + if not batch_size or batch_size <= 0: + raise ValueError("batch_size must be greater than 0.") + # Generate embeddings for all documents - embeddings = self.embedding_service.embed_documents(list(texts)) + embeddings = self.embeddings_service.embed_documents(list(texts)) - ids = [] + new_ids = [] pipeline = self._client.pipeline(transaction=False) - for i, bundle in enumerate( - zip_longest(keys_or_ids, texts, embeddings, metadatas), start=1 - ): - key, text, embedding, metadata = bundle - key = self.key_prefix + key + for i, bundle in enumerate(zip(ids, texts, embeddings, metadatas), start=1): + id, text, embedding, metadata = bundle + new_id = self.key_prefix + id # Initialize the mapping with content and vector fields mapping = { self.content_field: text, self.vector_field: np.array(embedding) - .astype(DEFAULT_DATA_TYPE) + .astype(DEFAULT_DATA_TYPE.lower()) .tobytes(), } @@ -431,8 +447,8 @@ def add_texts( mapping[meta_key] = str(meta_value) # Add the document to the Redis hash - pipeline.hset(key, mapping=mapping) - ids.append(key) + pipeline.hset(new_id, mapping=mapping) + new_ids.append(new_id) # Ensure to execute any remaining commands in the pipeline after the loop if i % batch_size == 0: @@ -443,7 +459,7 @@ def add_texts( logger.info(f"{len(ids)} documents ingested into Redis.") - return ids + return new_ids @classmethod def from_texts( @@ -451,8 +467,9 @@ def from_texts( texts: List[str], embedding: Embeddings, metadatas: Optional[List[dict]] = None, - client=None, - index_name=None, + ids: Optional[List[str]] = None, + client: Optional[redis.Redis] = None, + index_name: Optional[str] = None, **kwargs: Any, ) -> "RedisVectorStore": """ @@ -465,26 +482,29 @@ def from_texts( metadatas (Optional[List[dict]]): A list of dictionaries where each dictionary contains metadata corresponding to each text document in `texts`. If provided, the length of `metadatas` must match the length of `texts`. - **kwargs (Any): Additional keyword arguments that can include: - - 'client': A Redis client instance to be used by the RedisVectorStore. - - 'index_name': The name of the index to be created or used in Redis. - If not provided, a default name may be used. + ids (Optional[List[str]], optional): An optional list of unique identifiers for + each text document. If not provided, the system will generate unique identifiers + for each document. If provided, the length of this list should match the length + of `texts`. + client (redis.Redis): The Redis client instance to be used for database + operations, providing connectivity and command execution against the + Redis instance. + index_name (str): The name assigned to the vector index within Redis. This + name is used to identify the index for operations such as searching and + indexing. + **kwargs (Any): Additional keyword arguments Returns: RedisVectorStore: An instance of RedisVectorStore that has been populated with the embeddings of the provided texts, along with their associated metadata. - - Raises: - ValueError: If a Redis client instance is not provided in `kwargs`, indicating - that the method cannot proceed without a connection to a Redis database. """ - if "client" == None: + if not isinstance(client, redis.Redis): raise ValueError( "A 'client' must be provided to initialize RedisVectorStore" ) - if "index_name" == None: + if not isinstance(index_name, str): raise ValueError( "A 'index_name' must be provided to initialize RedisVectorStore" ) @@ -498,7 +518,7 @@ def from_texts( ) # Add texts and their corresponding metadata to the instance - instance.add_texts(texts, metadatas) + instance.add_texts(texts, metadatas, ids) return instance @@ -558,7 +578,7 @@ def _similarity_search_by_vector_with_score_and_embeddings( "PARAMS", 2, "query_vector", - np.array([query_embedding]).astype(DEFAULT_DATA_TYPE).tobytes(), + np.array([query_embedding]).astype(DEFAULT_DATA_TYPE.lower()).tobytes(), "DIALECT", 2, ] @@ -584,7 +604,9 @@ def _similarity_search_by_vector_with_score_and_embeddings( if key == self.content_field: page_content = value.decode(self.encoding) elif key == self.vector_field: - embedding = np.frombuffer(value, dtype=DEFAULT_DATA_TYPE).tolist() + embedding = np.frombuffer( + value, dtype=DEFAULT_DATA_TYPE.lower() + ).tolist() elif key == "distance": distance = float(value.decode(self.encoding)) else: @@ -652,7 +674,7 @@ def similarity_search_with_score( documents most relevant to the query according to the similarity scores. """ # Embed the query using the embedding function - query_embedding = self.embedding_service.embed_query(query) + query_embedding = self.embeddings_service.embed_query(query) return self._similarity_search_by_vector_with_score( query_embedding, k, **kwargs ) @@ -710,7 +732,7 @@ def similarity_search( the search backend. """ # Embed the query using the embedding function - query_embedding = self.embedding_service.embed_query(query) + query_embedding = self.embeddings_service.embed_query(query) return self.similarity_search_by_vector(query_embedding, k, **kwargs) def max_marginal_relevance_search( @@ -748,7 +770,7 @@ def max_marginal_relevance_search( raise ValueError("lambda_mult must be between 0 and 1.") # Embed the query using a hypothetical method to convert text to vector. - query_embedding = self.embedding_service.embed_query(query) + query_embedding = self.embeddings_service.embed_query(query) # Fetch initial documents based on query embedding. initial_results = self._similarity_search_by_vector_with_score_and_embeddings( diff --git a/tests/test_vector_store.py b/tests/test_vector_store.py new file mode 100644 index 0000000..3a84272 --- /dev/null +++ b/tests/test_vector_store.py @@ -0,0 +1,253 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import json +import os +import uuid + +import numpy +import pytest +import redis +from langchain_community.embeddings.fake import FakeEmbeddings +from langchain_core.documents.base import Document + +from langchain_google_memorystore_redis import ( + DistanceStrategy, + HNSWConfig, + RedisVectorStore, + VectorIndexConfig, +) + + +def test_vector_store_init_index(): + client = redis.from_url(get_env_var("REDIS_URL", "URL of the Redis instance")) + index_name = str(uuid.uuid4()) + + index_config = HNSWConfig( + name=index_name, distance_strategy=DistanceStrategy.COSINE, vector_size=128 + ) + + assert not check_index_exists(client, index_name, index_config) + RedisVectorStore.init_index(client=client, index_config=index_config) + assert check_index_exists(client, index_name, index_config) + RedisVectorStore.drop_index(client=client, index_name=index_name) + assert not check_index_exists(client, index_name, index_config) + client.flushall() + + +@pytest.mark.parametrize( + "texts,metadatas,ids", + [ + # Test case 1: Basic scenario with texts only + (["text1", "text2"], None, None), + # Test case 2: Texts with metadatas + (["text1", "text2"], [{"meta1": "data1"}, {"meta2": "data2"}], None), + # Test case 3: Texts with metadatas and ids + (["text1", "text2"], [{"meta1": "data1"}, {"meta2": "data2"}], ["id1", "id2"]), + # Test case 4: Texts with ids only + (["text1", "text2"], None, ["id1", "id2"]), + # Additional test cases can be added as needed + ], +) +def test_vector_store_add_texts(texts, metadatas, ids): + client = redis.from_url(get_env_var("REDIS_URL", "URL of the Redis instance")) + + # Initialize the vector index + index_name = str(uuid.uuid4()) + index_config = HNSWConfig( + name=index_name, distance_strategy=DistanceStrategy.COSINE, vector_size=128 + ) + RedisVectorStore.init_index(client=client, index_config=index_config) + + # Insert the documents + rvs = RedisVectorStore( + client=client, index_name=index_name, embeddings=FakeEmbeddings(size=128) + ) + returned_ids = rvs.add_texts(texts=texts, metadatas=metadatas, ids=ids) + + original_metadatas = metadatas if metadatas is not None else [None] * len(texts) + original_ids = ids if ids is not None else [""] * len(texts) + + # Validate the results + for original_id, text, original_metadata, returned_id in zip( + original_ids, texts, original_metadatas, returned_ids + ): + expected_id = f"{index_name}{original_id}" + # Check if original_id is empty and adjust assertion accordingly + if original_id == "": + assert returned_id.startswith( + expected_id + ), f"Returned ID {returned_id} does not start with expected prefix {expected_id}" + else: + assert ( + returned_id == expected_id + ), f"Returned ID {returned_id} does not match expected {expected_id}" + + # Fetch the record from Redis + hash_record = client.hgetall(returned_id) + + # Validate page_content + fetched_page_content = hash_record[b"page_content"].decode("utf-8") + assert fetched_page_content == text, "Page content does not match" + + # Validate vector embedding + vector = numpy.frombuffer(hash_record[b"vector"], dtype=numpy.float32) + assert ( + len(vector) == 128 + ), f"Decoded 'vector' length is {len(vector)}, expected 128" + + # Iterate over each key-value pair in the hash_record + fetched_metadata = {} + for key, value in hash_record.items(): + # Decode the key from bytes to string + key_decoded = key.decode("utf-8") + + # Skip 'page_content' and 'vector' keys, include all others in fetched_metadata + if key_decoded not in ["page_content", "vector"]: + # Decode the value from bytes to string or JSON as needed + try: + # Attempt to load JSON content if applicable + value_decoded = json.loads(value.decode("utf-8")) + except json.JSONDecodeError: + # Fallback to simple string decoding if it's not JSON + value_decoded = value.decode("utf-8") + + # Add the decoded key-value pair to fetched_metadata + fetched_metadata[key_decoded] = value_decoded + + if original_metadata is None: + original_metadata = {} + + assert fetched_metadata == original_metadata, "Metadata does not match" + + # Verify no extra keys are present + all_keys = [key.decode("utf-8") for key in client.keys(f"{index_name}*")] + # Currently RedisQuery stores the index schema as a key using the index_name + assert len(all_keys) == len(returned_ids) + 1, "Found unexpected keys in Redis" + + # Clena up + RedisVectorStore.drop_index(client=client, index_name=index_name) + client.flushall() + + +def test_vector_store_knn_query(): + texts = [ + "The quick brown fox jumps over the lazy dog", + "A clever fox outwitted the guard dog to sneak into the farmyard at night", + "Exploring the mysteries of deep space and black holes", + "Delicious recipes for homemade pasta and pizza", + "Advanced techniques in machine learning and artificial intelligence", + "Sustainable living: Tips for reducing your carbon footprint", + ] + + client = redis.from_url(get_env_var("REDIS_URL", "URL of the Redis instance")) + + # Initialize the vector index + index_name = str(uuid.uuid4()) + index_config = HNSWConfig( + name=index_name, distance_strategy=DistanceStrategy.COSINE, vector_size=128 + ) + RedisVectorStore.init_index(client=client, index_config=index_config) + + # Insert the documents + rvs = RedisVectorStore( + client=client, index_name=index_name, embeddings=FakeEmbeddings(size=128) + ) + rvs.add_texts(texts=texts) + + # Validate knn query + query_result = rvs.similarity_search(query="fox dog", k=2) + assert len(query_result) == 2, "Expected 2 documents to be returned" + + # Clean up + RedisVectorStore.drop_index(client=client, index_name=index_name) + client.flushall() + + +@pytest.mark.parametrize( + "distance_strategy, distance_threshold", + [ + (DistanceStrategy.COSINE, 0.8), + (DistanceStrategy.MAX_INNER_PRODUCT, 1.0), + (DistanceStrategy.EUCLIDEAN_DISTANCE, 2.0), + ], +) +def test_vector_store_range_query(distance_strategy, distance_threshold): + texts = [ + "The quick brown fox jumps over the lazy dog", + "A clever fox outwitted the guard dog to sneak into the farmyard at night", + "Exploring the mysteries of deep space and black holes", + "Delicious recipes for homemade pasta and pizza", + "Advanced techniques in machine learning and artificial intelligence", + "Sustainable living: Tips for reducing your carbon footprint", + ] + + client = redis.from_url(get_env_var("REDIS_URL", "URL of the Redis instance")) + + # Initialize the vector index + index_name = str(uuid.uuid4()) + index_config = HNSWConfig( + name=index_name, distance_strategy=distance_strategy, vector_size=128 + ) + RedisVectorStore.init_index(client=client, index_config=index_config) + + # Insert the documents + rvs = RedisVectorStore( + client=client, index_name=index_name, embeddings=FakeEmbeddings(size=128) + ) + rvs.add_texts(texts=texts) + + # Validate range query + query_result = rvs.similarity_search_with_score( + query="dog", + k=3, + distance_strategy=distance_strategy, + distance_threshold=distance_threshold, + ) + assert len(query_result) <= 3, "Expected less than 3 documents to be returned" + for _, score in query_result: + if distance_strategy == DistanceStrategy.MAX_INNER_PRODUCT: + assert ( + score > distance_threshold + ), f"Score {score} is not greater than {distance_threshold} for {distance_strategy}" + else: + assert ( + score < distance_threshold + ), f"Score {score} is not less than {distance_threshold} for {distance_strategy}" + + # Clean up + RedisVectorStore.drop_index(client=client, index_name=index_name) + client.flushall() + + +def check_index_exists( + client: redis.Redis, index_name: str, index_config: VectorIndexConfig +) -> bool: + try: + index_info = client.ft(index_name).info() + except: + return False + + return ( + index_info["index_name"] == index_name + and index_info["index_definition"][1] == b"HASH" + ) + + +def get_env_var(key: str, desc: str) -> str: + v = os.environ.get(key) + if v is None: + raise ValueError(f"Must set env var {key} to: {desc}") + return v From 7552220aaa099782026be578af2eac5e2fa37af7 Mon Sep 17 00:00:00 2001 From: Ping Xie Date: Fri, 16 Feb 2024 13:19:32 -0800 Subject: [PATCH 9/9] feat: Add placeholder for LangChain with Memorystore Redis integration - Introduce initial ipynb file as a scaffold for demonstrating the integration of LangChain with Memorystore for Redis. - The notebook is currently a placeholder, with detailed examples and guidance to be added. --- samples/langchain_quick_start.ipynb | 33 +++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) create mode 100644 samples/langchain_quick_start.ipynb diff --git a/samples/langchain_quick_start.ipynb b/samples/langchain_quick_start.ipynb new file mode 100644 index 0000000..b0e2c99 --- /dev/null +++ b/samples/langchain_quick_start.ipynb @@ -0,0 +1,33 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Copyright 2024 Google LLC\n", + "#\n", + "# Licensed under the Apache License, Version 2.0 (the \"License\");\n", + "# you may not use this file except in compliance with the License.\n", + "# You may obtain a copy of the License at\n", + "#\n", + "# https://www.apache.org/licenses/LICENSE-2.0\n", + "#\n", + "# Unless required by applicable law or agreed to in writing, software\n", + "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", + "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", + "# See the License for the specific language governing permissions and\n", + "# limitations under the License." + ] + } + ], + "metadata": { + "language_info": { + "name": "python" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +}