From 1185dca3686b8beec0ade1ec303eb6c829f0b2ea Mon Sep 17 00:00:00 2001 From: Averi Kitsch Date: Thu, 1 Feb 2024 11:13:33 -0800 Subject: [PATCH 1/9] feat: add Vectorstore --- src/langchain_google_cloud_sql_pg/cloudSQL.py | 717 ++++++++++++++++++ tests/test_cloudSQL.py | 183 +++++ 2 files changed, 900 insertions(+) create mode 100644 src/langchain_google_cloud_sql_pg/cloudSQL.py create mode 100644 tests/test_cloudSQL.py diff --git a/src/langchain_google_cloud_sql_pg/cloudSQL.py b/src/langchain_google_cloud_sql_pg/cloudSQL.py new file mode 100644 index 0000000..afd364d --- /dev/null +++ b/src/langchain_google_cloud_sql_pg/cloudSQL.py @@ -0,0 +1,717 @@ +import json +import uuid + +import asyncio +import asyncpg +import nest_asyncio + +from typing import Any, Dict, List, Optional, Tuple, Type +from pgvector.asyncpg import register_vector + +# import sqlalchemy +from sqlalchemy import text +from sqlalchemy.ext.asyncio import AsyncEngine, create_async_engine + +from google.cloud.sql.connector import Connector +import google.auth +import numpy as np +from google.auth.transport.requests import Request + +from langchain_core.documents import Document +from langchain_core.embeddings import Embeddings +from langchain_core.vectorstores import VectorStore +from langchain_community.vectorstores.utils import maximal_marginal_relevance + +import aiohttp + +nest_asyncio.apply() + +async def _get_IAM_user(credentials): + """Get user/service account name""" + request = google.auth.transport.requests.Request() + credentials.refresh(request) + + url = f"https://oauth2.googleapis.com/tokeninfo?access_token={credentials.token}" + async with aiohttp.ClientSession() as client: + response = await client.get(url) + response = await response.text() + response = json.loads(response) + email = response['email'] + if ".gserviceaccount.com" in email: + email = email.replace(".gserviceaccount.com","") + + return email + +class CloudSQLEngine: + """Creating a connection to the CloudSQL instance + To use, you need the following packages installed: + cloud-sql-python-connector[asyncpg] + """ + def __init__( + self, + project_id=None, + region=None, + instance=None, + database=None, + engine=None + ): + self.project_id = project_id + self.region = region + self.instance = instance + self.database = database + self.engine = engine + self._pool = asyncio.get_event_loop().run_until_complete(self._engine()) + + @classmethod + def from_instance( + cls, + region: str, + instance: str, + database: str, + project_id: str=None, + ) -> CloudSQLEngine: + + """Create CloudSQLEngine connection to the postgres database in the CloudSQL instance. + + Args: + region (str): CloudSQL instance region. + instance (str): CloudSQL instance name. + database (str): CloudSQL instance database name. + project_id (str): GCP project ID. Defaults to None + + Returns: + CloudSQLEngine containing the asyncpg connection pool. 
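+
+        Example (illustrative values only; the project, region, instance,
+        and database names below are placeholders):
+
+            engine = CloudSQLEngine.from_instance(
+                project_id="my-project",
+                region="us-central1",
+                instance="my-instance",
+                database="my-database",
+            )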
+ """ + return cls(project_id=project_id, region=region, instance=instance, database=database) + + @classmethod + def from_engine( + cls, + engine: AsyncEngine + ) -> CloudSQLEngine: + + return cls(engine=engine) + + async def _engine(self) -> AsyncEngine: + + if self.engine is not None: + return self.engine + + credentials, _ = google.auth.default(scopes=['email', 'https://www.googleapis.com/auth/cloud-platform']) + + if self.project_id is None: + self.project_id = _ + + async def get_conn(): + async with Connector(loop=asyncio.get_running_loop()) as connector: + conn = await connector.connect_async( + f"{self.project_id}:{self.region}:{self.instance}", + "asyncpg", + user=await _get_IAM_user(credentials), + enable_iam_auth=True, + db=self.database, + ) + + await register_vector(conn) + return conn + + pool = create_async_engine( + "postgresql+asyncpg://", + async_creator=get_conn, + ) + + return pool + + async def _aexecute_fetch( + self, + query + ) -> Any: + + async with self._pool.connect() as conn: + result = (await conn.execute(text(query))) + result_map = result.mappings() + result_fetch = result_map.fetchall() + + return result_fetch + + async def _aexecute_update( + self, + query, + additional=None + ) -> None: + + async with self._pool.connect() as conn: + result = (await conn.execute(text(query),additional)) + result = result.mappings() + await conn.commit() + +class CloudSQLVectorStore(VectorStore): + """Google Cloud SQL vector store. + + To use, you need the following packages installed: + pgvector-python + sqlalchemy + """ + + def __init__( + self, + engine: Type[CloudSQLEngine], + table_name: str, + vector_size: int, + embedding_service: Embeddings, + content_column: str='content', + embedding_column: str='embedding', + metadata_columns: Optional[str, List[str]]='metadata', + ignore_metadata_columns: bool=False, + index_query_options = None, + index: Type[HNSWIndex, IVFFlatIndex, BruteForce]=HNSWIndex, + distance_strategy = 'L2', + overwrite_existing: bool=False, + store_metadata: bool=True + ): + """Constructor for CloudSQLVectorStore. + + Args: + engine (CloudSQLEngine): AsyncEngine with pool connection to the postgres database. Required. + embedding_service (Embeddings): Text embedding model to use. + table_name (str): Name of the existing table or the table to be created. + content_column (str): Column that represent a Document’s page_content. Defaults to content + embedding_column (str): Column for embedding vectors. + The embedding is generated from the document value. Defaults to embedding + metadata_columns (List[str]): Column(s) that represent a document's metadata. Defaults to metadata + ignore_metadata_columns (List[str]): Column(s) to ignore in pre-existing tables for a document’s metadata. + Can not be used with metadata_columns. Defaults to None + overwrite_existing (bool): Boolean for truncating table before inserting data. Defaults to False + index_query_options : QueryOptions class with vector search parameters. Defaults to None + distance_strategy (str): + Determines the strategy employed for calculating + the distance between vectors in the embedding space. + Defaults to EUCLIDEAN_DISTANCE(L2). + Available options are: + - COSINE: Measures the similarity between two vectors of an inner + product space. + - EUCLIDEAN_DISTANCE: Computes the Euclidean distance between + two vectors. This metric considers the geometric distance in + the vector space, and might be more suitable for embeddings + that rely on spatial relationships. This is the default behavior. 
+ """ + + self.engine = engine + self.table_name = table_name + self.vector_size = vector_size + self.embedding_service = embedding_service + self.embedding_column = embedding_column + self.content_column = content_column + self.metadata_columns = metadata_columns + self.ignore_metadata_columns = ignore_metadata_columns + self.overwrite_existing = overwrite_existing + self.index_query_options = index_query_options + self.store_metadata = store_metadata + self.distance_strategy = distance_strategy + self.index = index + asyncio.get_running_loop().run_until_complete(self.__post_init__()) + + async def __post_init__(self) -> None: + """Initialize table and validate existing tables""" + + # Check if table exists + query = f"SELECT EXISTS (SELECT 1 FROM information_schema.tables WHERE table_name = '{self.table_name}');" + result = await self.engine._aexecute_fetch(query) + # If table exists + if result[0]['exists']: + # If overwrite_existing is True Truncate the Table + if self.overwrite_existing: + query = f"TRUNCATE TABLE {self.table_name} RESET IDENTITY" + await self.engine._aexecute_update(query) + + # Checking if metadata and ignore_metadata are given together + if self.metadata_columns is not None and self.ignore_metadata_columns is not None: + raise ValueError("Both metadata_columns and ignore_metadata_columns have been provided.") + + get_name = f"SELECT column_name, data_type FROM information_schema.columns WHERE table_name = '{self.table_name}'" + result = await self.engine._aexecute_fetch(get_name) + column_name = [col['column_name'] for col in result] + dtypes = [dtype['data_type'] for dtype in result] + + # Check column names and datatype for embedding column + if 'uuid' not in column_name: + raise ValueError("Column uuid does not exist") + if self.content_column not in column_name: + raise ValueError(f"Column {self.content_column} does not exist") + if self.embedding_column in column_name: + if "USER-DEFINED" not in dtypes: + raise ValueError(f"Column {self.embedding_column} is not of type vector") + else: + raise ValueError(f"Column {self.embedding_column} does not exist") + + if 'metadata' not in column_name: + raise ValueError("Column metadata does not exist") + + # Check if there are non-nullable columns + query = f"SELECT column_name FROM information_schema.columns WHERE table_name = '{self.table_name}' AND is_nullable = 'NO';" + result = await self.engine._aexecute_fetch(query) + non_nullable_list = [n['column_name'] for n in result] + exceptions = set(["uuid", f"{self.content_column}"]) + other_values = [value for value in non_nullable_list if value not in exceptions] + + if bool(other_values): + raise ValueError(f"Only uuid and {self.content_column} can be non-nullable") + + # If both metadata and ignore_metadata are given, throw an error + if self.metadata_columns is not None and self.ignore_metadata_columns is not None: + raise ValueError("Both metadata_columns and ignore_metadata_columns have been provided.") + + else: + await self.init_vectorstore_table( + engine=self.engine, + table_name=self.table_name, + vector_size=self.vector_size, + content_column=self.content_column, + embedding_column=self.embedding_column, + metadata_columns=self.metadata_columns, + overwrite_existing=self.overwrite_existing, + store_metadata=self.store_metadata + ) + + @property + def embeddings( + self + ) -> Embeddings: + return self.embedding_service + + async def create_vector_extension( + self + ) -> None: + """Creates the vector extsion to the specified database.""" + query = "CREATE 
EXTENSION IF NOT EXISTS vector" + await self.engine._aexecute_update(query) + + async def init_vectorstore_table( + self, + engine: Type[CloudSQLEngine], + table_name: str, + vector_size: int, + content_column: str='content', + embedding_column: str='embedding', + metadata_columns: Optional[str, List[str]]='metadata', + overwrite_existing: bool=False, + store_metadata: bool=True + ) -> None: + + """Creating a non-default vectorstore table""" + + # Create vector extension if not exists + await self.create_vector_extension() + + if overwrite_existing: + query = f"TRUNCATE TABLE {self.table_name} RESET IDENTITY" + await engine._aexecute_update(query) + + query = f''' + CREATE TABLE IF NOT EXISTS {table_name} ( + uuid UUID PRIMARY KEY, + {content_column} TEXT NOT NULL, + {embedding_column} vector({vector_size}), + {metadata_columns} JSON + ); + ''' + await engine._aexecute_update(query) + + @classmethod + async def afrom_embeddings( + cls: Type[CloudSQLVectorStore], + engine: Type[CloudSQLEngine], + embedding_service: Embeddings, + text_embeddings: List[Tuple[str, List[float]]], + table_name: str, + metadatas: List[Dict]=None, + ids: List[int]=None + ) -> cloudSQLVectorStore: + + texts = [t[0] for t in text_embeddings] + embeddings = [t[1] for t in text_embeddings] + metadatas = [{} for _ in texts] + + table = cls( + engine=engine, + table_name=table_name, + ) + + await table.aadd_embeddings( + texts=texts, engine=engine, embeddings=embeddings, metadatas=metadatas, ids=ids, table_name=table_name) + + return table + + @classmethod + async def afrom_documents( + cls: Type[CloudSQLVectorStore], + documents: List[Document], + engine: Type[CloudSQLEngine], + table_name: str, + embedding_service: Embeddings, + ids: List[int]=None + ) -> cloudSQLVectorStore: + + texts = [d.page_content for d in documents] + metadatas = [json.dumps(d.metadata) for d in documents] + + embeddings = embedding_service.embed_documents(list(texts)) + + table = cls( + engine=engine, + embedding_service=embedding_service, + table_name=table_name, + ) + + await table.aadd_embeddings( + texts=texts, engine=engine, embeddings=embeddings, metadatas=metadatas, ids=ids, table_name=table_name) + + return table + + @classmethod + async def afrom_texts( + cls: Type[CloudSQLVectorStore], + texts: List[str], + table_name: str, + embedding_service: Embeddings, + engine: Type[CloudSQLEngine], + metadatas: List[Dict]=None, + ids: List[int]=None + ) -> CloudSQLVectorStore: + + """ Return VectorStore initialized from texts and embeddings.""" + if not metadatas: + metadatas = [{} for _ in texts] + + documents = [] + for text, meta in zip(texts, metadatas): + docs = Document(page_content=text, metadata=meta) + documents.append(docs) + + return await cls.afrom_documents( + engine=engine, + documents=documents, + embedding_service=embedding_service, + table_name=table_name, + ids=ids) + + async def aadd_embeddings( + self, + engine: Type[CloudSQLEngine], + texts: List[str], + table_name: str, + embeddings: Embeddings, + metadatas: List[Dict]=None, + ids: List[int]=None + ) -> List[str]: + + if ids is None: + ids = [str(uuid.uuid1()) for _ in texts] + + for id, content, embedding, meta in zip(ids, texts, embeddings, metadatas): + data_to_add = {"ids":id, 'content':content, 'embedding':embedding, 'metadata':meta} + stmt = f"INSERT INTO {table_name}(uuid, content, embedding, metadata) VALUES (:ids,:content,:embedding,:metadata)" + await engine._aexecute_update(stmt, data_to_add) + + return ids + + async def aadd_documents( + self, + documents: 
List[Document], + ids: List[int]=None + ) -> List[str]: + + """Run more documents through the embeddings and add to the vectorstore. + + Args: + documents (List[Document]): Iterable of Documents to add to the vectorstore. + ids (List[str]): List of id strings. Defaults to None + + Returns: + List of ids from adding the texts into the vectorstore. + """ + + texts = [d.page_content for d in documents] + metadatas = [json.dumps(d.metadata) for d in documents] + embeddings = self.embedding_service.embed_documents(list(texts)) + + return await self.aadd_embeddings( + texts=texts, + embeddings=embeddings, + metadatas=metadatas, + ids=ids, + engine=self.Engine, + table_name=self.table_name) + + + async def aadd_texts( + self, + texts: List[str], + metadatas: List[Dict]=None, + ids: List[int]=None + ) -> List[str]: + + """Run more texts through the embeddings and add to the vectorstore. + + Args: + texts (str): Iterable of strings to add to the vectorstore. + metadatas (List[dict]): Optional list of metadatas associated with the texts. Defaults to None. + ids (List[str]): List of id strings. Defaults to None + + Returns: + List of ids from adding the texts into the vectorstore. + """ + + if not metadatas: + metadata = [{} for _ in texts] + + documents = [] + for text, meta in zip(texts, metadatas): + docs = Document(page_content = text, metadata=meta) + documents.append(docs) + + return await self.aadd_documents( + documents=documents, + ids=ids) + + async def __query_collection( + self, + embedding: List[float], + k: int=4, + filter: str=None + ) -> List[Any]: + + if filter is not None: + condition = f"WHERE {filter}" + + query = f""" + SELECT uuid, {self.content_column}, {self.embedding_column}, metadata, + l2_distance({self.embedding_column}, '{embedding}') as distance + FROM {self.table_name} {condition} ORDER BY {self.embedding_column} <-> '{embedding}' LIMIT {k} + """ + else: + query = f""" + SELECT uuid, {self.content_column}, {self.embedding_column}, metadata, + l2_distance({self.embedding_column}, '{embedding}') as distance + FROM {self.table_name} ORDER BY {self.embedding_column} <-> '{embedding}' LIMIT {k} + """ + results = await self.engine._aexecute_fetch(query) + + return results + + async def asimilarity_search( + self, + query: str, + k: int=4, + filter: str=None + ) -> List[Document]: + + embedding = self.embedding_service.embed_query(text=query) + + return await self.asimilarity_search_by_vector( + embedding=embedding, + k=k, + filter=filter + ) + + async def asimilarity_search_by_vector( + self, + embedding: List[float], + k: int=4, + filter: str=None + ) -> List[Document]: + + docs_and_scores = await self.asimilarity_search_with_score_by_vector( + embedding=embedding, k=k, filter=filter + ) + + return [doc for doc, _ in docs_and_scores] + + async def asimilarity_search_with_score( + self, + query: str, + k: int=4, + filter: str=None + ) -> List[Tuple[Document, float]]: + + embedding = self.embedding_service.embed_query(query) + docs = await self.asimilarity_search_with_score_by_vector( + embedding=embedding, k=k, filter=filter + ) + return docs + + async def asimilarity_search_with_score_by_vector( + self, + embedding: List[float], + k: int=4, + filter: str=None + ) -> List[Tuple[Document, float]]: + + results = await self.__query_collection(embedding=embedding, k=k, filter=filter) + documents_with_scores = [(Document(page_content=i[f"{self.content_column}"],metadata=i["metadata"],),i['distance'],)for i in results] + return documents_with_scores + + async def 
amax_marginal_relevance_search(
+        self,
+        query: str,
+        k: int=4,
+        fetch_k: int=20,
+        lambda_mult: float=0.5,
+        filter: str=None
+    ) -> List[Document]:
+
+        # embed_query is synchronous on the Embeddings interface
+        embedding = self.embedding_service.embed_query(text=query)
+
+        docs_and_scores = await self.amax_marginal_relevance_search_with_score_by_vector(
+            embedding=embedding,
+            k=k,
+            fetch_k=fetch_k,
+            lambda_mult=lambda_mult,
+            filter=filter
+        )
+        return [doc for doc, _ in docs_and_scores]
+
+    async def amax_marginal_relevance_search_with_score_by_vector(
+        self,
+        embedding: List[float],
+        k: int=4,
+        fetch_k: int=20,
+        lambda_mult: float=0.5,
+        filter: str=None
+    ) -> List[Tuple[Document, float]]:
+
+        results = await self.__query_collection(embedding=embedding, k=fetch_k, filter=filter)
+        embedding_list = [i[f"{self.embedding_column}"] for i in results]
+
+        mmr_selected = maximal_marginal_relevance(
+            np.array(embedding, dtype=np.float32),
+            embedding_list,
+            k=k,
+            lambda_mult=lambda_mult,
+        )
+
+        candidates = [
+            (
+                Document(page_content=i[f"{self.content_column}"], metadata=i["metadata"]),
+                i["distance"],
+            )
+            for i in results
+        ]
+
+        return [r for i, r in enumerate(candidates) if i in mmr_selected]
+
+    async def _acreate_index(
+        self,
+        index: Union["HNSWIndex", "IVFFlatIndex", "BruteForce"]
+    ):
+        # A brute-force (sequential scan) strategy needs no index.
+        if isinstance(index, BruteForce):
+            return None
+
+        distance = 'l2' if self.distance_strategy == 'L2' else 'ip' if self.distance_strategy == 'INNER' else 'cosine'
+        index_type = 'hnsw' if isinstance(index, HNSWIndex) else 'ivfflat'
+        if index.partial_indexes is None:
+            condition = ""
+        else:
+            condition = f"WHERE ({index.partial_indexes})"
+
+        if index_type == 'hnsw':
+            query = f"CREATE INDEX ON {self.table_name} USING hnsw ({self.embedding_column} vector_{distance}_ops) WITH (m={index.m}, ef_construction={index.ef_construction}) {condition}"
+        else:
+            query = f"CREATE INDEX ON {self.table_name} USING ivfflat ({self.embedding_column} vector_{distance}_ops) WITH (lists={index.lists}) {condition}"
+
+        await self.engine._aexecute_update(query)
+
+    async def _aindex_query_options(
+        self,
+        index_query_options: Union["HNSWIndex.QueryOptions", "IVFFlatIndex.QueryOptions"]
+    ):
+
+        if isinstance(index_query_options, HNSWIndex.QueryOptions):
+            query = f"SET hnsw.ef_search = {index_query_options.ef_search}"
+        else:
+            query = f"SET ivfflat.probes = {index_query_options.probes}"
+
+        await self.engine._aexecute_update(query)
+
+    async def areindex(
+        self,
+        index: Union["HNSWIndex", "IVFFlatIndex", "BruteForce"],
+        index_name: Optional[str] = None
+    ):
+
+        if index_name:
+            query = f"REINDEX INDEX {index_name}"
+            await self.engine._aexecute_update(query)
+        else:
+            await self._acreate_index(index)
+
+    async def adrop_index(
+        self
+    ):
+        query = f"SELECT indexname, indexdef FROM pg_indexes WHERE tablename='{self.table_name}'"
+        current_index = await self.engine._aexecute_fetch(query)
+        index_def = current_index[0]['indexdef']
+        if 'hnsw' in index_def or 'ivfflat' in index_def:
+            index_name = current_index[0]['indexname']
+            query = f"DROP INDEX {index_name}"
+            await self.engine._aexecute_update(query)
+        else:
+            raise ValueError("Cannot drop Index")
+
+    async def aset_index_query_options(
+        self,
+        distance_strategy,
+        index_query_options
+    ):
+        self.distance_strategy = distance_strategy
+        self.index_query_options = index_query_options
+        await self._aindex_query_options(index_query_options)
+
+
+class BruteForce:
+
+    def __init__(
+        self,
+        distance_strategy: str='L2'
+    ):
+
+        self.distance_strategy = distance_strategy
+
+class HNSWIndex:
+
+    def __init__(
+        self,
+        m: int=16,
+        ef_construction: int=64,
+        partial_indexes: List=None,
+        distance_strategy: str='L2'
+    ):
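+        """Configuration for a pgvector HNSW index.
+
+        m sets the maximum number of connections per graph node and
+        ef_construction the size of the candidate list used while the index
+        is built; both feed directly into the
+        CREATE INDEX ... WITH (m=..., ef_construction=...) statement above.
+        """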
+        self.m = m
+        self.ef_construction = ef_construction
+        self.partial_indexes = partial_indexes
+        self.distance_strategy = distance_strategy
+
+    class QueryOptions:
+
+        def __init__(
+            self,
+            ef_search
+        ):
+
+            self.ef_search = ef_search
+
+class IVFFlatIndex:
+
+    def __init__(
+        self,
+        lists: int=1,
+        partial_indexes: List=None,
+        distance_strategy: str='L2'
+    ):
+
+        self.lists = lists
+        self.partial_indexes = partial_indexes
+        self.distance_strategy = distance_strategy
+
+    class QueryOptions:
+
+        def __init__(
+            self,
+            probes
+        ):
+
+            self.probes = probes
\ No newline at end of file
diff --git a/tests/test_cloudSQL.py b/tests/test_cloudSQL.py
new file mode 100644
index 0000000..8800da6
--- /dev/null
+++ b/tests/test_cloudSQL.py
@@ -0,0 +1,183 @@
+"""Test cloudSQLVectorStore functionality."""
+import os
+from typing import List
+
+from langchain_core.documents import Document
+
+from langchain_community.vectorstores.cloudSQL import cloudSQLVectorStore
+from langchain_community.vectorstores.cloudSQL import cloudSQLEngine
+from langchain_community.embeddings import FakeEmbeddings
+
+# from tests.integration_tests.vectorstores.fake_embeddings import FakeEmbeddings
+
+engine = cloudSQLEngine.from_instance(
+    project_id=os.environ.get("PROJECT_ID", None),
+    instance=os.environ.get("INSTANCE_NAME"),
+    region=os.environ.get("REGION_NAME"),
+    database=os.environ.get("DATABASE_NAME"),
+)
+
+ADA_TOKEN_COUNT = 1536
+
+
+class FakeEmbeddingsWithAdaDimension(FakeEmbeddings):
+    """Fake embeddings functionality for testing."""
+
+    def embed_documents(self, texts: List[str]) -> List[List[float]]:
+        """Return simple embeddings."""
+        return [
+            [float(1.0)] * (ADA_TOKEN_COUNT - 1) + [float(i)] for i in range(len(texts))
+        ]
+
+    def embed_query(self, text: str) -> List[float]:
+        """Return simple embeddings."""
+        return [float(1.0)] * (ADA_TOKEN_COUNT - 1) + [float(0.0)]
+
+
+async def test_cloudSQLVectorStore() -> None:
+    """Test end to end construction and search."""
+    texts = ["foo", "bar", "baz"]
+    docsearch = await cloudSQLVectorStore.afrom_texts(
+        texts=texts,
+        table_name="test_table",
+        embedding=FakeEmbeddingsWithAdaDimension(),
+        engine=engine,
+    )
+    output = await docsearch.asimilarity_search("foo", k=1)
+    assert output == [Document(page_content="foo")]
+
+
+async def test_cloudSQLVectorStore_embeddings() -> None:
+    """Test end to end construction with embeddings and search."""
+    texts = ["foo", "bar", "baz"]
+    text_embeddings = FakeEmbeddingsWithAdaDimension().embed_documents(texts)
+    text_embedding_pairs = list(zip(texts, text_embeddings))
+    docsearch = await cloudSQLVectorStore.afrom_embeddings(
+        text_embeddings=text_embedding_pairs,
+        table_name="test_table",
+        embedding=FakeEmbeddingsWithAdaDimension(),
+        engine=engine,
+    )
+    output = await docsearch.asimilarity_search("foo", k=1)
+    assert output == [Document(page_content="foo")]
+
+
+async def test_cloudSQLVectorStore_with_metadatas() -> None:
+    """Test end to end construction and search."""
+    texts = ["foo", "bar", "baz"]
+    metadatas = [{"page": str(i)} for i in range(len(texts))]
+    docsearch = await cloudSQLVectorStore.afrom_texts(
+        texts=texts,
+        table_name="test_table",
+        embedding=FakeEmbeddingsWithAdaDimension(),
+        metadatas=metadatas,
+        engine=engine,
+    )
+    output = await docsearch.asimilarity_search("foo", k=1)
+    assert output == [Document(page_content="foo", metadata={"page": "0"})]
+
+
+async def test_cloudSQLVectorStore_with_metadatas_with_scores() -> None:
+    """Test end to end construction and search."""
+    texts = ["foo", "bar", "baz"]
+    metadatas = [{"page": 
str(i)} for i in range(len(texts))] + docsearch = await cloudSQLVectorStore.afrom_texts( + texts=texts, + table_name="test_table", + embedding=FakeEmbeddingsWithAdaDimension(), + metadatas=metadatas, + engine=engine, + ) + output = await docsearch.asimilarity_search_with_score("foo", k=1) + assert output == [(Document(page_content="foo", metadata={"page": "0"}), 0.0)] + + +async def test_cloudSQLVectorStore_with_filter_match() -> None: + """Test end to end construction and search.""" + texts = ["foo", "bar", "baz"] + metadatas = [{"page": str(i)} for i in range(len(texts))] + docsearch = await cloudSQLVectorStore.afrom_texts( + texts=texts, + collection_name="test_collection_filter", + embedding=FakeEmbeddingsWithAdaDimension(), + metadatas=metadatas, + engine=engine, + ) + output = await docsearch.asimilarity_search_with_score("foo", k=1, filter={"page": "0"}) + assert output == [(Document(page_content="foo", metadata={"page": "0"}), 0.0)] + + +async def test_cloudSQLVectorStore_with_filter_distant_match() -> None: + """Test end to end construction and search.""" + texts = ["foo", "bar", "baz"] + metadatas = [{"page": str(i)} for i in range(len(texts))] + docsearch = await cloudSQLVectorStore.afrom_texts( + texts=texts, + table_name="test_table", + embedding=FakeEmbeddingsWithAdaDimension(), + metadatas=metadatas, + engine=engine, + ) + output = await docsearch.asimilarity_search_with_score("foo", k=1, filter={"page": "2"}) + assert output == [ + (Document(page_content="baz", metadata={"page": "2"}), 0.0013003906671379406) + ] + + +async def test_cloudSQLVectorStore_with_filter_no_match() -> None: + """Test end to end construction and search.""" + texts = ["foo", "bar", "baz"] + metadatas = [{"page": str(i)} for i in range(len(texts))] + docsearch = await cloudSQLVectorStore.afrom_texts( + texts=texts, + table_name="test_table", + embedding=FakeEmbeddingsWithAdaDimension(), + metadatas=metadatas, + engine=engine, + ) + output = await docsearch.asimilarity_search_with_score("foo", k=1, filter={"page": "5"}) + assert output == [] + +async def test_cloudSQLVectorStore_relevance_score() -> None: + """Test to make sure the relevance score is scaled to 0-1.""" + texts = ["foo", "bar", "baz"] + metadatas = [{"page": str(i)} for i in range(len(texts))] + docsearch = await cloudSQLVectorStore.from_texts( + texts=texts, + table_name="test_table", + embedding=FakeEmbeddingsWithAdaDimension(), + metadatas=metadatas, + engine=engine, + ) + + output = await docsearch.asimilarity_search_with_relevance_scores("foo", k=3) + assert output == [ + (Document(page_content="foo", metadata={"page": "0"}), 1.0), + (Document(page_content="bar", metadata={"page": "1"}), 0.9996744261675065), + (Document(page_content="baz", metadata={"page": "2"}), 0.9986996093328621), + ] + + +async def test_cloudSQLVectorStore_max_marginal_relevance_search() -> None: + """Test max marginal relevance search.""" + texts = ["foo", "bar", "baz"] + docsearch = await cloudSQLVectorStore.afrom_texts( + texts=texts, + table_name="test_table", + embedding=FakeEmbeddingsWithAdaDimension(), + engine=engine, + ) + output = await docsearch.max_marginal_relevance_search("foo", k=1, fetch_k=3) + assert output == [Document(page_content="foo")] + + +async def test_cloudSQLVectorStore_max_marginal_relevance_search_with_score() -> None: + """Test max marginal relevance search with relevance scores.""" + texts = ["foo", "bar", "baz"] + docsearch = await cloudSQLVectorStore.afrom_texts( + texts=texts, + collection_name="test_table", + 
embedding=FakeEmbeddingsWithAdaDimension(), + engine=engine, + ) + output = await docsearch.amax_marginal_relevance_search_with_score("foo", k=1, fetch_k=3) + assert output == [(Document(page_content="foo"), 0.0)] \ No newline at end of file From 93b6221da84641808d6ad14ae710613583649f86 Mon Sep 17 00:00:00 2001 From: Averi Kitsch Date: Mon, 5 Feb 2024 14:08:10 -0800 Subject: [PATCH 2/9] udpate --- pyproject.toml | 4 +- src/langchain_google_cloud_sql_pg/__init__.py | 5 + src/langchain_google_cloud_sql_pg/cloudSQL.py | 717 ------------------ .../postgresql_engine.py | 140 ++++ .../vectorstore.py | 602 +++++++++++++++ tests/test_cloudSQL.py | 114 ++- 6 files changed, 822 insertions(+), 760 deletions(-) delete mode 100644 src/langchain_google_cloud_sql_pg/cloudSQL.py create mode 100644 src/langchain_google_cloud_sql_pg/postgresql_engine.py create mode 100644 src/langchain_google_cloud_sql_pg/vectorstore.py diff --git a/pyproject.toml b/pyproject.toml index f335088..79002b5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -8,7 +8,9 @@ requires-python = ">=3.8" dependencies = [ "langchain==0.1.1", "SQLAlchemy==2.0.7", - "cloud-sql-python-connector[asyncpg]==1.5.0" + "cloud-sql-python-connector[asyncpg]==1.5.0", + "nest_asyncio", + "pgvector" ] [project.urls] diff --git a/src/langchain_google_cloud_sql_pg/__init__.py b/src/langchain_google_cloud_sql_pg/__init__.py index 6d5e14b..259f273 100644 --- a/src/langchain_google_cloud_sql_pg/__init__.py +++ b/src/langchain_google_cloud_sql_pg/__init__.py @@ -11,3 +11,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + +from langchain_google_cloud_sql_pg.postgresql_engine import PostgreSQLEngine +from langchain_google_cloud_sql_pg.vectorstore import CloudSQLVectorStore + +__all__ = ["PostgreSQLEngine", "CloudSQLVectorStore"] diff --git a/src/langchain_google_cloud_sql_pg/cloudSQL.py b/src/langchain_google_cloud_sql_pg/cloudSQL.py deleted file mode 100644 index afd364d..0000000 --- a/src/langchain_google_cloud_sql_pg/cloudSQL.py +++ /dev/null @@ -1,717 +0,0 @@ -import json -import uuid - -import asyncio -import asyncpg -import nest_asyncio - -from typing import Any, Dict, List, Optional, Tuple, Type -from pgvector.asyncpg import register_vector - -# import sqlalchemy -from sqlalchemy import text -from sqlalchemy.ext.asyncio import AsyncEngine, create_async_engine - -from google.cloud.sql.connector import Connector -import google.auth -import numpy as np -from google.auth.transport.requests import Request - -from langchain_core.documents import Document -from langchain_core.embeddings import Embeddings -from langchain_core.vectorstores import VectorStore -from langchain_community.vectorstores.utils import maximal_marginal_relevance - -import aiohttp - -nest_asyncio.apply() - -async def _get_IAM_user(credentials): - """Get user/service account name""" - request = google.auth.transport.requests.Request() - credentials.refresh(request) - - url = f"https://oauth2.googleapis.com/tokeninfo?access_token={credentials.token}" - async with aiohttp.ClientSession() as client: - response = await client.get(url) - response = await response.text() - response = json.loads(response) - email = response['email'] - if ".gserviceaccount.com" in email: - email = email.replace(".gserviceaccount.com","") - - return email - -class CloudSQLEngine: - """Creating a connection to the CloudSQL instance - To use, you need the following packages installed: 
-    cloud-sql-python-connector[asyncpg]
-    """
[... remaining deleted lines elided: identical to the src/langchain_google_cloud_sql_pg/cloudSQL.py file added in PATCH 1/9 ...]
diff --git a/src/langchain_google_cloud_sql_pg/postgresql_engine.py b/src/langchain_google_cloud_sql_pg/postgresql_engine.py
new file mode 100644
index 0000000..0062e94
--- /dev/null
+++ b/src/langchain_google_cloud_sql_pg/postgresql_engine.py
@@ -0,0 +1,140 @@
+from __future__ import annotations
+
+import asyncio
+import json
+import time
+from threading import Thread
+from typing import AnyStr
+
+import aiohttp
+import google.auth
+from google.cloud.sql.connector import Connector
+from langchain_community.vectorstores.utils import maximal_marginal_relevance
+from pgvector.asyncpg import register_vector
+
+# import sqlalchemy
+from sqlalchemy import text +from sqlalchemy.ext.asyncio import AsyncEngine, create_async_engine + + +async def _get_IAM_user( + credentials: google.auth.credentials.Credentials, +) -> str: + """Get user/service account name""" + request = google.auth.transport.requests.Request() + credentials.refresh(request) + + url = f"https://oauth2.googleapis.com/tokeninfo?access_token={credentials.token}" + async with aiohttp.ClientSession() as client: + response = await client.get(url) + response = json.loads(await response.text()) + email = response["email"] + if ".gserviceaccount.com" in email: + email = email.replace(".gserviceaccount.com", "") + + return email + + +class PostgreSQLEngine: + """Creating a connection to the CloudSQL instance + To use, you need the following packages installed: + cloud-sql-python-connector[asyncpg] + """ + + def __init__( + self, + project_id=None, + region=None, + instance=None, + database=None, + engine=None, + ): + self.project_id = project_id + self.region = region + self.instance = instance + self.database = database + self.engine = engine + self._loop = asyncio.new_event_loop() + self._thread = Thread(target=self._loop.run_forever, daemon=True) + self._thread.start() + pool_object = asyncio.wrap_future( + asyncio.run_coroutine_threadsafe(self.async_func(), self._loop), + loop=self._loop, + ) + time.sleep(1) + self._pool = pool_object.result() + + @classmethod + def from_instance( + cls, + region: str, + instance: str, + database: str, + project_id: str = None, + ) -> PostgreSQLEngine: + """Create PostgreSQLEngine connection to the postgres database in the CloudSQL instance. + + Args: + region (str): CloudSQL instance region. + instance (str): CloudSQL instance name. + database (str): CloudSQL instance database name. + project_id (str): GCP project ID. Defaults to None + + Returns: + PostgreSQLEngine containing the asyncpg connection pool. 
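+
+        Example (illustrative values only; the project, region, instance,
+        and database names below are placeholders):
+
+            engine = PostgreSQLEngine.from_instance(
+                project_id="my-project",
+                region="us-central1",
+                instance="my-instance",
+                database="my-database",
+            )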
+ """ + return cls( + project_id=project_id, + region=region, + instance=instance, + database=database, + ) + + @classmethod + def from_engine(cls, engine: AsyncEngine) -> PostgreSQLEngine: + return cls(engine=engine) + + async def _engine(self) -> AsyncEngine: + if self.engine is not None: + return self.engine + + credentials, _ = google.auth.default( + scopes=["email", "https://www.googleapis.com/auth/cloud-platform"] + ) + + if self.project_id is None: + self.project_id = _ + + async def get_conn(): + async with Connector(loop=asyncio.get_running_loop()) as connector: + conn = await connector.connect_async( + f"{self.project_id}:{self.region}:{self.instance}", + "asyncpg", + user=await _get_IAM_user(credentials), + enable_iam_auth=True, + db=self.database, + ) + + await register_vector(conn) + return conn + + pool = create_async_engine( + "postgresql+asyncpg://", + async_creator=get_conn, + ) + + return pool + + async def _aexecute_fetch(self, query) -> Any: + async with self._pool.connect() as conn: + result = await conn.execute(text(query)) + result_map = result.mappings() + result_fetch = result_map.fetchall() + + return result_fetch + + async def _aexecute_update(self, query, additional=None) -> None: + async with self._pool.connect() as conn: + result = await conn.execute(text(query), additional) + result = result.mappings() + await conn.commit() diff --git a/src/langchain_google_cloud_sql_pg/vectorstore.py b/src/langchain_google_cloud_sql_pg/vectorstore.py new file mode 100644 index 0000000..f514452 --- /dev/null +++ b/src/langchain_google_cloud_sql_pg/vectorstore.py @@ -0,0 +1,602 @@ +from __future__ import annotations + +import asyncio +import json +import uuid +from typing import Any, Iterable, List, Optional, Tuple, Type, Union, dict + +import numpy as np +from langchain_community.vectorstores.utils import maximal_marginal_relevance +from langchain_core.documents import Document +from langchain_core.embeddings import Embeddings +from langchain_core.vectorstores import VectorStore + +from .postgresql_engine import PostgreSQLEngine + +VST = TypeVar("VST", bound="CloudSQLVectorStore") + + +class CloudSQLVectorStore(VectorStore): + """Google Cloud SQL for PostgreSQL vector store. + + To use, you need the following packages installed: + pgvector-python + sqlalchemy + """ + + def __init__( + self, + engine: PostgreSQLEngine, + table_name: str, + # vector_size: int, + embedding_service: Embeddings, + content_column: str = "content", + embedding_column: str = "embedding", + metadata_columns: [str, List[str]] = "metadata", + ignore_metadata_columns: bool = False, + index_query_options: [ + HNSWIndex.QueryOptions, + IVFFlatIndex.QueryOptions, + ] = None, + # index: [HNSWIndex | IVFFlatIndex | BruteForce] = None, + distance_strategy="L2", + overwrite_existing: bool = False, + # store_metadata: bool = True, + ): + """Constructor for CloudSQLVectorStore. + + Args: + engine (PostgreSQLEngine): AsyncEngine with pool connection to the postgres database. Required. + embedding_service (Embeddings): Text embedding model to use. + table_name (str): Name of the existing table or the table to be created. + content_column (str): Column that represent a Document’s page_content. Defaults to content + embedding_column (str): Column for embedding vectors. + The embedding is generated from the document value. Defaults to embedding + metadata_columns (List[str]): Column(s) that represent a document's metadata. 
Defaults to metadata + ignore_metadata_columns (List[str]): Column(s) to ignore in pre-existing tables for a document’s metadata. + Can not be used with metadata_columns. Defaults to None + overwrite_existing (bool): Boolean for truncating table before inserting data. Defaults to False + index_query_options : QueryOptions class with vector search parameters. Defaults to None + distance_strategy (str): + Determines the strategy employed for calculating + the distance between vectors in the embedding space. + Defaults to EUCLIDEAN_DISTANCE(L2). + Available options are: + - COSINE: Measures the similarity between two vectors of an inner + product space. + - EUCLIDEAN_DISTANCE: Computes the Euclidean distance between + two vectors. This metric considers the geometric distance in + the vector space, and might be more suitable for embeddings + that rely on spatial relationships. This is the default behavior. + """ + + self.engine = engine + self.table_name = table_name + # self.vector_size = vector_size + self.embedding_service = embedding_service + self.embedding_column = embedding_column + self.content_column = content_column + self.metadata_columns = metadata_columns + self.ignore_metadata_columns = ignore_metadata_columns + self.overwrite_existing = overwrite_existing + self.index_query_options = index_query_options + self.store_metadata = store_metadata + self.distance_strategy = distance_strategy + # self.index = index + asyncio.get_running_loop().run_until_complete(self.__post_init__()) + + async def __post_init__(self) -> None: + """Initialize table and validate existing tables""" + + # Check if table exists + query = f"SELECT EXISTS (SELECT 1 FROM information_schema.tables WHERE table_name = '{self.table_name}');" + result = await self.engine._aexecute_fetch(query) + # If table exists + if result[0]["exists"]: + # If overwrite_existing is True Truncate the Table + if self.overwrite_existing: + query = f"TRUNCATE TABLE {self.table_name} RESET IDENTITY" + await self.engine._aexecute_update(query) + + # Checking if metadata and ignore_metadata are given together + if ( + self.metadata_columns is not None + and self.ignore_metadata_columns is not None + ): + raise ValueError( + "Both metadata_columns and ignore_metadata_columns have been provided." 
+                )
+
+            get_name = f"SELECT column_name, data_type FROM information_schema.columns WHERE table_name = '{self.table_name}'"
+            result = await self.engine._aexecute_fetch(get_name)
+            column_name = [col["column_name"] for col in result]
+            dtypes = [dtype["data_type"] for dtype in result]
+
+            # Check column names and datatype for embedding column
+            if "uuid" not in column_name:
+                raise ValueError("Column uuid does not exist")
+            if self.content_column not in column_name:
+                raise ValueError(f"Column {self.content_column} does not exist")
+            if self.embedding_column in column_name:
+                if "USER-DEFINED" not in dtypes:
+                    raise ValueError(
+                        f"Column {self.embedding_column} is not of type vector"
+                    )
+            else:
+                raise ValueError(
+                    f"Column {self.embedding_column} does not exist"
+                )
+
+            if "metadata" not in column_name:
+                raise ValueError("Column metadata does not exist")
+
+            # Check if there are non-nullable columns
+            query = f"SELECT column_name FROM information_schema.columns WHERE table_name = '{self.table_name}' AND is_nullable = 'NO';"
+            result = await self.engine._aexecute_fetch(query)
+            non_nullable_list = [n["column_name"] for n in result]
+            exceptions = set(["uuid", f"{self.content_column}"])
+            other_values = [
+                value for value in non_nullable_list if value not in exceptions
+            ]
+
+            if other_values:
+                raise ValueError(
+                    f"Only uuid and {self.content_column} can be non-nullable"
+                )
+
+        else:
+            await self.init_vectorstore_table(
+                engine=self.engine,
+                table_name=self.table_name,
+                vector_size=self.vector_size,
+                content_column=self.content_column,
+                embedding_column=self.embedding_column,
+                metadata_columns=self.metadata_columns,
+                overwrite_existing=self.overwrite_existing,
+                store_metadata=self.store_metadata,
+            )
+
+    @property
+    def embeddings(self) -> Embeddings:
+        return self.embedding_service
+
+    async def create_vector_extension(self) -> None:
+        """Creates the vector extension in the specified database."""
+        query = "CREATE EXTENSION IF NOT EXISTS vector"
+        await self.engine._aexecute_update(query)
+
+    async def init_vectorstore_table(
+        self,
+        engine: PostgreSQLEngine,
+        table_name: str,
+        vector_size: int,
+        content_column: str = "content",
+        embedding_column: str = "embedding",
+        metadata_columns: Union[str, List[str]] = "metadata",
+        overwrite_existing: bool = False,
+        store_metadata: bool = True,
+    ) -> None:
+        """Creating a non-default vectorstore table"""
+
+        # Create vector extension if not exists
+        await self.create_vector_extension()
+
+        if overwrite_existing:
+            query = f"TRUNCATE TABLE {table_name} RESTART IDENTITY"
+            await engine._aexecute_update(query)
+
+        query = f"""
+            CREATE TABLE IF NOT EXISTS {table_name} (
+                uuid UUID PRIMARY KEY,
+                {content_column} TEXT NOT NULL,
+                {embedding_column} vector({vector_size}),
+                {metadata_columns} JSON
+            );
+        """
+        await engine._aexecute_update(query)
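+
+    # For the defaults above, the DDL this method issues looks roughly like
+    # the following (the table name "my_docs" is illustrative):
+    #
+    #     CREATE TABLE IF NOT EXISTS my_docs (
+    #         uuid UUID PRIMARY KEY,
+    #         content TEXT NOT NULL,
+    #         embedding vector(768),
+    #         metadata JSON
+    #     );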
+
+    # @classmethod
+    # async def afrom_embeddings(
+    #     cls: CloudSQLVectorStore,
+    #     engine: PostgreSQLEngine,
+    #     embedding_service: Embeddings,
+    #     text_embeddings: List[Tuple[str, List[float]]],
+    #     table_name: str,
+    #     metadatas: List[dict] = None,
+    #     ids: List[int] = None,
+    # ) -> CloudSQLVectorStore:
+    #     texts = [t[0] for t in text_embeddings]
+    #     embeddings = [t[1] for t in text_embeddings]
+    #     metadatas = [{} for _ in texts]
+
+    #     table = cls(
+    #         engine=engine,
+    #         table_name=table_name,
+    #         embedding_service=embedding_service,
+    #     )
+
+    #     await table.aadd_embeddings(
+    #         texts=texts,
+    #         engine=engine,
+    #         embeddings=embeddings,
+    #         metadatas=metadatas,
+    #         ids=ids,
+    #         table_name=table_name,
+    #     )
+
+    #     return table
+
+    # @classmethod
+    # async def afrom_documents(
+    #     cls: CloudSQLVectorStore,
+    #     documents: List[Document],
+    #     engine: PostgreSQLEngine,
+    #     table_name: str,
+    #     embedding_service: Embeddings,
+    #     ids: List[int] = None,
+    # ) -> CloudSQLVectorStore:
+    #     texts = [d.page_content for d in documents]
+    #     metadatas = [json.dumps(d.metadata) for d in documents]
+
+    #     embeddings = embedding_service.embed_documents(list(texts))
+
+    #     table = cls(
+    #         engine=engine,
+    #         embedding_service=embedding_service,
+    #         table_name=table_name,
+    #     )
+
+    #     await table.aadd_embeddings(
+    #         texts=texts,
+    #         engine=engine,
+    #         embeddings=embeddings,
+    #         metadatas=metadatas,
+    #         ids=ids,
+    #         table_name=table_name,
+    #     )
+
+    #     return table
+
+    @classmethod
+    async def afrom_texts(
+        cls: Type[VST],
+        texts: List[str],
+        embeddings: List[List[float]],
+        engine: PostgreSQLEngine,
+        embedding_service: Embeddings,
+        table_name: str,
+        metadatas: Optional[List[dict]] = None,
+        ids: Optional[List[str]] = None,
+        **kwargs: Any,
+    ) -> VST:
+        """Return VectorStore initialized from texts and embeddings."""
+        if not metadatas:
+            metadatas = [{} for _ in texts]
+
+        vs = cls(
+            engine=engine,
+            embedding_service=embedding_service,
+            table_name=table_name,
+        )
+        await vs.aadd_embeddings(texts, embeddings, metadatas, ids)
+        return vs
+
+    async def aadd_embeddings(
+        self,
+        texts: Iterable[str],
+        embeddings: List[List[float]],
+        metadatas: Optional[List[dict]] = None,
+        ids: Optional[List[str]] = None,
+        **kwargs: Any,
+    ) -> List[str]:
+        if ids is None:
+            ids = [str(uuid.uuid4()) for _ in texts]
+        if not metadatas:
+            metadatas = [{} for _ in texts]
+
+        for id, content, embedding, meta in zip(
+            ids, texts, embeddings, metadatas
+        ):
+            data_to_add = {
+                "ids": id,
+                "content": content,
+                "embedding": embedding,
+                "metadata": json.dumps(meta),
+            }
+            stmt = f"INSERT INTO {self.table_name}(uuid, content, embedding, metadata) VALUES (:ids,:content,:embedding,:metadata)"
+            await self.engine._aexecute_update(stmt, data_to_add)
+
+        return ids
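+
+    # Illustrative call sequence (all names are placeholders; embeddings are
+    # computed up front because this method stores precomputed vectors):
+    #
+    #     vectors = embedding_service.embed_documents(texts)
+    #     store = await CloudSQLVectorStore.afrom_texts(
+    #         texts=texts,
+    #         embeddings=vectors,
+    #         engine=engine,
+    #         embedding_service=embedding_service,
+    #         table_name="my_docs",
+    #     )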
+
+    # async def aadd_documents(
+    #     self, documents: List[Document], ids: List[int] = None, **kwargs: Any
+    # ) -> List[str]:
+    #     """Run more documents through the embeddings and add to the vectorstore.
+
+    #     Args:
+    #         documents (List[Document]): Iterable of Documents to add to the vectorstore.
+    #         ids (List[str]): List of id strings. Defaults to None
+
+    #     Returns:
+    #         List of ids from adding the texts into the vectorstore.
+    #     """
+
+    #     texts = [d.page_content for d in documents]
+    #     metadatas = [json.dumps(d.metadata) for d in documents]
+    #     embeddings = self.embedding_service.embed_documents(list(texts))
+
+    #     return await self.aadd_embeddings(
+    #         texts=texts,
+    #         embeddings=embeddings,
+    #         metadatas=metadatas,
+    #         ids=ids,
+    #         engine=self.Engine,
+    #         table_name=self.table_name,
+    #     )
+
+    # async def aadd_texts(
+    #     self,
+    #     texts: List[str],
+    #     metadatas: List[dict] = None,
+    #     ids: List[int] = None,
+    # ) -> List[str]:
+    #     """Run more texts through the embeddings and add to the vectorstore.
+
+    #     Args:
+    #         texts (str): Iterable of strings to add to the vectorstore.
+    #         metadatas (List[dict]): Optional list of metadatas associated with the texts. Defaults to None.
+    #         ids (List[str]): List of id strings. Defaults to None
+
+    #     Returns:
+    #         List of ids from adding the texts into the vectorstore.
+    #     """
+
+    #     if not metadatas:
+    #         metadatas = [{} for _ in texts]
+
+    #     documents = []
+    #     for text, meta in zip(texts, metadatas):
+    #         docs = Document(page_content=text, metadata=meta)
+    #         documents.append(docs)
+
+    #     return await self.aadd_documents(documents=documents, ids=ids)
+
+    async def __query_collection(
+        self, embedding: List[float], k: int = 4, filter: str = None
+    ) -> List[Any]:
+        if filter is not None:
+            condition = f"WHERE {filter}"
+
+            query = f"""
+            SELECT uuid, {self.content_column}, {self.embedding_column}, metadata,
+            l2_distance({self.embedding_column}, '{embedding}') as distance
+            FROM {self.table_name} {condition} ORDER BY {self.embedding_column} <-> '{embedding}' LIMIT {k}
+            """
+        else:
+            query = f"""
+            SELECT uuid, {self.content_column}, {self.embedding_column}, metadata,
+            l2_distance({self.embedding_column}, '{embedding}') as distance
+            FROM {self.table_name} ORDER BY {self.embedding_column} <-> '{embedding}' LIMIT {k}
+            """
+        results = await self.engine._aexecute_fetch(query)
+
+        return results
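+
+    # The statement built above has roughly this shape (values illustrative);
+    # `<->` is pgvector's L2-distance operator, so rows come back nearest
+    # first:
+    #
+    #     SELECT uuid, content, embedding, metadata,
+    #            l2_distance(embedding, '[0.1, ...]') as distance
+    #     FROM my_docs
+    #     ORDER BY embedding <-> '[0.1, ...]' LIMIT 4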
+
+    async def asimilarity_search(
+        self, query: str, k: int = 4, filter: str = None
+    ) -> List[Document]:
+        embedding = self.embedding_service.embed_query(text=query)
+
+        return await self.asimilarity_search_by_vector(
+            embedding=embedding, k=k, filter=filter
+        )
+
+    async def asimilarity_search_by_vector(
+        self, embedding: List[float], k: int = 4, filter: str = None
+    ) -> List[Document]:
+        docs_and_scores = await self.asimilarity_search_with_score_by_vector(
+            embedding=embedding, k=k, filter=filter
+        )
+
+        return [doc for doc, _ in docs_and_scores]
+
+    async def asimilarity_search_with_score(
+        self, query: str, k: int = 4, filter: str = None
+    ) -> List[Tuple[Document, float]]:
+        embedding = self.embedding_service.embed_query(query)
+        docs = await self.asimilarity_search_with_score_by_vector(
+            embedding=embedding, k=k, filter=filter
+        )
+        return docs
+
+    async def asimilarity_search_with_score_by_vector(
+        self, embedding: List[float], k: int = 4, filter: str = None
+    ) -> List[Tuple[Document, float]]:
+        results = await self.__query_collection(
+            embedding=embedding, k=k, filter=filter
+        )
+        documents_with_scores = [
+            (
+                Document(
+                    page_content=i[f"{self.content_column}"],
+                    metadata=i["metadata"],
+                ),
+                i["distance"],
+            )
+            for i in results
+        ]
+        return documents_with_scores
+
+    async def amax_marginal_relevance_search(
+        self,
+        query: str,
+        k: int = 4,
+        fetch_k: int = 20,
+        lambda_mult: float = 0.5,
+        filter: str = None,
+    ) -> List[Document]:
+        # embed_query is synchronous; the MMR coroutine below must be awaited
+        embedding = self.embedding_service.embed_query(text=query)
+
+        docs_and_scores = (
+            await self.amax_marginal_relevance_search_with_score_by_vector(
+                embedding=embedding,
+                k=k,
+                fetch_k=fetch_k,
+                lambda_mult=lambda_mult,
+                filter=filter,
+            )
+        )
+        return [doc for doc, _ in docs_and_scores]
+
+    async def amax_marginal_relevance_search_with_score_by_vector(
+        self,
+        embedding: List[float],
+        k: int = 4,
+        fetch_k: int = 20,
+        lambda_mult: float = 0.5,
+        filter: str = None,
+    ) -> List[Tuple[Document, float]]:
+        results = await self.__query_collection(
+            embedding=embedding, k=fetch_k, filter=filter
+        )
+        embedding_list = [i[f"{self.embedding_column}"] for i in results]
+
+        mmr_selected = maximal_marginal_relevance(
+            np.array(embedding, dtype=np.float32),
+            embedding_list,
+            k=k,
+            lambda_mult=lambda_mult,
+        )
+
+        candidates = [
+            (
+                Document(
+                    page_content=i[f"{self.content_column}"],
+                    metadata=i["metadata"],
+                ),
+                i["distance"],
+            )
+            for i in results
+        ]
+
+        return [r for i, r in enumerate(candidates) if i in mmr_selected]
+
+    async def _acreate_index(
+        self, index: Union[HNSWIndex, IVFFlatIndex, BruteForce]
+    ):
+        if isinstance(index, BruteForce):
+            return None
+
+        distance = (
+            "l2"
+            if self.distance_strategy == "L2"
+            else "ip" if self.distance_strategy == "INNER" else "cosine"
+        )
+        index_type = "hnsw" if isinstance(index, HNSWIndex) else "ivfflat"
+        if index.partial_indexes:
+            condition = f"WHERE ({' AND '.join(index.partial_indexes)})"
+        else:
+            condition = ""
+
+        if index_type == "hnsw":
+            query = f"CREATE INDEX ON {self.table_name} USING hnsw ({self.embedding_column} vector_{distance}_ops) WITH (m={index.m}, ef_construction={index.ef_construction}) {condition}"
+        else:
+            query = f"CREATE INDEX ON {self.table_name} USING ivfflat ({self.embedding_column} vector_{distance}_ops) WITH (lists={index.lists}) {condition}"
+
+        await self.engine._aexecute_update(query)
+
+    async def _aindex_query_options(
+        self,
+        index_query_options: Union[
+            HNSWIndex.QueryOptions, IVFFlatIndex.QueryOptions
+        ],
+    ):
+        if isinstance(index_query_options, HNSWIndex.QueryOptions):
+            query_options = index_query_options.ef_search
+            query = f"SET hnsw.ef_search = {query_options}"
+        else:
+            query_options = index_query_options.probes
+            query = f"SET ivfflat.probes = {query_options}"
+
+        await self.engine._aexecute_update(query)
+
+    async def areindex(
+        self,
+        index: Union[HNSWIndex, IVFFlatIndex, BruteForce],
+        index_name: Optional[str] = None,
+    ):
+        if index_name:
+            query = f"REINDEX INDEX {index_name}"
+            await self.engine._aexecute_update(query)
+        else:
+            await self._acreate_index(index)
+
+    async def adrop_index(self):
+        query = f"SELECT indexname, indexdef FROM pg_indexes WHERE tablename='{self.table_name}'"
+        current_index = await self.engine._aexecute_fetch(query)
+        index_def = current_index[0]["indexdef"]
+        if "hnsw" in index_def or "ivfflat" in index_def:
+            index_name = current_index[0]["indexname"]
+            query = f"DROP INDEX {index_name}"
+            await self.engine._aexecute_update(query)
+        else:
+            raise ValueError("Cannot drop Index")
+
+    async def aset_index_query_options(
+        self, distance_strategy, index_query_options
+    ):
+        self.distance_strategy = distance_strategy
+        self.index_query_options = index_query_options
+        await self._aindex_query_options(index_query_options)
+
+
+class BruteForce:
+    def __init__(self, distance_strategy: str = "L2"):
+        self.distance_strategy = distance_strategy
+
+
+class HNSWIndex:
+    def __init__(
+        self,
+        name: str = "LangChainHNSWIndex",
+        m: int = 16,
+        ef_construction: int = 64,
+        partial_indexes: Optional[List[str]] = None,
+        distance_strategy: str = "L2",
+    ):
+        self.name = name
+        self.m = m
+        self.ef_construction = ef_construction
+        self.partial_indexes = partial_indexes or []
+        self.distance_strategy = distance_strategy
+
+    class QueryOptions:
+        def __init__(self, ef_search):
+            self.ef_search = ef_search
+
+
+class IVFFlatIndex:
+    def __init__(
+        self,
+        name: str = "LangChainIVFFlatIndex",
+        lists: int = 1,
+        partial_indexes: Optional[List[str]] = None,
+        distance_strategy: str = "L2",
+    ):
+        self.name = name
+        self.lists = lists
+        self.partial_indexes = partial_indexes or []
+        self.distance_strategy = distance_strategy
+
+    class QueryOptions:
+        def __init__(self, probes):
+            self.probes = probes
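+
+# Example index setup (a sketch; parameter values are illustrative, not
+# tuned, and `store` is an existing CloudSQLVectorStore):
+#
+#     index = HNSWIndex(m=16, ef_construction=64)
+#     await store.areindex(index)
+#     await store.aset_index_query_options(
+#         "L2", HNSWIndex.QueryOptions(ef_search=40)
+#     )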
diff --git a/tests/test_cloudSQL.py b/tests/test_cloudSQL.py
index 8800da6..557e62f 100644
--- a/tests/test_cloudSQL.py
+++ b/tests/test_cloudSQL.py
@@ -1,31 +1,33 @@
-"""Test cloudSQLVectorStore functionality."""
+"""Test CloudSQLVectorStore functionality."""
+
 import os
 from typing import List
 
+from langchain_community.embeddings import FakeEmbeddings
 from langchain_core.documents import Document
-from langchain_community.vectorstores.cloudSQL import cloudSQLVectorStore
-from langchain_community.vectorstores.cloudSQL import cloudSQLEngine
-from langchain_community.embeddings import FakeEmbeddings
+from langchain_google_cloud_sql_pg import CloudSQLVectorStore, PostgreSQLEngine
 
 # from tests.integration_tests.vectorstores.fake_embeddings import FakeEmbeddings
 
-engine = cloudSQLEngine.from_instance(
-    project_id = os.environ.get("PROJECT_ID", None),
-    instance = os.environ.get("INSTANCE_NAME"),
-    region = os.environ.get("REGION_NAME"),
-    database = os.environ.get("DATABASE_NAME")
+engine = PostgreSQLEngine.from_instance(
+    project_id=os.environ.get("PROJECT_ID", None),
+    instance=os.environ.get("INSTANCE_NAME"),
+    region=os.environ.get("REGION_NAME"),
+    database=os.environ.get("DATABASE_NAME"),
 )
 
 ADA_TOKEN_COUNT = 1536
 
+
 class FakeEmbeddingsWithAdaDimension(FakeEmbeddings):
     """Fake embeddings functionality for testing."""
 
     def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Return simple embeddings."""
         return [
-            [float(1.0)] * (ADA_TOKEN_COUNT - 1) + [float(i)] for i in range(len(texts))
+            [float(1.0)] * (ADA_TOKEN_COUNT - 1) + [float(i)]
+            for i in range(len(texts))
         ]
 
     def embed_query(self, text: str) -> List[float]:
@@ -33,10 +35,10 @@ def embed_query(self, text: str) -> List[float]:
         return [float(1.0)] * (ADA_TOKEN_COUNT - 1) + [float(0.0)]
 
 
-async def test_cloudSQLVectorStore() -> None:
+async def test_CloudSQLVectorStore() -> None:
     """Test end to end construction and search."""
     texts = ["foo", "bar", "baz"]
-    docsearch = await cloudSQL.afrom_texts(
+    docsearch = await CloudSQLVectorStore.afrom_texts(
         texts=texts,
         table_name="test_table",
         embedding=FakeEmbeddingsWithAdaDimension(),
@@ -46,12 +48,12 @@ async def test_cloudSQLVectorStore() -> None:
     assert output == [Document(page_content="foo")]
 
 
-async def test_cloudSQLVectorStore_embeddings() -> None:
+async def test_CloudSQLVectorStore_embeddings() -> None:
     """Test end to end construction with embeddings and search."""
     texts = ["foo", "bar", "baz"]
     text_embeddings = FakeEmbeddingsWithAdaDimension().embed_documents(texts)
     text_embedding_pairs = list(zip(texts, text_embeddings))
-    docsearch = cloudSQLVectorStore.afrom_embeddings(
+    docsearch = await CloudSQLVectorStore.afrom_embeddings(
         text_embeddings=text_embedding_pairs,
         table_name="test_table",
         embedding=FakeEmbeddingsWithAdaDimension(),
@@ -61,11 +63,11 @@ async def test_cloudSQLVectorStore_embeddings() -> None:
     assert output == [Document(page_content="foo")]
 
 
-async def test_cloudSQLVectorStore_with_metadatas() -> None:
+async def test_CloudSQLVectorStore_with_metadatas() -> None:
     """Test end to end construction and search."""
     texts = ["foo", "bar", "baz"]
     metadatas = [{"page": str(i)} for i in range(len(texts))]
-    docsearch = await cloudSQLVectorStore.afrom_texts(
+    docsearch = await CloudSQLVectorStore.afrom_texts(
         texts=texts,
         table_name="test_table",
         embedding=FakeEmbeddingsWithAdaDimension(),
@@ -76,11 +78,11 @@ async def test_cloudSQLVectorStore_with_metadatas() -> None:
     assert output == [Document(page_content="foo", metadata={"page": "0"})]
 
 
-async def test_cloudSQLVectorStore_with_metadatas_with_scores() -> None:
+async def test_CloudSQLVectorStore_with_metadatas_with_scores() -> None:
     """Test end to end construction and search."""
     texts = ["foo", "bar", "baz"]
     metadatas = [{"page": str(i)} for i in range(len(texts))]
-    docsearch = await cloudSQLVectorStore.afrom_texts(
+    docsearch = await CloudSQLVectorStore.afrom_texts(
         texts=texts,
         table_name="test_table",
         embedding=FakeEmbeddingsWithAdaDimension(),
@@ -88,60 +90,74 @@ async def test_cloudSQLVectorStore_with_metadatas_with_scores() -> None:
         engine=engine,
     )
     output = await docsearch.asimilarity_search_with_score("foo", k=1)
-    assert output == [(Document(page_content="foo", metadata={"page": "0"}), 0.0)]
+    assert output == [
+        (Document(page_content="foo", metadata={"page": "0"}), 0.0)
+    ]
 
 
-async def test_cloudSQLVectorStore_with_filter_match() -> None:
+async def test_CloudSQLVectorStore_with_filter_match() -> None:
     """Test end to end construction and search."""
     texts = ["foo", "bar", "baz"]
     metadatas = [{"page": str(i)} for i in range(len(texts))]
-    docsearch = await cloudSQLVectorStore.afrom_texts(
+    docsearch = await CloudSQLVectorStore.afrom_texts(
         texts=texts,
-        collection_name="test_collection_filter",
+        table_name="test_collection_filter",
         embedding=FakeEmbeddingsWithAdaDimension(),
         metadatas=metadatas,
         engine=engine,
     )
-    output = await docsearch.asimilarity_search_with_score("foo", k=1, filter={"page": "0"})
-    assert output == [(Document(page_content="foo", metadata={"page": "0"}), 0.0)]
+    output = await docsearch.asimilarity_search_with_score(
+        "foo", k=1, filter={"page": "0"}
+    )
+    assert output == [
+        (Document(page_content="foo", metadata={"page": "0"}), 0.0)
+    ]
 
 
-async def test_cloudSQLVectorStore_with_filter_distant_match() -> None:
+async def test_CloudSQLVectorStore_with_filter_distant_match() -> None:
     """Test end to end construction and search."""
     texts = ["foo", "bar", "baz"]
     metadatas = [{"page": str(i)} for i in range(len(texts))]
-    docsearch = await cloudSQLVectorStore.afrom_texts(
+    docsearch = await CloudSQLVectorStore.afrom_texts(
         texts=texts,
         table_name="test_table",
         embedding=FakeEmbeddingsWithAdaDimension(),
         metadatas=metadatas,
         engine=engine,
     )
-    output = await docsearch.asimilarity_search_with_score("foo", k=1, filter={"page": "2"})
+    output = await docsearch.asimilarity_search_with_score(
+        "foo", k=1, filter={"page": "2"}
+    )
     assert output == [
-        (Document(page_content="baz", metadata={"page": "2"}), 0.0013003906671379406)
+        (
+            Document(page_content="baz", metadata={"page": "2"}),
+            0.0013003906671379406,
+        )
     ]
 
 
-async def test_cloudSQLVectorStore_with_filter_no_match() -> None:
+async def test_CloudSQLVectorStore_with_filter_no_match() -> None:
     """Test end to end construction and search."""
     texts = ["foo", "bar", "baz"]
     metadatas = [{"page": str(i)} for i in range(len(texts))]
-    docsearch = await cloudSQLVectorStore.afrom_texts(
+    docsearch = await CloudSQLVectorStore.afrom_texts(
         texts=texts,
         table_name="test_table",
        embedding=FakeEmbeddingsWithAdaDimension(),
         metadatas=metadatas,
         engine=engine,
     )
-    output = await docsearch.asimilarity_search_with_score("foo", k=1, filter={"page": "5"})
+    output = await docsearch.asimilarity_search_with_score(
+        "foo", k=1, filter={"page": "5"}
+    )
     assert output == []
 
+
-async def test_cloudSQLVectorStore_relevance_score() -> None:
+async def test_CloudSQLVectorStore_relevance_score() -> None:
     """Test to make sure the relevance score is scaled to 0-1."""
     texts = ["foo", "bar", "baz"]
     metadatas = [{"page": str(i)} for i in range(len(texts))]
-    docsearch = await cloudSQLVectorStore.from_texts(
+    docsearch = await CloudSQLVectorStore.afrom_texts(
         texts=texts,
         table_name="test_table",
         embedding=FakeEmbeddingsWithAdaDimension(),
@@ -149,35 +165,49 @@ async def test_cloudSQLVectorStore_relevance_score() -> None:
         engine=engine,
     )
 
-    output = await docsearch.asimilarity_search_with_relevance_scores("foo", k=3)
+    output = await docsearch.asimilarity_search_with_relevance_scores(
+        "foo", k=3
+    )
     assert output == [
         (Document(page_content="foo", metadata={"page": "0"}), 1.0),
-        (Document(page_content="bar", metadata={"page": "1"}), 0.9996744261675065),
-        (Document(page_content="baz", metadata={"page": "2"}), 0.9986996093328621),
+        (
+            Document(page_content="bar", metadata={"page": "1"}),
+            0.9996744261675065,
+        ),
+        (
+            Document(page_content="baz", metadata={"page": "2"}),
+            0.9986996093328621,
+        ),
     ]
 
 
-async def test_cloudSQLVectorStore_max_marginal_relevance_search() -> None:
+async def test_CloudSQLVectorStore_max_marginal_relevance_search() -> None:
     """Test max marginal relevance search."""
     texts = ["foo", "bar", "baz"]
-    docsearch = await cloudSQLVectorStore.afrom_texts(
+    docsearch = await CloudSQLVectorStore.afrom_texts(
         texts=texts,
         table_name="test_table",
         embedding=FakeEmbeddingsWithAdaDimension(),
         engine=engine,
     )
-    output = await docsearch.max_marginal_relevance_search("foo", k=1, fetch_k=3)
+    output = await docsearch.amax_marginal_relevance_search(
+        "foo", k=1, fetch_k=3
+    )
     assert output == [Document(page_content="foo")]
 
 
-async def test_cloudSQLVectorStore_max_marginal_relevance_search_with_score() -> None:
+async def test_CloudSQLVectorStore_max_marginal_relevance_search_with_score() -> (
+    None
+):
     """Test max marginal relevance search with relevance scores."""
     texts = ["foo", "bar", "baz"]
-    docsearch = await cloudSQLVectorStore.afrom_texts(
+    docsearch = await CloudSQLVectorStore.afrom_texts(
         texts=texts,
-        collection_name="test_table",
+        table_name="test_table",
         embedding=FakeEmbeddingsWithAdaDimension(),
         engine=engine,
     )
-    output = await docsearch.amax_marginal_relevance_search_with_score("foo", k=1, fetch_k=3)
-    assert output == [(Document(page_content="foo"), 0.0)]
\ No newline at end of file
+    output = await docsearch.amax_marginal_relevance_search_with_score(
+        "foo", k=1, fetch_k=3
+    )
+    assert output == [(Document(page_content="foo"), 0.0)]

From 4c08bbbac1161a8b9fc1c4c6923f107d46cafc03 Mon Sep 17 00:00:00 2001
From: Averi Kitsch
Date: Thu, 8 Feb 2024 20:22:51 -0800
Subject: [PATCH 3/9] add features and doc

---
 docs/vector_store.ipynb                       | 710 ++++++++++++++++--
 pyproject.toml                                |   4 +-
 src/langchain_google_cloud_sql_pg/__init__.py |   4 +-
 .../cloudsql_vectorstore.py                   | 652 ++++++++++++++++
 src/langchain_google_cloud_sql_pg/indexes.py  |  87 +++
 .../postgresql_engine.py                      | 159 +++-
 .../vectorstore.py                            | 602 ---------------
 tests/test_cloudSQL.py                        | 213 ------
 tests/test_cloudsql_vectorstore.py            | 636 ++++++++++++++++
 9 files changed, 2145 insertions(+), 922 deletions(-)
 create mode 100644 src/langchain_google_cloud_sql_pg/cloudsql_vectorstore.py
 create mode 100644 src/langchain_google_cloud_sql_pg/indexes.py
 delete mode 100644 src/langchain_google_cloud_sql_pg/vectorstore.py
 delete mode 100644 tests/test_cloudSQL.py
 create mode 100644 tests/test_cloudsql_vectorstore.py

diff --git a/docs/vector_store.ipynb b/docs/vector_store.ipynb
index 8b1a4cf..70f5b71 100644
--- a/docs/vector_store.ipynb
+++ b/docs/vector_store.ipynb
@@ -1,79 +1,657 @@
 {
- "cells": [
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "# Google DATABASE\n",
-    "\n",
-    "[Google DATABASE](https://cloud.google.com/DATABASE).\n",
-    "\n",
-    "Save chat messages into `DATABASE`."
- ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Pre-reqs" - ] + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "E_RJy7C1bpCT" + }, + "source": [ + "# CloudSQLVectorStore\n", + "> **CloudSQLVectorStore**:\n", + "CloudSQLVectorStore lets you create vector stores on the Cloud SQL for PostgreSQL database. It also allows for semantic search, using vector indexes for fast approximate results, or using brute force for exact results.\n", + "\n", + "\n", + "This tutorial illustrates how to work with an end-to-end data and embedding management system in LangChain, and provide scalable semantic search in CloudSQL for PostgreSQL." + ] + }, + { + "cell_type": "markdown", + "source": [ + "###Pre-requisites" + ], + "metadata": { + "id": "xjcxaw6--Xyy" + } + }, + { + "cell_type": "markdown", + "metadata": { + "id": "IR54BmgvdHT_" + }, + "source": [ + "### Install the library" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000 }, + "id": "0ZITIDE160OD", + "outputId": "90e0636e-ff34-4e1e-ad37-d2a6db4a317e" + }, + "outputs": [ { - "cell_type": "code", - "execution_count": null, - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "%pip install PACKAGE_NAME" - ] + "output_type": "stream", + "name": "stdout", + "text": [ + "Collecting langchain\n", + " Downloading langchain-0.1.5-py3-none-any.whl (806 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m806.7/806.7 kB\u001b[0m \u001b[31m11.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting langchain-community\n", + " Downloading langchain_community-0.0.18-py3-none-any.whl (1.6 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.6/1.6 MB\u001b[0m \u001b[31m39.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting google-cloud\n", + " Downloading google_cloud-0.34.0-py2.py3-none-any.whl (1.8 kB)\n", + "Requirement already satisfied: google-cloud-aiplatform in /usr/local/lib/python3.10/dist-packages (1.39.0)\n", + "Collecting google-cloud-aiplatform\n", + " Downloading google_cloud_aiplatform-1.40.0-py2.py3-none-any.whl (3.4 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m3.4/3.4 MB\u001b[0m \u001b[31m56.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting asyncio\n", + " Downloading asyncio-3.4.3-py3-none-any.whl (101 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m101.8/101.8 kB\u001b[0m \u001b[31m12.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting asyncpg\n", + " Downloading asyncpg-0.29.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.7 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.7/2.7 MB\u001b[0m \u001b[31m54.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: PyYAML>=5.3 in /usr/local/lib/python3.10/dist-packages (from langchain) (6.0.1)\n", + "Requirement already satisfied: SQLAlchemy<3,>=1.4 in /usr/local/lib/python3.10/dist-packages (from langchain) (2.0.25)\n", + "Requirement already satisfied: aiohttp<4.0.0,>=3.8.3 in /usr/local/lib/python3.10/dist-packages (from langchain) (3.9.3)\n", + "Requirement already satisfied: async-timeout<5.0.0,>=4.0.0 in /usr/local/lib/python3.10/dist-packages (from langchain) (4.0.3)\n", + "Collecting 
dataclasses-json<0.7,>=0.5.7 (from langchain)\n", + " Downloading dataclasses_json-0.6.4-py3-none-any.whl (28 kB)\n", + "Collecting jsonpatch<2.0,>=1.33 (from langchain)\n", + " Downloading jsonpatch-1.33-py2.py3-none-any.whl (12 kB)\n", + "Collecting langchain-core<0.2,>=0.1.16 (from langchain)\n", + " Downloading langchain_core-0.1.19-py3-none-any.whl (238 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m238.5/238.5 kB\u001b[0m \u001b[31m30.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting langsmith<0.1,>=0.0.83 (from langchain)\n", + " Downloading langsmith-0.0.87-py3-none-any.whl (55 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m55.4/55.4 kB\u001b[0m \u001b[31m7.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: numpy<2,>=1 in /usr/local/lib/python3.10/dist-packages (from langchain) (1.23.5)\n", + "Requirement already satisfied: pydantic<3,>=1 in /usr/local/lib/python3.10/dist-packages (from langchain) (2.6.0)\n", + "Requirement already satisfied: requests<3,>=2 in /usr/local/lib/python3.10/dist-packages (from langchain) (2.31.0)\n", + "Requirement already satisfied: tenacity<9.0.0,>=8.1.0 in /usr/local/lib/python3.10/dist-packages (from langchain) (8.2.3)\n", + "Requirement already satisfied: google-api-core[grpc]!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.*,!=2.4.*,!=2.5.*,!=2.6.*,!=2.7.*,<3.0.0dev,>=1.32.0 in /usr/local/lib/python3.10/dist-packages (from google-cloud-aiplatform) (2.11.1)\n", + "Requirement already satisfied: proto-plus<2.0.0dev,>=1.22.0 in /usr/local/lib/python3.10/dist-packages (from google-cloud-aiplatform) (1.23.0)\n", + "Requirement already satisfied: protobuf!=3.20.0,!=3.20.1,!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<5.0.0dev,>=3.19.5 in /usr/local/lib/python3.10/dist-packages (from google-cloud-aiplatform) (3.20.3)\n", + "Requirement already satisfied: packaging>=14.3 in /usr/local/lib/python3.10/dist-packages (from google-cloud-aiplatform) (23.2)\n", + "Requirement already satisfied: google-cloud-storage<3.0.0dev,>=1.32.0 in /usr/local/lib/python3.10/dist-packages (from google-cloud-aiplatform) (2.8.0)\n", + "Requirement already satisfied: google-cloud-bigquery<4.0.0dev,>=1.15.0 in /usr/local/lib/python3.10/dist-packages (from google-cloud-aiplatform) (3.12.0)\n", + "Requirement already satisfied: google-cloud-resource-manager<3.0.0dev,>=1.3.3 in /usr/local/lib/python3.10/dist-packages (from google-cloud-aiplatform) (1.12.0)\n", + "Requirement already satisfied: shapely<3.0.0dev in /usr/local/lib/python3.10/dist-packages (from google-cloud-aiplatform) (2.0.2)\n", + "Requirement already satisfied: aiosignal>=1.1.2 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain) (1.3.1)\n", + "Requirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain) (23.2.0)\n", + "Requirement already satisfied: frozenlist>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain) (1.4.1)\n", + "Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain) (6.0.5)\n", + "Requirement already satisfied: yarl<2.0,>=1.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain) (1.9.4)\n", + "Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain)\n", + " Downloading 
marshmallow-3.20.2-py3-none-any.whl (49 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m49.4/49.4 kB\u001b[0m \u001b[31m5.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting typing-inspect<1,>=0.4.0 (from dataclasses-json<0.7,>=0.5.7->langchain)\n", + " Downloading typing_inspect-0.9.0-py3-none-any.whl (8.8 kB)\n", + "Requirement already satisfied: googleapis-common-protos<2.0.dev0,>=1.56.2 in /usr/local/lib/python3.10/dist-packages (from google-api-core[grpc]!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.*,!=2.4.*,!=2.5.*,!=2.6.*,!=2.7.*,<3.0.0dev,>=1.32.0->google-cloud-aiplatform) (1.62.0)\n", + "Requirement already satisfied: google-auth<3.0.dev0,>=2.14.1 in /usr/local/lib/python3.10/dist-packages (from google-api-core[grpc]!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.*,!=2.4.*,!=2.5.*,!=2.6.*,!=2.7.*,<3.0.0dev,>=1.32.0->google-cloud-aiplatform) (2.17.3)\n", + "Requirement already satisfied: grpcio<2.0dev,>=1.33.2 in /usr/local/lib/python3.10/dist-packages (from google-api-core[grpc]!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.*,!=2.4.*,!=2.5.*,!=2.6.*,!=2.7.*,<3.0.0dev,>=1.32.0->google-cloud-aiplatform) (1.60.1)\n", + "Requirement already satisfied: grpcio-status<2.0.dev0,>=1.33.2 in /usr/local/lib/python3.10/dist-packages (from google-api-core[grpc]!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.*,!=2.4.*,!=2.5.*,!=2.6.*,!=2.7.*,<3.0.0dev,>=1.32.0->google-cloud-aiplatform) (1.48.2)\n", + "Requirement already satisfied: google-cloud-core<3.0.0dev,>=1.6.0 in /usr/local/lib/python3.10/dist-packages (from google-cloud-bigquery<4.0.0dev,>=1.15.0->google-cloud-aiplatform) (2.3.3)\n", + "Requirement already satisfied: google-resumable-media<3.0dev,>=0.6.0 in /usr/local/lib/python3.10/dist-packages (from google-cloud-bigquery<4.0.0dev,>=1.15.0->google-cloud-aiplatform) (2.7.0)\n", + "Requirement already satisfied: python-dateutil<3.0dev,>=2.7.2 in /usr/local/lib/python3.10/dist-packages (from google-cloud-bigquery<4.0.0dev,>=1.15.0->google-cloud-aiplatform) (2.8.2)\n", + "Requirement already satisfied: grpc-google-iam-v1<1.0.0dev,>=0.12.4 in /usr/local/lib/python3.10/dist-packages (from google-cloud-resource-manager<3.0.0dev,>=1.3.3->google-cloud-aiplatform) (0.13.0)\n", + "Collecting jsonpointer>=1.9 (from jsonpatch<2.0,>=1.33->langchain)\n", + " Downloading jsonpointer-2.4-py2.py3-none-any.whl (7.8 kB)\n", + "Requirement already satisfied: anyio<5,>=3 in /usr/local/lib/python3.10/dist-packages (from langchain-core<0.2,>=0.1.16->langchain) (3.7.1)\n", + "Requirement already satisfied: annotated-types>=0.4.0 in /usr/local/lib/python3.10/dist-packages (from pydantic<3,>=1->langchain) (0.6.0)\n", + "Requirement already satisfied: pydantic-core==2.16.1 in /usr/local/lib/python3.10/dist-packages (from pydantic<3,>=1->langchain) (2.16.1)\n", + "Requirement already satisfied: typing-extensions>=4.6.1 in /usr/local/lib/python3.10/dist-packages (from pydantic<3,>=1->langchain) (4.9.0)\n", + "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests<3,>=2->langchain) (3.3.2)\n", + "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests<3,>=2->langchain) (3.6)\n", + "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests<3,>=2->langchain) (2.0.7)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests<3,>=2->langchain) (2024.2.2)\n", + "Requirement already satisfied: greenlet!=0.4.17 in 
/usr/local/lib/python3.10/dist-packages (from SQLAlchemy<3,>=1.4->langchain) (3.0.3)\n", + "Requirement already satisfied: sniffio>=1.1 in /usr/local/lib/python3.10/dist-packages (from anyio<5,>=3->langchain-core<0.2,>=0.1.16->langchain) (1.3.0)\n", + "Requirement already satisfied: exceptiongroup in /usr/local/lib/python3.10/dist-packages (from anyio<5,>=3->langchain-core<0.2,>=0.1.16->langchain) (1.2.0)\n", + "Requirement already satisfied: cachetools<6.0,>=2.0.0 in /usr/local/lib/python3.10/dist-packages (from google-auth<3.0.dev0,>=2.14.1->google-api-core[grpc]!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.*,!=2.4.*,!=2.5.*,!=2.6.*,!=2.7.*,<3.0.0dev,>=1.32.0->google-cloud-aiplatform) (5.3.2)\n", + "Requirement already satisfied: pyasn1-modules>=0.2.1 in /usr/local/lib/python3.10/dist-packages (from google-auth<3.0.dev0,>=2.14.1->google-api-core[grpc]!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.*,!=2.4.*,!=2.5.*,!=2.6.*,!=2.7.*,<3.0.0dev,>=1.32.0->google-cloud-aiplatform) (0.3.0)\n", + "Requirement already satisfied: six>=1.9.0 in /usr/local/lib/python3.10/dist-packages (from google-auth<3.0.dev0,>=2.14.1->google-api-core[grpc]!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.*,!=2.4.*,!=2.5.*,!=2.6.*,!=2.7.*,<3.0.0dev,>=1.32.0->google-cloud-aiplatform) (1.16.0)\n", + "Requirement already satisfied: rsa<5,>=3.1.4 in /usr/local/lib/python3.10/dist-packages (from google-auth<3.0.dev0,>=2.14.1->google-api-core[grpc]!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.*,!=2.4.*,!=2.5.*,!=2.6.*,!=2.7.*,<3.0.0dev,>=1.32.0->google-cloud-aiplatform) (4.9)\n", + "Requirement already satisfied: google-crc32c<2.0dev,>=1.0 in /usr/local/lib/python3.10/dist-packages (from google-resumable-media<3.0dev,>=0.6.0->google-cloud-bigquery<4.0.0dev,>=1.15.0->google-cloud-aiplatform) (1.5.0)\n", + "Collecting mypy-extensions>=0.3.0 (from typing-inspect<1,>=0.4.0->dataclasses-json<0.7,>=0.5.7->langchain)\n", + " Downloading mypy_extensions-1.0.0-py3-none-any.whl (4.7 kB)\n", + "Requirement already satisfied: pyasn1<0.6.0,>=0.4.6 in /usr/local/lib/python3.10/dist-packages (from pyasn1-modules>=0.2.1->google-auth<3.0.dev0,>=2.14.1->google-api-core[grpc]!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.*,!=2.4.*,!=2.5.*,!=2.6.*,!=2.7.*,<3.0.0dev,>=1.32.0->google-cloud-aiplatform) (0.5.1)\n", + "Installing collected packages: google-cloud, asyncio, mypy-extensions, marshmallow, jsonpointer, asyncpg, typing-inspect, jsonpatch, langsmith, dataclasses-json, langchain-core, langchain-community, langchain, google-cloud-aiplatform\n", + "\u001b[33m WARNING: The script langsmith is installed in '/root/.local/bin' which is not on PATH.\n", + " Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\u001b[0m\u001b[33m\n", + "\u001b[0m\u001b[33m WARNING: The script langchain-server is installed in '/root/.local/bin' which is not on PATH.\n", + " Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\u001b[0m\u001b[33m\n", + "\u001b[0m\u001b[33m WARNING: The script tb-gcp-uploader is installed in '/root/.local/bin' which is not on PATH.\n", + " Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\u001b[0m\u001b[33m\n", + "\u001b[0mSuccessfully installed asyncio-3.4.3 asyncpg-0.29.0 dataclasses-json-0.6.4 google-cloud-0.34.0 google-cloud-aiplatform-1.40.0 jsonpatch-1.33 jsonpointer-2.4 langchain-0.1.5 langchain-community-0.0.18 langchain-core-0.1.19 langsmith-0.0.87 marshmallow-3.20.2 mypy-extensions-1.0.0 typing-inspect-0.9.0\n" + ] }, { - 
"cell_type": "code", - "execution_count": 3, - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "from PACKAGE import LOADER" - ] + "output_type": "display_data", + "data": { + "application/vnd.colab-display-data+json": { + "pip_warning": { + "packages": [ + "asyncio", + "google" + ] + } + } + }, + "metadata": {} }, { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Basic Usage" - ] + "output_type": "stream", + "name": "stdout", + "text": [ + "Collecting cloud-sql-python-connector[asyncpg]\n", + " Downloading cloud_sql_python_connector-1.6.0-py2.py3-none-any.whl (35 kB)\n", + "Requirement already satisfied: aiohttp in /usr/local/lib/python3.10/dist-packages (from cloud-sql-python-connector[asyncpg]) (3.9.3)\n", + "Requirement already satisfied: cryptography>=38.0.3 in /usr/local/lib/python3.10/dist-packages (from cloud-sql-python-connector[asyncpg]) (42.0.2)\n", + "Requirement already satisfied: Requests in /usr/local/lib/python3.10/dist-packages (from cloud-sql-python-connector[asyncpg]) (2.31.0)\n", + "Requirement already satisfied: google-auth in /usr/local/lib/python3.10/dist-packages (from cloud-sql-python-connector[asyncpg]) (2.17.3)\n", + "Requirement already satisfied: asyncpg>=0.29.0 in /root/.local/lib/python3.10/site-packages (from cloud-sql-python-connector[asyncpg]) (0.29.0)\n", + "Requirement already satisfied: async-timeout>=4.0.3 in /usr/local/lib/python3.10/dist-packages (from asyncpg>=0.29.0->cloud-sql-python-connector[asyncpg]) (4.0.3)\n", + "Requirement already satisfied: cffi>=1.12 in /usr/local/lib/python3.10/dist-packages (from cryptography>=38.0.3->cloud-sql-python-connector[asyncpg]) (1.16.0)\n", + "Requirement already satisfied: aiosignal>=1.1.2 in /usr/local/lib/python3.10/dist-packages (from aiohttp->cloud-sql-python-connector[asyncpg]) (1.3.1)\n", + "Requirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->cloud-sql-python-connector[asyncpg]) (23.2.0)\n", + "Requirement already satisfied: frozenlist>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from aiohttp->cloud-sql-python-connector[asyncpg]) (1.4.1)\n", + "Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.10/dist-packages (from aiohttp->cloud-sql-python-connector[asyncpg]) (6.0.5)\n", + "Requirement already satisfied: yarl<2.0,>=1.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->cloud-sql-python-connector[asyncpg]) (1.9.4)\n", + "Requirement already satisfied: cachetools<6.0,>=2.0.0 in /usr/local/lib/python3.10/dist-packages (from google-auth->cloud-sql-python-connector[asyncpg]) (5.3.2)\n", + "Requirement already satisfied: pyasn1-modules>=0.2.1 in /usr/local/lib/python3.10/dist-packages (from google-auth->cloud-sql-python-connector[asyncpg]) (0.3.0)\n", + "Requirement already satisfied: six>=1.9.0 in /usr/local/lib/python3.10/dist-packages (from google-auth->cloud-sql-python-connector[asyncpg]) (1.16.0)\n", + "Requirement already satisfied: rsa<5,>=3.1.4 in /usr/local/lib/python3.10/dist-packages (from google-auth->cloud-sql-python-connector[asyncpg]) (4.9)\n", + "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from Requests->cloud-sql-python-connector[asyncpg]) (3.3.2)\n", + "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from Requests->cloud-sql-python-connector[asyncpg]) (3.6)\n", + "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from 
Requests->cloud-sql-python-connector[asyncpg]) (2.0.7)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from Requests->cloud-sql-python-connector[asyncpg]) (2024.2.2)\n", + "Requirement already satisfied: pycparser in /usr/local/lib/python3.10/dist-packages (from cffi>=1.12->cryptography>=38.0.3->cloud-sql-python-connector[asyncpg]) (2.21)\n", + "Requirement already satisfied: pyasn1<0.6.0,>=0.4.6 in /usr/local/lib/python3.10/dist-packages (from pyasn1-modules>=0.2.1->google-auth->cloud-sql-python-connector[asyncpg]) (0.5.1)\n", + "Installing collected packages: cloud-sql-python-connector\n", + "Successfully installed cloud-sql-python-connector-1.6.0\n" + ] }, { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] + "output_type": "display_data", + "data": { + "application/vnd.colab-display-data+json": { + "pip_warning": { + "packages": [ + "google" + ] + } + } + }, + "metadata": {} } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" + ], + "source": [ + "! pip install langchain langchain-community google-cloud google-cloud-aiplatform asyncio asyncpg --upgrade --user\n", + "! pip install \"cloud-sql-python-connector[asyncpg]\"" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "v40bB_GMcr9f" + }, + "source": [ + ":**Colab only:** Uncomment the following cell to restart the kernel or use the button to restart the kernel. For Vertex AI Workbench you can restart the terminal using the button on top." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "6o0iGVIdDD6K" + }, + "outputs": [], + "source": [ + "# # Automatically restart kernel after installs so that your environment can access the new packages\n", + "# import IPython\n", + "\n", + "# app = IPython.Application.instance()\n", + "# app.kernel.do_shutdown(True)" + ] + }, + { + "cell_type": "markdown", + "source": [ + "###Note\n", + "\n", + "`If you do not have a GCP project, please follow the below link to create a new project`\n", + "\n", + "[Create a Google Cloud project](https://developers.google.com/workspace/guides/create-project)\n" + ], + "metadata": { + "id": "cTXTbj4UltKf" + } + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Uj02bMRAc9_c" + }, + "source": [ + "#### Set your project ID\n", + "\n", + "If you don't know your project ID, try the following:\n", + "* Run `gcloud config list`.\n", + "* Run `gcloud projects list`.\n", + "* See the support page: [Locate the project ID](https://support.google.com/googleapi/answer/7014113)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.6" + "id": "wnp1R1PYc9_c", + "outputId": "6502c721-a2fd-451f-b946-9f7b850d5966" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Updated property [core/project].\n" + ] } + ], + "source": [ + "# @title Project { display-mode: \"form\" }\n", + "PROJECT_ID = \"gcp_project_id\" # @param {type:\"string\"}\n", + "\n", + "# Set the project id\n", + "! 
gcloud config set project {PROJECT_ID}"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "38OFiUrIc9_c"
+   },
+   "source": [
+    "#### Set the region\n",
+    "\n",
+    "You can also change the `REGION` variable used by CloudSQL Postgres. Learn more about [CloudSQL Postgres regions](https://cloud.google.com/sql/docs/postgres/locations)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "DWQxsk80c9_d"
+   },
+   "outputs": [],
+   "source": [
+    "# @title Region { display-mode: \"form\" }\n",
+    "REGION = \"us-central1\"  # @param {type: \"string\"}"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "aG5_tNwHc9_d"
+   },
+   "source": [
+    "#### Set the instance, database, and table names\n",
+    "\n",
+    "They will be your CloudSQL Postgres vector store."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {
+    "id": "F8gPQnbDc9_d"
+   },
+   "outputs": [],
+   "source": [
+    "# @title Instance, Database and Table { display-mode: \"form\" }\n",
+    "INSTANCE = \"my_cloudsql_instance\"  # @param {type: \"string\"}\n",
+    "DATABASE = \"my_langchain_database\"  # @param {type: \"string\"}\n",
+    "TABLE = \"doc_and_vectors\"  # @param {type: \"string\"}"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "source": [
+    "### Pre-requisites for connecting to the CloudSQL instance\n",
+    "\n",
+    "To connect to the PostgreSQL instance, make sure to set up the Cloud SQL Auth Proxy and add your IAM users to the list of users authenticated to connect to the instance.\n",
+    "\n",
+    "Refer to this [link](https://github.com/GoogleCloudPlatform/cloud-sql-proxy) to set up the Auth Proxy.\n",
+    "\n",
+    "Refer to this [link](https://cloud.google.com/sql/docs/postgres/users?_ga=2.165429503.-1722697531.1694071937) to add users to the instance."
+   ],
+   "metadata": {
+    "id": "W6wxYasx_EKB"
+   }
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "w7JEEj49c9_d"
+   },
+   "source": [
+    "### Authenticating your notebook environment\n",
+    "\n",
+    "- If you are using **Colab** to run this notebook, uncomment the cell below and continue.\n",
+    "- If you are using **Vertex AI Workbench**, check out the setup instructions [here](https://github.com/GoogleCloudPlatform/generative-ai/tree/main/setup-env)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "1JZmXFavc9_d"
+   },
+   "outputs": [],
+   "source": [
+    "from google.colab import auth as google_auth\n",
+    "\n",
+    "google_auth.authenticate_user()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "AD3yG49BdLlr"
+   },
+   "source": [
+    "## Demo: CloudSQL Postgres VectorSearch"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "vMi7sXhtc9_e"
+   },
+   "source": [
+    "### Create an embedding class instance\n",
+    "\n",
+    "You may need to enable Vertex AI API in your project by running\n",
+    "`gcloud services enable aiplatform.googleapis.com --project {PROJECT_ID}`\n",
+    "(replace `{PROJECT_ID}` with the name of your project).\n",
+    "\n",
+    "You can use any [LangChain embeddings model](https://python.langchain.com/docs/integrations/text_embedding/)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "source": [
+    "# Import the necessary libraries\n",
+    "from langchain_google_cloud_sql_pg import CloudSQLVectorStore, PostgreSQLEngine\n",
+    "from langchain_google_cloud_sql_pg.indexes import HNSWIndex"
+   ],
+   "metadata": {
+    "id": "TuH9AOl58bAs"
+   },
+   "execution_count": null,
+   "outputs": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "Vb2RJocV9_LQ"
+   },
+   "outputs": [],
+   "source": [
+    "from langchain_community.embeddings import VertexAIEmbeddings\n",
+    "\n",
+    "embedding = VertexAIEmbeddings(\n",
+    "    model_name=\"textembedding-gecko@latest\", project=PROJECT_ID\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "source": [
+    "### Create a PostgreSQLEngine to connect to the database"
+   ],
+   "metadata": {
+    "id": "D9Xs2qhm6X56"
+   }
+  },
+  {
+   "cell_type": "code",
+   "source": [
+    "# CloudSQLVectorStore requires an engine created with the PostgreSQLEngine class\n",
+    "engine = PostgreSQLEngine.from_instance(\n",
+    "    region=\"region_name\",\n",
+    "    instance=\"instance_name\",\n",
+    "    database=\"dbname\"\n",
+    ")"
+   ],
+   "metadata": {
+    "id": "avlyHEMn6gzU"
+   },
+   "execution_count": null,
+   "outputs": []
+  },
+  {
+   "cell_type": "markdown",
+   "source": [
+    "### Create a CloudSQLVectorStore to create a table"
+   ],
+   "metadata": {
+    "id": "e1tl0aNx7SWy"
+   }
+  },
+  {
+   "cell_type": "code",
+   "source": [
+    "# Create a basic CloudSQLVectorStore object\n",
+    "db = CloudSQLVectorStore(\n",
+    "    engine=engine,\n",
+    "    table_name='table_name',\n",
+    "    embedding_service=embedding)\n",
+    "\n",
+    "# Alternatively, we can create a non-default vector store by tweaking the following args:\n",
+    "# vector_size - By default it is set to 768. Can be set to the vector size of your embedding model.\n",
+    "# content_column - By default the content column is named 'content'. Can be set to any name of choice.\n",
+    "# embedding_column - By default the embedding column is named 'embedding'. Can be set to any name of choice.\n",
+    "# metadata_columns - By default the metadata column is named 'metadata'. Can be set to any name or list of names of choice.\n",
+    "# ignore_metadata_columns - By default ignore_metadata_columns is None. Can be set to any name or list of names of choice.\n",
+    "# index_query_options - By default index_query_options is None. Can be set using HNSWIndex.QueryOptions() or IVFFlatIndex.QueryOptions().\n",
+    "# index - By default the index is an HNSWIndex object. Can be set to an IVFFlatIndex object or a BruteForce object.\n",
+    "# distance_strategy - By default the distance_strategy is 'L2'. Can be set to 'INNER PRODUCT' or 'COSINE'.\n",
+    "# overwrite_existing - By default overwrite_existing is False. Can be set to True if the table needs to be overwritten."
+   ],
+   "metadata": {
+    "id": "z-AZyzAQ7bsf"
+   },
+   "execution_count": null,
+   "outputs": []
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "PeOMpftjc9_e"
+   },
+   "source": [
+    "### Add texts\n",
+    "This method adds texts and their embeddings into the table"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "cwvi_O5Wc9_e"
+   },
+   "outputs": [],
+   "source": [
+    "texts = [\"Apples and oranges\", \"Cars and airplanes\", \"Pineapple\", \"Train\", \"Banana\"]\n",
+    "metadatas = [{\"len\": len(t)} for t in texts]\n",
+    "await db.aadd_texts(texts=texts, metadatas=metadatas)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "kSkL9l1Hc9_e"
+   },
+   "source": [
+    "### Search for documents\n",
+    "The default distance strategy used for querying similar documents is L2"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "Q4pCL2I_c9_f"
+   },
+   "outputs": [],
+   "source": [
+    "query = \"I'd like a fruit.\"\n",
+    "docs = await db.asimilarity_search(query)\n",
+    "print(docs)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "5R6h0_Cvc9_f"
+   },
+   "source": [
+    "### Search for documents by vector\n",
+    "Search for similar documents by passing the query embedding directly\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "NGNdS7cqc9_f"
+   },
+   "outputs": [],
+   "source": [
+    "query_vector = embedding.embed_query(query)\n",
+    "docs = await db.asimilarity_search_by_vector(query_vector, k=2)\n",
+    "print(docs)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "yKw_Lab-c9_f"
+   },
+   "source": [
+    "### Search for documents with metadata filter\n",
+    "Additional metadata filtering"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "uyYDfbMKc9_f"
+   },
+   "outputs": [],
+   "source": [
+    "# This should only return the \"Banana\" document.\n",
+    "docs = await db.asimilarity_search_by_vector(query_vector, filter={\"len\": 6})\n",
+    "print(docs)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "source": [
+    "### Maximum marginal relevance search (MMR)\n",
+    "Maximal marginal relevance optimizes for similarity to the query AND diversity among the selected documents.\n",
+    "\n"
+   ],
+   "metadata": {
+    "id": "IPhxeqGr7sOS"
+   }
+  },
+  {
+   "cell_type": "code",
+   "source": [
+    "# This should return the top 4 documents relevant to the given query\n",
+    "docs = await db.amax_marginal_relevance_search(query)\n",
+    "print(docs)"
+   ],
+   "metadata": {
+    "id": "zmnGOrTT71BF"
+   },
+   "execution_count": null,
+   "outputs": []
+  },
+  {
+   "cell_type": "markdown",
+   "source": [
+    "### Indexing\n",
+    "Set custom indexes or rebuild existing indexes"
+   ],
+   "metadata": {
+    "id": "_K68SOsq73Tc"
+   }
+  },
+  {
+   "cell_type": "code",
+   "source": [
+    "# areindex returns None once the index is rebuilt or created.\n",
+    "index = HNSWIndex()\n",
+    "await db.areindex(index)"
+   ],
+   "metadata": {
+    "id": "aZdo-WVM77I7"
+   },
+   "execution_count": null,
+   "outputs": []
+  }
+ ],
+ "metadata": {
+  "colab": {
+   "provenance": [],
+   "toc_visible": true
+  },
+  "kernelspec": {
+   "display_name": "Python 3",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.0"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+}
\ No newline at end of file
diff --git 
a/pyproject.toml b/pyproject.toml index 79002b5..6fd3fb2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -7,8 +7,8 @@ license = {file = "LICENSE"} requires-python = ">=3.8" dependencies = [ "langchain==0.1.1", - "SQLAlchemy==2.0.7", - "cloud-sql-python-connector[asyncpg]==1.5.0", + "SQLAlchemy>=2.0.25", + "cloud-sql-python-connector[asyncpg]>=1.6.0", "nest_asyncio", "pgvector" ] diff --git a/src/langchain_google_cloud_sql_pg/__init__.py b/src/langchain_google_cloud_sql_pg/__init__.py index 259f273..7363b32 100644 --- a/src/langchain_google_cloud_sql_pg/__init__.py +++ b/src/langchain_google_cloud_sql_pg/__init__.py @@ -12,7 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. +from langchain_google_cloud_sql_pg.cloudsql_vectorstore import ( + CloudSQLVectorStore, +) from langchain_google_cloud_sql_pg.postgresql_engine import PostgreSQLEngine -from langchain_google_cloud_sql_pg.vectorstore import CloudSQLVectorStore __all__ = ["PostgreSQLEngine", "CloudSQLVectorStore"] diff --git a/src/langchain_google_cloud_sql_pg/cloudsql_vectorstore.py b/src/langchain_google_cloud_sql_pg/cloudsql_vectorstore.py new file mode 100644 index 0000000..829e6fa --- /dev/null +++ b/src/langchain_google_cloud_sql_pg/cloudsql_vectorstore.py @@ -0,0 +1,652 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+# TODO: Remove below import when minimum supported Python version is 3.10
+from __future__ import annotations
+
+import asyncio
+import json
+import uuid
+from typing import Any, Callable, Iterable, List, Optional, Tuple, Type, Union
+
+import nest_asyncio
+import numpy as np
+from langchain_community.vectorstores.utils import maximal_marginal_relevance
+from langchain_core.documents import Document
+from langchain_core.embeddings import Embeddings
+from langchain_core.vectorstores import VectorStore
+from pgvector.sqlalchemy import Vector
+from sqlalchemy import text
+
+from .indexes import (
+    DEFAULT_DISTANCE_STRATEGY,
+    BruteForce,
+    DistanceStrategy,
+    HNSWIndex,
+    IVFFlatIndex,
+)
+from .postgresql_engine import PostgreSQLEngine
+
+nest_asyncio.apply()
+
+
+class CloudSQLVectorStore(VectorStore):
+    """Google Cloud SQL for PostgreSQL vector store class."""
+
+    def __init__(
+        self,
+        engine: PostgreSQLEngine,
+        embedding_service: Embeddings,
+        table_name: str,
+        content_column: str = "content",
+        embedding_column: str = "embedding",
+        metadata_columns: List[str] = [],
+        ignore_metadata_columns: Optional[List[str]] = None,
+        id_column: str = "langchain_id",
+        metadata_json_column: str = "langchain_metadata",
+        index_query_options: Optional[
+            HNSWIndex.QueryOptions | IVFFlatIndex.QueryOptions
+        ] = None,
+        distance_strategy: DistanceStrategy = DEFAULT_DISTANCE_STRATEGY,
+        overwrite_existing: bool = False,
+        k: Optional[int] = None,
+        score_threshold: Optional[float] = None,
+        fetch_k: Optional[int] = None,
+        lambda_mult: Optional[float] = None,
+    ):
+        """Constructor for CloudSQLVectorStore.
+
+        Args:
+            engine (PostgreSQLEngine): Engine with a connection pool to the Postgres database.
+            embedding_service (Embeddings): Text embedding model to use.
+            table_name (str): Name of an existing table to use as the vector store.
+            content_column (str): Column that stores a Document's page_content. Defaults to "content".
+            embedding_column (str): Column that stores the embedding vector. Defaults to "embedding".
+            metadata_columns (List[str]): Column(s) that store a Document's metadata. Defaults to [].
+            ignore_metadata_columns (Optional[List[str]]): Column(s) to exclude from a Document's
+                metadata. Cannot be used together with metadata_columns. Defaults to None.
+            id_column (str): Column that stores each row's unique id. Defaults to "langchain_id".
+            metadata_json_column (str): JSON column that stores any metadata not mapped to its own
+                column. Defaults to "langchain_metadata".
+            index_query_options: Index-specific query options, e.g. HNSWIndex.QueryOptions.
+                Defaults to None.
+            distance_strategy (DistanceStrategy, optional): Distance metric used for similarity
+                search. Defaults to DEFAULT_DISTANCE_STRATEGY.
+            overwrite_existing (bool): Truncate the table before inserting data. Defaults to False.
+            k (Optional[int]): Default number of documents to return from a search. Defaults to None.
+            score_threshold (Optional[float]): Minimum relevance score for returned documents.
+                Defaults to None.
+            fetch_k (Optional[int]): Default number of documents to fetch before MMR re-ranking.
+                Defaults to None.
+            lambda_mult (Optional[float]): Default diversity weight for MMR. Defaults to None.
+        """
+        self.engine = engine
+        self.embedding_service = embedding_service
+        self.table_name = table_name
+        self.content_column = content_column
+        self.embedding_column = embedding_column
+        self.metadata_columns = metadata_columns
+        self.ignore_metadata_columns = ignore_metadata_columns
+        self.id_column = id_column
+        self.metadata_json_column = metadata_json_column
+        self.index_query_options = index_query_options
+        self.distance_strategy = distance_strategy
+        self.overwrite_existing = overwrite_existing
+        self.store_metadata = False  # Set to True in __post_init__ if the JSON column exists
+        self.k = k
+        self.score_threshold = score_threshold
+        self.fetch_k = fetch_k
+        self.lambda_mult = lambda_mult
+        if metadata_columns and ignore_metadata_columns:
+            raise ValueError(
+                "Cannot use both metadata_columns and ignore_metadata_columns."
+            )
+        self.loop = asyncio.get_event_loop()
+        self.loop.create_task(self.__post_init__())
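Note that the constructor only schedules __post_init__ (below) as a task on the current event loop, so table validation completes asynchronously rather than during __init__. A minimal construction sketch, assuming an existing engine and embedding service and an already-provisioned table (all names illustrative):

    import asyncio

    async def build_store(engine, embeddings):
        vs = CloudSQLVectorStore(
            engine=engine,
            embedding_service=embeddings,
            table_name="my_documents",  # assumed to already exist
        )
        # Yield to the loop once so the scheduled validation task can run
        # before the store is first queried.
        await asyncio.sleep(0)
        return vs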
+
+    async def __post_init__(self) -> None:
+        """Validate that the existing table matches the configured columns."""
+        stmt = text(
+            f"SELECT column_name, data_type FROM information_schema.columns WHERE table_name = '{self.table_name}'"
+        )
+        if self.overwrite_existing:
+            await self.engine._aexecute_update(
+                f"TRUNCATE TABLE {self.table_name}"
+            )
+        results = await self.engine._aexecute_fetch(stmt)
+        # _aexecute_fetch returns already-fetched mapping rows; collect the
+        # column name -> data type pairs for validation below.
+        columns = {}
+        for field in results:
+            columns[field["column_name"]] = field["data_type"]
+
+        if self.id_column not in columns:
+            raise ValueError(f"Id column, {self.id_column}, does not exist.")
+        if self.content_column not in columns:
+            raise ValueError(
+                f"Content column, {self.content_column}, does not exist."
+            )
+        if self.embedding_column not in columns:
+            raise ValueError(
+                f"Embedding column, {self.embedding_column}, does not exist."
+            )
+        if columns[self.embedding_column] != "USER-DEFINED":
+            raise ValueError(
+                f"Embedding column, {self.embedding_column}, is not type Vector."
+            )
+        for column in self.metadata_columns:
+            if column not in columns:
+                raise ValueError(f"Metadata column, {column}, does not exist.")
+        if self.metadata_json_column in columns:
+            self.store_metadata = True
+
+        if self.ignore_metadata_columns:
+            # Derive the metadata columns: every column that is not ignored
+            # and not one of the special columns holds document metadata.
+            all_columns = columns
+            for column in self.ignore_metadata_columns:
+                del all_columns[column]
+
+            del all_columns[self.id_column]
+            del all_columns[self.content_column]
+            del all_columns[self.embedding_column]
+            if self.store_metadata and self.metadata_json_column in all_columns:
+                del all_columns[self.metadata_json_column]
+            self.metadata_columns = list(all_columns.keys())
+
+    @property
+    def embeddings(self) -> Embeddings:
+        return self.embedding_service
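For __post_init__ to pass, the table must already contain the id, content, and embedding columns, plus any declared metadata columns. A hedged provisioning sketch using the init_vectorstore_table helper added to PostgreSQLEngine later in this patch; the table name, vector size, and metadata column are illustrative:

    from sqlalchemy import TEXT, Column

    async def provision(engine):
        # Creates: langchain_id UUID PRIMARY KEY, content TEXT NOT NULL,
        # embedding vector(768) NOT NULL, page TEXT, langchain_metadata JSON.
        await engine.init_vectorstore_table(
            "my_documents",
            768,
            metadata_columns=[Column("page", TEXT)],
            store_metadata=True,
        )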
")" + query = insert_stmt + values_stmt + print(query) + print(extra) + print(self.metadata_columns) + await self.engine._aexecute_update(query) + + return ids + + # def add_embeddings( + # self, + # texts: Iterable[str], + # embeddings: List[List[float]], + # metadatas: Optional[List[dict]] = None, + # ids: Optional[List[str]] = None, + # **kwargs: Any, + # ) -> List[str]: + # return self.loop.create_task( + # self.aadd_embeddings(texts, embeddings, metadatas, ids, **kwargs) + # ) + + async def aadd_documents( + self, documents: List[Document], ids: List[str] = None, **kwargs: Any + ) -> List[str]: + texts = [doc.page_content for doc in documents] + metadatas = [doc.metadata for doc in documents] + await self.aadd_texts(texts, metadatas=metadatas, ids=ids, **kwargs) + return ids + + # def add_documents( + # self, documents: List[Document], ids: List[str] = None, **kwargs: Any + # ) -> List[str]: + # return self.loop.create_task( + # self.aadd_documents(documents, ids, **kwargs) + # ) + + async def aadd_texts( + self, + texts: Iterable[str], + metadatas: List[dict] = None, + ids: Optional[List[str]] = None, + **kwargs: Any, + ) -> List[str]: + # if ids is None: + # ids = [str(uuid.uuid1()) for _ in texts] + embeddings = self.embedding_service.embed_documents(list(texts)) + await self.aadd_embeddings( + texts, embeddings, metadatas=metadatas, ids=ids, **kwargs + ) + return ids + + def add_texts( + self, + texts: Iterable[str], + metadatas: List[dict] = None, + ids: Optional[List[str]] = None, + **kwargs: Any, + ) -> List[Document]: + return self.loop.create_task( + self.aadd_texts(texts, metadatas=metadatas, ids=ids, **kwargs) + ) + + @classmethod + async def afrom_texts( + cls: Type[CloudSQLVectorStore], + texts: List[str], + engine: PostgreSQLEngine, + embedding_service: Embeddings, + table_name: str, + metadatas: Optional[List[dict]] = None, + ids: Optional[List[str]] = None, + content_column: str = "content", + embedding_column: str = "embedding", + metadata_columns: List[str] = [], + ignore_metadata_columns: List[str] = None, + id_column: str = "langchain_id", + metadata_json_column: str = "langchain_metadata", + index_query_options=None, + distance_strategy: DistanceStrategy = DEFAULT_DISTANCE_STRATEGY, + **kwargs: Any, + ) -> CloudSQLVectorStore: + vs = cls( + engine, + embedding_service, + table_name, + content_column=content_column, + embedding_column=embedding_column, + metadata_columns=metadata_columns, + ignore_metadata_columns=ignore_metadata_columns, + metadata_json_column=metadata_json_column, + id_column=id_column, + index_query_options=index_query_options, + distance_strategy=distance_strategy, + ) + await vs.aadd_texts(texts, metadatas=metadatas, ids=ids, **kwargs) + return vs + + @classmethod + def from_texts( + cls: Type[CloudSQLVectorStore], + texts: List[str], + engine: PostgreSQLEngine, + embedding_service: Embeddings, + table_name: str, + metadatas: Optional[List[dict]] = None, + ids: Optional[List[str]] = None, + content_column: str = "content", + embedding_column: str = "embedding", + metadata_columns: List[str] = [], + ignore_metadata_columns: List[str] = None, + id_column: str = "langchain_id", + metadata_json_column: str = "langchain_metadata", + index_query_options=None, + distance_strategy: DistanceStrategy = DEFAULT_DISTANCE_STRATEGY, + **kwargs: Any, + ) -> CloudSQLVectorStore: + vs = cls( + engine, + embedding_service, + table_name, + content_column=content_column, + embedding_column=embedding_column, + metadata_columns=metadata_columns, + 
ignore_metadata_columns=ignore_metadata_columns, + metadata_json_column=metadata_json_column, + id_column=id_column, + index_query_options=index_query_options, + distance_strategy=distance_strategy, + ) + vs.add_texts(texts, metadatas=metadatas, ids=ids, **kwargs) + return vs + + async def adelete( + self, + ids: List[str], + **kwargs: Any, + ) -> Optional[bool]: + id_list = ", ".join([f"'{id}'" for id in ids]) + query = f"DELETE FROM {self.table_name} WHERE {self.id_column} in ({id_list})" + await self.engine._aexecute_update(query) + return True + + # def delete( + # self, + # ids: Optional[List[str]] = None, + # **kwargs: Any, + # ) -> Optional[bool]: + # return self.loop.create_task(self.adelete(ids=ids, **kwargs)) + + async def __query_collection( + self, + embedding: List[float], + k: int = 4, + filter: str = None, + ) -> List[Any]: + k = self.k if self.k else k + if self.distance_strategy == DistanceStrategy.EUCLIDEAN: + operator = "<->" + vector_function = "l2_distance" + elif self.distance_strategy == DistanceStrategy.COSINE: + operator = "<=>" + vector_function = "cosine_distance" + else: # Inner product + operator = "<#>" + vector_function = "inner_product" + + filter = f"WHERE {filter}" if filter else "" + stmt = f"SELECT *, {vector_function}({self.embedding_column}, '{embedding}') as distance FROM {self.table_name} {filter} ORDER BY {self.embedding_column} {operator} '{embedding}' LIMIT {k};" + results = await self.engine._aexecute_fetch(stmt) + return results + + async def asimilarity_search( + self, + query: str, + k: int = 4, + filter: str = None, + **kwargs: Any, + ) -> List[Document]: + embedding = self.embedding_service.embed_query(text=query) + + return await self.asimilarity_search_by_vector( + embedding=embedding, k=k, filter=filter + ) + + def similarity_search( + self, + query: str, + k: int = 4, + filter: str = None, + **kwargs: Any, + ) -> List[Document]: + return self.loop.create_task( + self.asimilarity_search(query, k, filter=filter, **kwargs) + ).result() + + async def asimilarity_search_with_score( + self, + query: str, + k: int = 4, + filter: str = None, + **kwargs: Any, + ) -> List[Tuple[Document, float]]: + + embedding = self.embedding_service.embed_query(query) + docs = await self.asimilarity_search_with_score_by_vector( + embedding=embedding, k=k, filter=filter + ) + return docs + + # def similarity_search_with_score( + # self, + # query: str, + # k: int = 4, + # filter: str = None, + # **kwargs: Any, + # ) -> List[Tuple[Document, float]]: + # return self.loop.create_task(self.asimilarity_search_with_score()) + + async def asimilarity_search_by_vector( + self, + embedding: List[float], + k: int = 4, + filter: str = None, + **kwargs: Any, + ) -> List[Document]: + docs_and_scores = await self.asimilarity_search_with_score_by_vector( + embedding=embedding, k=k, filter=filter + ) + + return [doc for doc, _ in docs_and_scores] + + # def similarity_search_by_vector( + # self, embedding: List[float], k: int = 4, filter=filter, **kwargs: Any + # ) -> List[Document]: + # return self.loop.create_task( + # self.similarity_search_by_vector( + # embedding, k, fitler=filter**kwargs + # ) + # ) + + async def asimilarity_search_with_score_by_vector( + self, + embedding: List[float], + k: int = 4, + filter: str = None, + **kwargs: Any, + ) -> List[Tuple[Document, float]]: + results = await self.__query_collection( + embedding=embedding, k=k, filter=filter + ) + + documents_with_scores = [] + for row in results: + metadata = ( + row[self.metadata_json_column] + if 
self.store_metadata and row[self.metadata_json_column] + else {} + ) + for col in self.metadata_columns: + metadata[col] = row[col] + documents_with_scores.append( + ( + Document( + page_content=row[self.content_column], + metadata=metadata, + ), + row["distance"], + ) + ) + + return documents_with_scores + + # def similarity_search_with_score_by_vector( + # self, + # embedding: List[float], + # k: int = 4, + # filter: str = None, + # **kwargs: Any, + # ) -> List[Tuple[Document, float]]: + # return self.loop.create_task( + # self.asimilarity_search_with_score_by_vector( + # embedding, k, filter, **kwargs + # ) + # ) + + async def amax_marginal_relevance_search( + self, + query: str, + k: int = 4, + fetch_k: int = 20, + lambda_mult: float = 0.5, + filter: str = None, + **kwargs: Any, + ) -> List[Document]: + embedding = self.embedding_service.embed_query(text=query) + + return await self.amax_marginal_relevance_search_by_vector( + embedding=embedding, + k=k, + fetch_k=fetch_k, + lambda_mult=lambda_mult, + filter=filter, + **kwargs, + ) + + # def max_marginal_relevance_search( + # self, + # query: str, + # k: int = 4, + # fetch_k: int = 20, + # lambda_mult: float = 0.5, + # filter: str = None, + # **kwargs: Any, + # ) -> List[Document]: + # return self.loop.create_task( + # self.amax_marginal_relevance_search( + # query, k, fetch_k, lambda_mult, filter, **kwargs + # ) + # ) + + async def amax_marginal_relevance_search_by_vector( + self, + embedding: List[float], + k: int = 4, + fetch_k: int = 20, + lambda_mult: float = 0.5, + filter: str = None, + **kwargs: Any, + ) -> List[Document]: + """Return docs selected using the maximal marginal relevance.""" + docs_and_scores = ( + await self.amax_marginal_relevance_search_with_score_by_vector( + embedding, + k=k, + fetch_k=fetch_k, + lambda_mult=lambda_mult, + filter=filter, + **kwargs, + ) + ) + + return [result[0] for result in docs_and_scores] + + async def amax_marginal_relevance_search_with_score_by_vector( + self, + embedding: List[float], + k: int = 4, + fetch_k: int = 20, + lambda_mult: float = 0.5, + filter: str = None, + ) -> List[Tuple[Document, float]]: + results = await self.__query_collection( + embedding=embedding, k=fetch_k, filter=filter + ) + + k = self.k if self.k else k + fetch_k = self.fetch_k if self.fetch_k else fetch_k + lambda_mult = self.lambda_mult if self.lambda_mult else lambda_mult + embedding_list = [ + json.loads(row[self.embedding_column]) for row in results + ] + mmr_selected = maximal_marginal_relevance( + np.array(embedding, dtype=np.float32), + embedding_list, + k=k, + lambda_mult=lambda_mult, + ) + + documents_with_scores = [] + for row in results: + metadata = ( + row[self.metadata_json_column] + if self.store_metadata and row[self.metadata_json_column] + else {} + ) + for col in self.metadata_columns: + metadata[col] = row[col] + documents_with_scores.append( + ( + Document( + page_content=row[self.content_column], + metadata=metadata, + ), + row["distance"], + ) + ) + + return [ + r for i, r in enumerate(documents_with_scores) if i in mmr_selected + ] + + def _select_relevance_score_fn(self) -> Callable[[float], float]: + if self.distance_strategy == DistanceStrategy.COSINE: + return self._cosine_relevance_score_fn + elif self.distance_strategy == DistanceStrategy.EUCLIDEAN: + return self._euclidean_relevance_score_fn + elif self.distance_strategy == DistanceStrategy.INNER_PRODUCT: + return self._max_inner_product_relevance_score_fn + else: + raise ValueError( + "No supported normalization function" + f" for 
distance_strategy of {self.distance_strategy}." + "Consider providing relevance_score_fn to PGVector constructor." + ) + + async def aapply_index( + self, + index: Union[HNSWIndex, IVFFlatIndex, BruteForce], + concurrently=False, + ) -> None: + + if isinstance(index, BruteForce): + return None + + filter = ( + f"WHERE ({index.partial_indexes})" if index.partial_indexes else "" + ) + params = "WITH " + index.index_options() + concurrently = "CONCURRENTLY" if concurrently else "" + + if index.distance_strategy == DistanceStrategy.EUCLIDEAN: + function = "vector_l2_ops" + elif index.distance_strategy == DistanceStrategy.COSINE: + function = "vector_cosine_ops" + else: + function = "vector_ip_ops" + + stmt = f"CREATE INDEX {index.name} {concurrently} ON {self.table_name} USING {index.index_type} ({self.embedding_column} {function}) {params} {filter};" + + await self.engine._aexecute_update(stmt) + + async def areindex( + self, + index_name: str, + ) -> None: + query = f"REINDEX INDEX {index_name}" + await self.engine._aexecute_update(query) + + async def adrop_index(self, name) -> None: + query = f"DROP INDEX {name}" + await self.engine._aexecute_update(query) diff --git a/src/langchain_google_cloud_sql_pg/indexes.py b/src/langchain_google_cloud_sql_pg/indexes.py new file mode 100644 index 0000000..75c6711 --- /dev/null +++ b/src/langchain_google_cloud_sql_pg/indexes.py @@ -0,0 +1,87 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import enum +from typing import List, Optional + + +class DistanceStrategy(str, enum.Enum): + """Enumerator of the Distance strategies.""" + + EUCLIDEAN = "l2" + COSINE = "cosine" + INNER_PRODUCT = "inner" + + +DEFAULT_DISTANCE_STRATEGY = DistanceStrategy.COSINE + + +class BruteForce: + index_type = "knn" + + def __init__( + self, + distance_strategy: DistanceStrategy = DEFAULT_DISTANCE_STRATEGY, + ): + self.distance_strategy = distance_strategy + + +class HNSWIndex: + index_type = "hnsw" + + def __init__( + self, + name: str = "langchainhnsw", + m: int = 16, # TODO! 
+ ef_construction: int = 64, + partial_indexes: List = [], + distance_strategy: DistanceStrategy = DEFAULT_DISTANCE_STRATEGY, + ): + self.name = name + self.m = m + self.ef_construction = ef_construction + self.partial_indexes = partial_indexes + self.distance_strategy = distance_strategy + self.query_options = self.QueryOptions() + + def index_options(self) -> str: + return f"(m = {self.m}, ef_construction = {self.ef_construction})" + + class QueryOptions: + def __init__(self, ef_search: Optional[int] = None): + self.ef_search = ef_search + + +class IVFFlatIndex: + index_type = "ivfflat" + + def __init__( + self, + name: str = "langchainivfflat", + lists: int = 1, + partial_indexes: List = [], + distance_strategy: DistanceStrategy = DEFAULT_DISTANCE_STRATEGY, + ): + self.name = name + self.lists = lists + self.partial_indexes = partial_indexes + self.distance_strategy = distance_strategy + self.query_options = self.QueryOptions() + + def index_options(self) -> str: + return f"(lists = {self.lists})" + + class QueryOptions: + def __init__(self, probes: Optional[int] = None): + self.probes = probes diff --git a/src/langchain_google_cloud_sql_pg/postgresql_engine.py b/src/langchain_google_cloud_sql_pg/postgresql_engine.py index 0062e94..c92fbdd 100644 --- a/src/langchain_google_cloud_sql_pg/postgresql_engine.py +++ b/src/langchain_google_cloud_sql_pg/postgresql_engine.py @@ -1,54 +1,100 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# TODO: Remove below import when minimum supported Python version is 3.10 from __future__ import annotations +# import requests +# import sqlalchemy import asyncio -import json -import time from threading import Thread -from typing import AnyStr +from typing import TYPE_CHECKING, Dict, List, Optional, Type import aiohttp import google.auth -from google.cloud.sql.connector import Connector -from langchain_community.vectorstores.utils import maximal_marginal_relevance -from pgvector.asyncpg import register_vector +import google.auth.transport.requests +import nest_asyncio +from google.cloud.sql.connector import Connector, create_async_connector -# import sqlalchemy -from sqlalchemy import text -from sqlalchemy.ext.asyncio import AsyncEngine, create_async_engine +# from pgvector.asyncpg import register_vector +from sqlalchemy import Column, text +from sqlalchemy.ext.asyncio import ( + AsyncConnection, + AsyncEngine, + create_async_engine, +) + +# nest_asyncio.apply() +if TYPE_CHECKING: + import asyncpg + import google.auth.credentials -async def _get_IAM_user( + +async def _get_iam_principal_email( credentials: google.auth.credentials.Credentials, ) -> str: - """Get user/service account name""" - request = google.auth.transport.requests.Request() - credentials.refresh(request) + """Get email address associated with current authenticated IAM principal. + + Email will be used for automatic IAM database authentication to Cloud SQL. 
+    Args:
+        credentials (google.auth.credentials.Credentials):
+            The credentials object to use in finding the associated IAM
+            principal email address.
+
+    Returns:
+        email (str):
+            The email address associated with the current authenticated IAM
+            principal.
+    """
+    # refresh credentials if they are not valid
+    if not credentials.valid:
+        request = google.auth.transport.requests.Request()
+        credentials.refresh(request)
+    # call OAuth2 api to get IAM principal email associated with OAuth2 token
     url = f"https://oauth2.googleapis.com/tokeninfo?access_token={credentials.token}"
     async with aiohttp.ClientSession() as client:
-        response = await client.get(url)
-        response = json.loads(await response.text())
-        email = response["email"]
-        if ".gserviceaccount.com" in email:
-            email = email.replace(".gserviceaccount.com", "")
-
-    return email
+        response = await client.get(url, raise_for_status=True)
+        response_json: Dict = await response.json()
+        email = response_json.get("email")
+        if email is None:
+            raise ValueError(
+                "Failed to automatically obtain authenticated IAM principal's "
+                "email address using environment's ADC credentials!"
+            )
+        return email


 class PostgreSQLEngine:
-    """Creating a connection to the CloudSQL instance
-    To use, you need the following packages installed:
-        cloud-sql-python-connector[asyncpg]
-    """
+    """A class for managing connections to a Cloud SQL for Postgres database."""
+
+    __create_key = object()

     def __init__(
         self,
+        key,
         project_id=None,
         region=None,
         instance=None,
         database=None,
         engine=None,
     ):
+        if key != PostgreSQLEngine.__create_key:
+            raise Exception(
+                "Only create this class through the from_instance or from_engine methods!"
+            )
         self.project_id = project_id
         self.region = region
         self.instance = instance
@@ -57,12 +103,9 @@ def __init__(
         self._loop = asyncio.new_event_loop()
         self._thread = Thread(target=self._loop.run_forever, daemon=True)
         self._thread.start()
-        pool_object = asyncio.wrap_future(
-            asyncio.run_coroutine_threadsafe(self.async_func(), self._loop),
-            loop=self._loop,
-        )
-        time.sleep(1)
-        self._pool = pool_object.result()
+        self._pool = asyncio.run_coroutine_threadsafe(
+            self._engine(), self._loop
+        ).result()

@@ -73,13 +116,11 @@ def from_instance(
         region: str,
         instance: str,
         database: str,
         project_id: str = None,
     ) -> PostgreSQLEngine:
         """Create PostgreSQLEngine connection to the postgres database in the CloudSQL instance.
-
         Args:
             region (str): CloudSQL instance region.
             instance (str): CloudSQL instance name.
             database (str): CloudSQL instance database name.
             project_id (str): GCP project ID. Defaults to None
-
         Returns:
             PostgreSQLEngine containing the asyncpg connection pool.
         """
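Given ambient Application Default Credentials, the helper above can be exercised directly; a small sketch (the userinfo.email scope is an assumption here, chosen because the tokeninfo endpoint needs an email-bearing access token):

    import asyncio
    import google.auth

    async def whoami() -> str:
        credentials, _ = google.auth.default(
            scopes=["https://www.googleapis.com/auth/userinfo.email"]  # assumed scope
        )
        return await _get_iam_principal_email(credentials)

    print(asyncio.run(whoami()))  # e.g. sa-name@project-id.iam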
""" @@ -88,13 +129,16 @@ def from_instance( region=region, instance=instance, database=database, + key=PostgreSQLEngine.__create_key, ) @classmethod def from_engine(cls, engine: AsyncEngine) -> PostgreSQLEngine: - return cls(engine=engine) + + return cls(engine=engine, key=PostgreSQLEngine.__create_key) async def _engine(self) -> AsyncEngine: + if self.engine is not None: return self.engine @@ -110,22 +154,24 @@ async def get_conn(): conn = await connector.connect_async( f"{self.project_id}:{self.region}:{self.instance}", "asyncpg", - user=await _get_IAM_user(credentials), + # user=await _get_iam_principal_email(credentials), + user="postgres", + password="my-pg-pass", enable_iam_auth=True, db=self.database, ) - await register_vector(conn) return conn pool = create_async_engine( "postgresql+asyncpg://", + # poolclass=NullPool, async_creator=get_conn, ) return pool - async def _aexecute_fetch(self, query) -> Any: + async def _aexecute_fetch(self, query): async with self._pool.connect() as conn: result = await conn.execute(text(query)) result_map = result.mappings() @@ -135,6 +181,43 @@ async def _aexecute_fetch(self, query) -> Any: async def _aexecute_update(self, query, additional=None) -> None: async with self._pool.connect() as conn: - result = await conn.execute(text(query), additional) - result = result.mappings() + await conn.execute(text(query), additional) await conn.commit() + + async def init_vectorstore_table( + self, + table_name: str, + vector_size: int, + content_column: str = "content", + embedding_column: str = "embedding", + metadata_columns: List[Column] = [], + id_column: str = "langchain_id", + overwrite_existing: bool = False, + store_metadata: bool = True, + ) -> None: + # async with self.engine.connect() as conn: + # Enable pgvector + # await conn.execute(text("CREATE EXTENSION IF NOT EXISTS vector")) + await self._aexecute_update("CREATE EXTENSION IF NOT EXISTS vector") + # Register the vector type + # await register_vector(conn) + + if overwrite_existing: + await self._aexecute_update(f"DROP TABLE {table_name}") + # await conn.execute( + # text(f"TRUNCATE TABLE {table_name} RESET IDENTITY") + # ) # TODO? + + query = f"""CREATE TABLE IF NOT EXISTS {table_name}( + {id_column} UUID PRIMARY KEY, + {content_column} TEXT NOT NULL, + {embedding_column} vector({vector_size}) NOT NULL""" + for column in metadata_columns: + query += f",\n{column.name} {column.type}" + ( + "NOT NULL" if not column.nullable else "" + ) + if store_metadata: + query += ",\nlangchain_metadata JSON" + query += "\n);" + + await self._aexecute_update(query) diff --git a/src/langchain_google_cloud_sql_pg/vectorstore.py b/src/langchain_google_cloud_sql_pg/vectorstore.py deleted file mode 100644 index f514452..0000000 --- a/src/langchain_google_cloud_sql_pg/vectorstore.py +++ /dev/null @@ -1,602 +0,0 @@ -from __future__ import annotations - -import asyncio -import json -import uuid -from typing import Any, Iterable, List, Optional, Tuple, Type, Union, dict - -import numpy as np -from langchain_community.vectorstores.utils import maximal_marginal_relevance -from langchain_core.documents import Document -from langchain_core.embeddings import Embeddings -from langchain_core.vectorstores import VectorStore - -from .postgresql_engine import PostgreSQLEngine - -VST = TypeVar("VST", bound="CloudSQLVectorStore") - - -class CloudSQLVectorStore(VectorStore): - """Google Cloud SQL for PostgreSQL vector store. 
- - To use, you need the following packages installed: - pgvector-python - sqlalchemy - """ - - def __init__( - self, - engine: PostgreSQLEngine, - table_name: str, - # vector_size: int, - embedding_service: Embeddings, - content_column: str = "content", - embedding_column: str = "embedding", - metadata_columns: [str, List[str]] = "metadata", - ignore_metadata_columns: bool = False, - index_query_options: [ - HNSWIndex.QueryOptions, - IVFFlatIndex.QueryOptions, - ] = None, - # index: [HNSWIndex | IVFFlatIndex | BruteForce] = None, - distance_strategy="L2", - overwrite_existing: bool = False, - # store_metadata: bool = True, - ): - """Constructor for CloudSQLVectorStore. - - Args: - engine (PostgreSQLEngine): AsyncEngine with pool connection to the postgres database. Required. - embedding_service (Embeddings): Text embedding model to use. - table_name (str): Name of the existing table or the table to be created. - content_column (str): Column that represent a Document’s page_content. Defaults to content - embedding_column (str): Column for embedding vectors. - The embedding is generated from the document value. Defaults to embedding - metadata_columns (List[str]): Column(s) that represent a document's metadata. Defaults to metadata - ignore_metadata_columns (List[str]): Column(s) to ignore in pre-existing tables for a document’s metadata. - Can not be used with metadata_columns. Defaults to None - overwrite_existing (bool): Boolean for truncating table before inserting data. Defaults to False - index_query_options : QueryOptions class with vector search parameters. Defaults to None - distance_strategy (str): - Determines the strategy employed for calculating - the distance between vectors in the embedding space. - Defaults to EUCLIDEAN_DISTANCE(L2). - Available options are: - - COSINE: Measures the similarity between two vectors of an inner - product space. - - EUCLIDEAN_DISTANCE: Computes the Euclidean distance between - two vectors. This metric considers the geometric distance in - the vector space, and might be more suitable for embeddings - that rely on spatial relationships. This is the default behavior. - """ - - self.engine = engine - self.table_name = table_name - # self.vector_size = vector_size - self.embedding_service = embedding_service - self.embedding_column = embedding_column - self.content_column = content_column - self.metadata_columns = metadata_columns - self.ignore_metadata_columns = ignore_metadata_columns - self.overwrite_existing = overwrite_existing - self.index_query_options = index_query_options - self.store_metadata = store_metadata - self.distance_strategy = distance_strategy - # self.index = index - asyncio.get_running_loop().run_until_complete(self.__post_init__()) - - async def __post_init__(self) -> None: - """Initialize table and validate existing tables""" - - # Check if table exists - query = f"SELECT EXISTS (SELECT 1 FROM information_schema.tables WHERE table_name = '{self.table_name}');" - result = await self.engine._aexecute_fetch(query) - # If table exists - if result[0]["exists"]: - # If overwrite_existing is True Truncate the Table - if self.overwrite_existing: - query = f"TRUNCATE TABLE {self.table_name} RESET IDENTITY" - await self.engine._aexecute_update(query) - - # Checking if metadata and ignore_metadata are given together - if ( - self.metadata_columns is not None - and self.ignore_metadata_columns is not None - ): - raise ValueError( - "Both metadata_columns and ignore_metadata_columns have been provided." 
- ) - - get_name = f"SELECT column_name, data_type FROM information_schema.columns WHERE table_name = '{self.table_name}'" - result = await self.engine._aexecute_fetch(get_name) - column_name = [col["column_name"] for col in result] - dtypes = [dtype["data_type"] for dtype in result] - - # Check column names and datatype for embedding column - if "uuid" not in column_name: - raise ValueError("Column uuid does not exist") - if self.content_column not in column_name: - raise ValueError(f"Column {self.content_column} does not exist") - if self.embedding_column in column_name: - if "USER-DEFINED" not in dtypes: - raise ValueError( - f"Column {self.embedding_column} is not of type vector" - ) - else: - raise ValueError( - f"Column {self.embedding_column} does not exist" - ) - - if "metadata" not in column_name: - raise ValueError("Column metadata does not exist") - - # Check if there are non-nullable columns - query = f"SELECT column_name FROM information_schema.columns WHERE table_name = '{self.table_name}' AND is_nullable = 'NO';" - result = await self.engine._aexecute_fetch(query) - non_nullable_list = [n["column_name"] for n in result] - exceptions = set(["uuid", f"{self.content_column}"]) - other_values = [ - value for value in non_nullable_list if value not in exceptions - ] - - if bool(other_values): - raise ValueError( - f"Only uuid and {self.content_column} can be non-nullable" - ) - - # If both metadata and ignore_metadata are given, throw an error - if ( - self.metadata_columns is not None - and self.ignore_metadata_columns is not None - ): - raise ValueError( - "Both metadata_columns and ignore_metadata_columns have been provided." - ) - - else: - await self.init_vectorstore_table( - engine=self.engine, - table_name=self.table_name, - vector_size=self.vector_size, - content_column=self.content_column, - embedding_column=self.embedding_column, - metadata_columns=self.metadata_columns, - overwrite_existing=self.overwrite_existing, - store_metadata=self.store_metadata, - ) - - @property - def embeddings(self) -> Embeddings: - return self.embedding_service - - async def create_vector_extension(self) -> None: - """Creates the vector extsion to the specified database.""" - query = "CREATE EXTENSION IF NOT EXISTS vector" - await self.engine._aexecute_update(query) - - async def init_vectorstore_table( - self, - engine: PostgreSQLEngine, - table_name: str, - vector_size: int, - content_column: str = "content", - embedding_column: str = "embedding", - metadata_columns: Optional[str | List[str]] = "metadata", - overwrite_existing: bool = False, - store_metadata: bool = True, - ) -> None: - """Creating a non-default vectorstore table""" - - # Create vector extension if not exists - await self.create_vector_extension() - - if overwrite_existing: - query = f"TRUNCATE TABLE {self.table_name} RESET IDENTITY" - await engine._aexecute_update(query) - - query = f""" - CREATE TABLE IF NOT EXISTS {table_name} ( - uuid UUID PRIMARY KEY, - {content_column} TEXT NOT NULL, - {embedding_column} vector({vector_size}), - {metadata_columns} JSON - ); - """ - await engine._aexecute_update(query) - - # @classmethod - # async def afrom_embeddings( - # cls: CloudSQLVectorStore, - # engine: PostgreSQLEngine, - # embedding_service: Embeddings, - # text_embeddings: List[Tuple[str, List[float]]], - # table_name: str, - # metadatas: List[dict] = None, - # ids: List[int] = None, - # ) -> CloudSQLVectorStore: - # texts = [t[0] for t in text_embeddings] - # embeddings = [t[1] for t in text_embeddings] - # metadatas = [{} 
for _ in texts] - - # table = cls( - # engine=engine, - # table_name=table_name, - # embedding_service=embedding_service, - # ) - - # await table.aadd_embeddings( - # texts=texts, - # engine=engine, - # embeddings=embeddings, - # metadatas=metadatas, - # ids=ids, - # table_name=table_name, - # ) - - # return table - - # @classmethod - # async def afrom_documents( - # cls: CloudSQLVectorStore, - # documents: List[Document], - # engine: PostgreSQLEngine, - # table_name: str, - # embedding_service: Embeddings, - # ids: List[int] = None, - # ) -> CloudSQLVectorStore: - # texts = [d.page_content for d in documents] - # metadatas = [json.dumps(d.metadata) for d in documents] - - # embeddings = embedding_service.embed_documents(list(texts)) - - # table = cls( - # engine=engine, - # embedding_service=embedding_service, - # table_name=table_name, - # ) - - # await table.aadd_embeddings( - # texts=texts, - # engine=engine, - # embeddings=embeddings, - # metadatas=metadatas, - # ids=ids, - # table_name=table_name, - # ) - - # return table - - @classmethod - async def afrom_texts( - cls: Type[VST], - texts: List[str], - embeddings: List[List[float]], - metadatas: Optional[List[dict]] = None, - ids: Optional[List[str]] = None, - engine: PostgreSQLEngine, - embedding_service: Embeddings, - table_name: str, - **kwargs: Any, - ) -> VST: - """Return VectorStore initialized from texts and embeddings.""" - if not metadatas: - metadatas = [{} for _ in texts] - - documents = [] - for text, meta in zip(texts, metadatas): - docs = Document(page_content=text, metadata=meta) - documents.append(docs) - - vs = cls( - engine=engine, - documents=documents, - embedding_service=embedding_service, - table_name=table_name, - ) - return await vs.aadd_embeddings(texts, embeddings, metadatas, ids) - - async def aadd_embeddings( - self, - texts: Iterable[str], - embeddings: List[List[float]], - metadatas: Optional[List[dict]] = None, - ids: Optional[List[str]] = None, - **kwargs: Any, - ) -> List[str]: - # if ids is None: - # ids = [str(uuid.uuid1()) for _ in texts] - - for id, content, embedding, meta in zip( - ids, texts, embeddings, metadatas - ): - data_to_add = { - "ids": id, - "content": content, - "embedding": embedding, - "metadata": meta, - } - stmt = f"INSERT INTO {self.table_name}(uuid, content, embedding, metadata) VALUES (:ids,:content,:embedding,:metadata)" - await self.engine._aexecute_update(stmt, data_to_add) - - return ids - - # async def aadd_documents( - # self, documents: List[Document], ids: List[int] = None, **kwargs: Any - # ) -> List[str]: - # """Run more documents through the embeddings and add to the vectorstore. - - # Args: - # documents (List[Document]): Iterable of Documents to add to the vectorstore. - # ids (List[str]): List of id strings. Defaults to None - - # Returns: - # List of ids from adding the texts into the vectorstore. - # """ - - # texts = [d.page_content for d in documents] - # metadatas = [json.dumps(d.metadata) for d in documents] - # embeddings = self.embedding_service.embed_documents(list(texts)) - - # return await self.aadd_embeddings( - # texts=texts, - # embeddings=embeddings, - # metadatas=metadatas, - # ids=ids, - # engine=self.Engine, - # table_name=self.table_name, - # ) - - # async def aadd_texts( - # self, - # texts: List[str], - # metadatas: List[dict] = None, - # ids: List[int] = None, - # ) -> List[str]: - # """Run more texts through the embeddings and add to the vectorstore. - - # Args: - # texts (str): Iterable of strings to add to the vectorstore. 
- # metadatas (List[dict]): Optional list of metadatas associated with the texts. Defaults to None. - # ids (List[str]): List of id strings. Defaults to None - - # Returns: - # List of ids from adding the texts into the vectorstore. - # """ - - # if not metadatas: - # metadata = [{} for _ in texts] - - # documents = [] - # for text, meta in zip(texts, metadatas): - # docs = Document(page_content=text, metadata=meta) - # documents.append(docs) - - # return await self.aadd_documents(documents=documents, ids=ids) - - async def __query_collection( - self, embedding: List[float], k: int = 4, filter: str = None - ) -> List[Any]: - if filter is not None: - condition = f"WHERE {filter}" - - query = f""" - SELECT uuid, {self.content_column}, {self.embedding_column}, metadata, - l2_distance({self.embedding_column}, '{embedding}') as distance - FROM {self.table_name} {condition} ORDER BY {self.embedding_column} <-> '{embedding}' LIMIT {k} - """ - else: - query = f""" - SELECT uuid, {self.content_column}, {self.embedding_column}, metadata, - l2_distance({self.embedding_column}, '{embedding}') as distance - FROM {self.table_name} ORDER BY {self.embedding_column} <-> '{embedding}' LIMIT {k} - """ - results = await self.engine._aexecute_fetch(query) - - return results - - async def asimilarity_search( - self, query: str, k: int = 4, filter: str = None - ) -> List[Document]: - embedding = self.embedding_service.embed_query(text=query) - - return await self.asimilarity_search_by_vector( - embedding=embedding, k=k, filter=filter - ) - - async def asimilarity_search_by_vector( - self, embedding: List[float], k: int = 4, filter: str = None - ) -> List[Document]: - docs_and_scores = await self.asimilarity_search_with_score_by_vector( - embedding=embedding, k=k, filter=filter - ) - - return [doc for doc, _ in docs_and_scores] - - async def asimilarity_search_with_score( - self, query: str, k: int = 4, filter: str = None - ) -> List[Tuple[Document, float]]: - embedding = self.embedding_service.embed_query(query) - docs = await self.asimilarity_search_with_score_by_vector( - embedding=embedding, k=k, filter=filter - ) - return docs - - async def asimilarity_search_with_score_by_vector( - self, embedding: List[float], k: int = 4, filter: str = None - ) -> List[Tuple[Document, float]]: - results = await self.__query_collection( - embedding=embedding, k=k, filter=filter - ) - documents_with_scores = [ - ( - Document( - page_content=i[f"{self.content_column}"], - metadata=i["metadata"], - ), - i["distance"], - ) - for i in results - ] - return documents_with_scores - - async def amax_marginal_relevance_search( - self, - query: str, - k: int = 4, - fetch_k: int = 20, - lambda_mult: float = 0.5, - filter: str = None, - ) -> List[Document]: - embedding = await self.embedding_service.embed_query(text=query) - - return self.amax_marginal_relevance_search_by_vector( - embedding=embedding, - k=k, - fetch_k=fetch_k, - lambda_mult=lambda_mult, - filter=filter, - ) - - async def amax_marginal_relevance_search_with_score_by_vector( - self, - embedding: List[float], - k: int = 4, - fetch_k: int = 20, - lambda_mult: float = 0.5, - filter: str = None, - ) -> List[Tuple[Document, float]]: - results = await self.__query_collection( - embedding=embedding, k=fetch_k, filter=filter - ) - embedding_list = [i[f"{self.embedding_column}"] for i in results] - - mmr_selected = maximal_marginal_relevance( - np.array(embedding, dtype=np.float32), - embedding_list, - k=k, - lambda_mult=lambda_mult, - ) - - candidates = [ - ( - Document( - 
page_content=i[f"{self.content_column}"], - metadata=i["metadata"], - ), - i["distance"], - ) - for i in results - ] - - return [r for i, r in enumerate(candidates) if i in mmr_selected] - - async def _acreate_index( - self, index: Union[HNSWIndex, IVFFlatIndex, BruteForce] - ): - if isinstance(index, BruteForce): - return None - - distance = ( - "l2" - if self.distance_strategy == "L2" - else "ip" if distance_strategy == "INNER" else "cosine" - ) - index_type = "hnsw" if isinstance(index, HNSWIndex()) else "ivfflat" - if partial_indexes == None: - condition = "" - else: - condition = f"WHERE (partial_indexes)" - - if index_type == "hnsw": - query = f"CREATE INDEX ON {self.table_name} USING hnsw ({self.embedding_column} vector_{distance}_ops) WITH (m={index.m}, ef_construction={index.ef_construction}) {condition}" - else: - query = f"CREATE INDEX ON {self.table_name} USING ivfflat ({self.embedding_column} vector_{distance}_ops) WITH (lists={index.lists}) {condition}" - - await self.engine._aexecute_update(query) - - async def _aindex_query_options( - self, - index_query_options: [ - HNSWIndex.QueryOptions | IVFFlatIndex.QueryOptions - ], - ): - if isinstance(index_query_options, HNSWIndex.QueryOptions): - query_options = index_query_options.ef_search - query = f"SET hnsw.ef_search = {query_options}" - else: - query_options = index_query_options.probes - query = f"SET ivfflat.probes = {query_options}" - - await self.engine._aexecute_update(query) - - async def areindex( - self, - index: Union[HNSWIndex, IVFFlatIndex, BruteForce], - index_name: Optional[str], - ): - if index_name: - query = f"REINDEX INDEX {index_name}" - await self.engine._aexecute_update(query) - else: - await self._acreate_index(index) - - async def adrop_index(self): - query = f"SELECT indexname, indexdef FROM pg_indexes WHERE tablename='{self.table_name}'" - current_index = await self.engine._aexecute_fetch(query) - index_def = current_index[0]["indexdef"] - if "hnsw" in index_def or "ivfflat" in index_def: - current_index = current_index["indexname"] - query = f"DROP INDEX {current_index}" - await self.engine._aexecute_update(query) - else: - raise ValueError("Cannot drop Index") - - async def aset_index_query_options( - self, distance_strategy, index_query_options - ): - self.distance_strategy = distance_strategy - self.index_query_options = index_query_options - await self._aindex_query_options() - - -class BruteForce: - def __init__(self, distance_strategy: str = "L2"): - self.distance_strategy = distance_strategy - - -class HNSWIndex: - def __init__( - self, - name: str = "LangChainHNSWIndex", - m: int = 16, - ef_construction: int = 64, - partial_indexes: List = [], - distance_strategy: str = "L2", - ): - self.name = name - self.m = m - self.ef_construction = ef_construction - self.partial_indexes = partial_indexes - self.distance_strategy = distance_strategy - - class QueryOptions: - def __init__(self, ef_search): - self.ef_search = ef_search - - -class IVFFlatIndex: - def __init__( - self, - name: str = "LangChainIVFFlatIndex", - lists: int = 1, - partial_indexes: List = [], - distance_strategy: str = "L2", - ): - self.name = name - self.lists = lists - self.partial_indexes = partial_indexes - self.distance_strategy = distance_strategy - - class QueryOptions: - def __init__(self, probes): - self.probes = probes diff --git a/tests/test_cloudSQL.py b/tests/test_cloudSQL.py deleted file mode 100644 index 557e62f..0000000 --- a/tests/test_cloudSQL.py +++ /dev/null @@ -1,213 +0,0 @@ -"""Test CloudSQLVectorStore 
functionality.""" - -import os -from typing import List - -from langchain_community.embeddings import FakeEmbeddings -from langchain_core.documents import Document - -from langchain_google_cloud_sql_pg import CloudSQLVectorStore, PostgreSQLEngine - -# from tests.integration_tests.vectorstores.fake_embeddings import FakeEmbeddings - -engine = PostgreSQLEngine.from_instance( - project_id=os.environ.get("PROJECT_ID", None), - instance=os.environ.get("INSTANCE_NAME"), - region=os.environ.get("REGION_NAME"), - database=os.environ.get("DATABASE_NAME"), -) - -ADA_TOKEN_COUNT = 1536 - - -class FakeEmbeddingsWithAdaDimension(FakeEmbeddings): - """Fake embeddings functionality for testing.""" - - def embed_documents(self, texts: List[str]) -> List[List[float]]: - """Return simple embeddings.""" - return [ - [float(1.0)] * (ADA_TOKEN_COUNT - 1) + [float(i)] - for i in range(len(texts)) - ] - - def embed_query(self, text: str) -> List[float]: - """Return simple embeddings.""" - return [float(1.0)] * (ADA_TOKEN_COUNT - 1) + [float(0.0)] - - -async def test_CloudSQLVectorStore() -> None: - """Test end to end construction and search.""" - texts = ["foo", "bar", "baz"] - docsearch = await CloudSQLVectorStore.afrom_texts( - texts=texts, - table_name="test_table", - embedding=FakeEmbeddingsWithAdaDimension(), - engine=engine, - ) - output = await docsearch.asimilarity_search("foo", k=1) - assert output == [Document(page_content="foo")] - - -async def test_CloudSQLVectorStore_embeddings() -> None: - """Test end to end construction with embeddings and search.""" - texts = ["foo", "bar", "baz"] - text_embeddings = FakeEmbeddingsWithAdaDimension().embed_documents(texts) - text_embedding_pairs = list(zip(texts, text_embeddings)) - docsearch = CloudSQLVectorStore.afrom_embeddings( - text_embeddings=text_embedding_pairs, - table_name="test_table", - embedding=FakeEmbeddingsWithAdaDimension(), - engine=engine, - ) - output = docsearch.similarity_search("foo", k=1) - assert output == [Document(page_content="foo")] - - -async def test_CloudSQLVectorStore_with_metadatas() -> None: - """Test end to end construction and search.""" - texts = ["foo", "bar", "baz"] - metadatas = [{"page": str(i)} for i in range(len(texts))] - docsearch = await CloudSQLVectorStore.afrom_texts( - texts=texts, - table_name="test_table", - embedding=FakeEmbeddingsWithAdaDimension(), - metadatas=metadatas, - engine=engine, - ) - output = await docsearch.asimilarity_search("foo", k=1) - assert output == [Document(page_content="foo", metadata={"page": "0"})] - - -async def test_CloudSQLVectorStore_with_metadatas_with_scores() -> None: - """Test end to end construction and search.""" - texts = ["foo", "bar", "baz"] - metadatas = [{"page": str(i)} for i in range(len(texts))] - docsearch = await CloudSQLVectorStore.afrom_texts( - texts=texts, - table_name="test_table", - embedding=FakeEmbeddingsWithAdaDimension(), - metadatas=metadatas, - engine=engine, - ) - output = await docsearch.asimilarity_search_with_score("foo", k=1) - assert output == [ - (Document(page_content="foo", metadata={"page": "0"}), 0.0) - ] - - -async def test_CloudSQLVectorStore_with_filter_match() -> None: - """Test end to end construction and search.""" - texts = ["foo", "bar", "baz"] - metadatas = [{"page": str(i)} for i in range(len(texts))] - docsearch = await CloudSQLVectorStore.afrom_texts( - texts=texts, - collection_name="test_collection_filter", - embedding=FakeEmbeddingsWithAdaDimension(), - metadatas=metadatas, - engine=engine, - ) - output = await 
docsearch.asimilarity_search_with_score( - "foo", k=1, filter={"page": "0"} - ) - assert output == [ - (Document(page_content="foo", metadata={"page": "0"}), 0.0) - ] - - -async def test_CloudSQLVectorStore_with_filter_distant_match() -> None: - """Test end to end construction and search.""" - texts = ["foo", "bar", "baz"] - metadatas = [{"page": str(i)} for i in range(len(texts))] - docsearch = await CloudSQLVectorStore.afrom_texts( - texts=texts, - table_name="test_table", - embedding=FakeEmbeddingsWithAdaDimension(), - metadatas=metadatas, - engine=engine, - ) - output = await docsearch.asimilarity_search_with_score( - "foo", k=1, filter={"page": "2"} - ) - assert output == [ - ( - Document(page_content="baz", metadata={"page": "2"}), - 0.0013003906671379406, - ) - ] - - -async def test_CloudSQLVectorStore_with_filter_no_match() -> None: - """Test end to end construction and search.""" - texts = ["foo", "bar", "baz"] - metadatas = [{"page": str(i)} for i in range(len(texts))] - docsearch = await CloudSQLVectorStore.afrom_texts( - texts=texts, - table_name="test_table", - embedding=FakeEmbeddingsWithAdaDimension(), - metadatas=metadatas, - engine=engine, - ) - output = await docsearch.asimilarity_search_with_score( - "foo", k=1, filter={"page": "5"} - ) - assert output == [] - - -async def test_CloudSQLVectorStore_relevance_score() -> None: - """Test to make sure the relevance score is scaled to 0-1.""" - texts = ["foo", "bar", "baz"] - metadatas = [{"page": str(i)} for i in range(len(texts))] - docsearch = await CloudSQLVectorStore.from_texts( - texts=texts, - table_name="test_table", - embedding=FakeEmbeddingsWithAdaDimension(), - metadatas=metadatas, - engine=engine, - ) - - output = await docsearch.asimilarity_search_with_relevance_scores( - "foo", k=3 - ) - assert output == [ - (Document(page_content="foo", metadata={"page": "0"}), 1.0), - ( - Document(page_content="bar", metadata={"page": "1"}), - 0.9996744261675065, - ), - ( - Document(page_content="baz", metadata={"page": "2"}), - 0.9986996093328621, - ), - ] - - -async def test_CloudSQLVectorStore_max_marginal_relevance_search() -> None: - """Test max marginal relevance search.""" - texts = ["foo", "bar", "baz"] - docsearch = await CloudSQLVectorStore.afrom_texts( - texts=texts, - table_name="test_table", - embedding=FakeEmbeddingsWithAdaDimension(), - engine=engine, - ) - output = await docsearch.max_marginal_relevance_search( - "foo", k=1, fetch_k=3 - ) - assert output == [Document(page_content="foo")] - - -async def test_CloudSQLVectorStore_max_marginal_relevance_search_with_score() -> ( - None -): - """Test max marginal relevance search with relevance scores.""" - texts = ["foo", "bar", "baz"] - docsearch = await CloudSQLVectorStore.afrom_texts( - texts=texts, - collection_name="test_table", - embedding=FakeEmbeddingsWithAdaDimension(), - engine=engine, - ) - output = await docsearch.amax_marginal_relevance_search_with_score( - "foo", k=1, fetch_k=3 - ) - assert output == [(Document(page_content="foo"), 0.0)] diff --git a/tests/test_cloudsql_vectorstore.py b/tests/test_cloudsql_vectorstore.py new file mode 100644 index 0000000..63c2c64 --- /dev/null +++ b/tests/test_cloudsql_vectorstore.py @@ -0,0 +1,636 @@ +"""Test cloudSQLVectorStore functionality.""" + +import os +import uuid +from typing import List + +import pytest +import pytest_asyncio +from langchain_community.embeddings import FakeEmbeddings +from langchain_core.documents import Document +from langchain_google_vertexai import VertexAIEmbeddings +from sqlalchemy 
import TEXT, VARCHAR, Column + +from langchain_google_cloud_sql_pg import CloudSQLVectorStore, PostgreSQLEngine +from langchain_google_cloud_sql_pg.indexes import ( + DEFAULT_DISTANCE_STRATEGY, + BruteForce, + DistanceStrategy, + HNSWIndex, + IVFFlatIndex, +) + +PROJECT_ID = "starter-akitsch" +INSTANCE = "pg15-pgvector-demo" +DATABASE = "assistantdemo" +REGION = "us-west2" +ADA_TOKEN_COUNT = 768 +DEFAULT_TABLE = "test_table" +CUSTOM_COL = "page" +CUSTOM_TABLE = "test_table_custom" + +embeddings_service = VertexAIEmbeddings() + + +class FakeEmbeddingsWithAdaDimension(FakeEmbeddings): + """Fake embeddings functionality for testing.""" + + size: int = ADA_TOKEN_COUNT + + def embed_documents(self, texts: List[str]) -> List[List[float]]: + """Return simple embeddings.""" + return [ + [float(1.0)] * (ADA_TOKEN_COUNT - 1) + [float(i)] + for i in range(len(texts)) + ] + + def embed_query(self, text: str = "default") -> List[float]: + """Return simple embeddings.""" + return [float(1.0)] * (ADA_TOKEN_COUNT - 1) + [float(0.0)] + + +@pytest.mark.asyncio +class TestEngine: + @pytest_asyncio.fixture + async def engine(self) -> None: + engine = PostgreSQLEngine.from_instance( + project_id=PROJECT_ID, + instance=INSTANCE, + region=REGION, + database=DATABASE, + ) + await engine.init_vectorstore_table( + "table4", + ADA_TOKEN_COUNT, + content_column="product", + embedding_column="product_embedding", + store_metadata=True, + ) + yield engine + await engine._aexecute_update(f"DROP TABLE table4") + + async def test_metadata_upload(self, engine): + texts = ["Hello, World!"] + metadatas = [{"field1": "value1", "field2": "value2"}] + vs = CloudSQLVectorStore( + table_name="table4", + embedding_service=FakeEmbeddingsWithAdaDimension(), + engine=engine, + content_column="product", + embedding_column="product_embedding", + ) + await vs.aadd_texts( + texts=texts, + metadatas=metadatas, + ) + output = await vs.asimilarity_search("Hello", k=1) + assert output[0].metadata == metadatas[0] + + async def test_override_on_init(self, engine): + await engine.init_vectorstore_table( + "table4", + ADA_TOKEN_COUNT, + content_column="product", + embedding_column="product_embedding", + store_metadata=True, + overwrite_existing=True, + ) + vs = CloudSQLVectorStore( + table_name="table4", + embedding_service=FakeEmbeddingsWithAdaDimension(), + engine=engine, + content_column="product", + embedding_column="product_embedding", + ) + + output = await vs.asimilarity_search("Hello", k=10) + assert len(output) == 0 + + async def test_override(self, engine): + texts = ["foo", "bar"] + vs = await CloudSQLVectorStore.afrom_texts( + texts=texts, + table_name="table4", + embedding_service=FakeEmbeddingsWithAdaDimension(), + engine=engine, + content_column="product", + embedding_column="product_embedding", + ) + vs2 = await CloudSQLVectorStore.afrom_texts( + texts=texts, + table_name="table4", + embedding_service=FakeEmbeddingsWithAdaDimension(), + engine=engine, + content_column="product", + embedding_column="product_embedding", + overwrite_existing=True, + ) + output = await vs2.asimilarity_search("foo", k=10) + assert len(output) == 2 + + +@pytest.mark.asyncio +class TestAsync: + @pytest_asyncio.fixture # (scope="function") + async def engine(self): + engine = await PostgreSQLEngine.from_instance( + project_id=PROJECT_ID, + instance=INSTANCE, + region=REGION, + database=DATABASE, + ) + await engine.init_vectorstore_table(DEFAULT_TABLE, ADA_TOKEN_COUNT) + yield engine + await engine._aexecute_update(f"DROP TABLE {DEFAULT_TABLE}") + + 
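These tests rely on FakeEmbeddingsWithAdaDimension being deterministic: every vector is all 1.0s except the final component, which encodes the text's position in the batch, and embed_query always reuses position 0. A quick standalone check of that property (hypothetical test, not part of this suite):

    def test_fake_embeddings_are_deterministic():
        emb = FakeEmbeddingsWithAdaDimension()
        docs = emb.embed_documents(["foo", "bar"])
        query = emb.embed_query("foo")
        # "foo" sits at index 0, so its embedding equals the query embedding
        # and similarity search returns it with distance 0.0.
        assert docs[0] == query
        # "bar" differs only in the last component, giving an L2 distance of 1.0.
        assert docs[1][:-1] == query[:-1]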
@pytest_asyncio.fixture # (scope="function") + async def engine_custom(self): + table_name = CUSTOM_TABLE + engine = await PostgreSQLEngine.from_instance( + project_id=PROJECT_ID, + instance=INSTANCE, + region=REGION, + database=DATABASE, + ) + await engine.init_vectorstore_table( + table_name, + ADA_TOKEN_COUNT, + metadata_columns=[Column("page", TEXT)], + ) + yield engine + # return engine + await engine._aexecute_update(f"DROP TABLE {table_name}") + + async def test_similarity_search(self, engine) -> None: + """Test end to end construction and search.""" + texts = ["foo", "bar", "baz"] + vs = await CloudSQLVectorStore.afrom_texts( + texts=texts, + table_name=DEFAULT_TABLE, + embedding_service=FakeEmbeddingsWithAdaDimension(), + engine=engine, + ) + output = await vs.asimilarity_search("foo", k=1) + assert output == [Document(page_content="foo")] + + async def test_with_metadatas(self, engine_custom) -> None: + """Test end to end construction and search.""" + texts = ["foo", "bar", "baz"] + metadatas = [{"page": str(i)} for i in range(len(texts))] + vs = await CloudSQLVectorStore.afrom_texts( + texts=texts, + metadatas=metadatas, + table_name=CUSTOM_TABLE, + embedding_service=FakeEmbeddingsWithAdaDimension(), + engine=engine_custom, + metadata_columns=["page"], + ) + output = await vs.asimilarity_search("foo", k=1) + assert output == [Document(page_content="foo", metadata={"page": "0"})] + + async def test_with_ids(self, engine_custom) -> None: + """Test end to end construction and search.""" + texts = ["foo", "bar", "baz"] + ids = [str(uuid.uuid4()) for i in range(len(texts))] + metadatas = [{"page": str(i)} for i in range(len(texts))] + vs = await CloudSQLVectorStore.afrom_texts( + texts=texts, + metadatas=metadatas, + ids=ids, + table_name=CUSTOM_TABLE, + embedding_service=FakeEmbeddingsWithAdaDimension(), + engine=engine_custom, + metadata_columns=["page"], + ) + output = await vs.adelete(ids) + assert output + + async def test_with_metadatas_with_scores( + self, + engine_custom, + ) -> None: + """Test end to end construction and search.""" + texts = ["foo", "bar", "baz"] + metadatas = [{"page": str(i)} for i in range(len(texts))] + vs = await CloudSQLVectorStore.afrom_texts( + texts=texts, + table_name=CUSTOM_TABLE, + embedding_service=FakeEmbeddingsWithAdaDimension(), + metadatas=metadatas, + engine=engine_custom, + metadata_columns=["page"], + ) + output = await vs.asimilarity_search_with_score("foo", k=1) + assert output == [ + (Document(page_content="foo", metadata={"page": "0"}), 0.0) + ] + + async def test_with_filter_match(self, engine_custom) -> None: + """Test end to end construction and search.""" + texts = ["foo", "bar", "baz"] + metadatas = [{"page": str(i)} for i in range(len(texts))] + vs = await CloudSQLVectorStore.afrom_texts( + texts=texts, + table_name=CUSTOM_TABLE, + embedding_service=FakeEmbeddingsWithAdaDimension(), + metadatas=metadatas, + engine=engine_custom, + metadata_columns=["page"], + ) + output = await vs.asimilarity_search_with_score( + "foo", k=1, filter="page = '0'" + ) + assert output == [ + (Document(page_content="foo", metadata={"page": "0"}), 0.0) + ] + + async def test_with_filter_distant_match( + self, + engine_custom, + ) -> None: + """Test end to end construction and search.""" + texts = ["foo", "bar", "baz"] + metadatas = [{"page": str(i)} for i in range(len(texts))] + vs = await CloudSQLVectorStore.afrom_texts( + texts=texts, + table_name=CUSTOM_TABLE, + embedding_service=FakeEmbeddingsWithAdaDimension(), + metadatas=metadatas, + 
engine=engine_custom, + metadata_columns=["page"], + ) + output = await vs.asimilarity_search_with_score( + "foo", k=1, filter="page = '2'" + ) + assert output == [ + ( + Document(page_content="baz", metadata={"page": "2"}), + 0.0025974069839586056, + ) + ] + + async def test_with_filter_no_match( + self, + engine_custom, + ) -> None: + """Test end to end construction and search.""" + texts = ["foo", "bar", "baz"] + metadatas = [{"page": str(i)} for i in range(len(texts))] + vs = await CloudSQLVectorStore.afrom_texts( + texts=texts, + table_name=CUSTOM_TABLE, + embedding_service=FakeEmbeddingsWithAdaDimension(), + metadatas=metadatas, + engine=engine_custom, + metadata_columns=["page"], + ) + output = await vs.asimilarity_search_with_score( + "foo", k=1, filter="page = '5'" + ) + assert output == [] + + async def test_relevance_score(self, engine_custom) -> None: + """Test to make sure the relevance score is scaled to 0-1.""" + texts = ["foo", "bar", "baz"] + metadatas = [{"page": str(i)} for i in range(len(texts))] + vs = await CloudSQLVectorStore.afrom_texts( + texts=texts, + table_name=CUSTOM_TABLE, + embedding_service=FakeEmbeddingsWithAdaDimension(), + metadatas=metadatas, + engine=engine_custom, + metadata_columns=["page"], + ) + + output = await vs.asimilarity_search_with_relevance_scores("foo", k=3) + assert output == [ + (Document(page_content="foo", metadata={"page": "0"}), 1.0), + ( + Document(page_content="bar", metadata={"page": "1"}), + 0.9993487462676214, + ), + ( + Document(page_content="baz", metadata={"page": "2"}), + 0.9974025930160414, + ), + ] + + async def test_max_marginal_relevance_search( + self, + engine, + ) -> None: + """Test max marginal relevance search.""" + texts = ["foo", "bar", "baz"] + vs = await CloudSQLVectorStore.afrom_texts( + texts=texts, + table_name=DEFAULT_TABLE, + embedding_service=embeddings_service, + engine=engine, + ) + output = await vs.amax_marginal_relevance_search("foo", k=1, fetch_k=3) + assert output == [Document(page_content="foo")] + + async def test_max_marginal_relevance_search_with_score( + self, + engine, + ) -> None: + """Test max marginal relevance search with relevance scores.""" + texts = ["foo", "bar", "baz"] + vs = await CloudSQLVectorStore.afrom_texts( + texts=texts, + table_name=DEFAULT_TABLE, + embedding_service=embeddings_service, + engine=engine, + ) + embedding = embeddings_service.embed_query(text="foo") + output = await vs.amax_marginal_relevance_search_with_score_by_vector( + embedding, k=1, fetch_k=3 + ) + assert output[0][0] == Document(page_content="foo") + assert output[0][1] > 0 + + async def test_max_marginal_relevance_search_amenities( + self, + engine_custom, + ) -> None: + """Test max marginal relevance search.""" + vs = CloudSQLVectorStore( + table_name="amenities", + embedding_service=embeddings_service, + engine=engine_custom, + ) + output = await vs.amax_marginal_relevance_search( + "coffee", k=1, fetch_k=3 + ) + assert "coffee" in output[0].page_content + + +@pytest.mark.asyncio +class TestIndex: + @pytest_asyncio.fixture() + async def vs(self): + table_name = "test_table2" + engine = await PostgreSQLEngine.from_instance( + project_id=PROJECT_ID, + instance=INSTANCE, + region=REGION, + database=DATABASE, + ) + await engine.init_vectorstore_table(table_name, ADA_TOKEN_COUNT) + texts = ["foo", "bar", "baz"] + vs = await CloudSQLVectorStore.afrom_texts( + texts=texts, + table_name="test_table2", + embedding_service=embeddings_service, + engine=engine, + ) + yield vs + # return engine + # await 
engine._aexecute_update(f"DROP TABLE {table_name}") + + async def test_applyindex(self, vs) -> None: + index = HNSWIndex() + await vs.aapply_index(index) + + async def test_applyindex_l2(self, vs) -> None: + index = HNSWIndex( + name="hnswl2", distance_strategy=DistanceStrategy.EUCLIDEAN + ) + await vs.aapply_index(index) + + async def test_applyindex_ip(self, vs) -> None: + index = IVFFlatIndex(distance_strategy=DistanceStrategy.INNER_PRODUCT) + await vs.aapply_index(index) + + async def test_reindex(self, vs) -> None: + """Test the creation and reindexing of index""" + output = await vs.areindex("langchainhnsw") + assert output is None + + async def test_dropindex(self, vs) -> None: + """Test the dropping of index""" + output = await vs.adrop_index("langchainivfflat") + await vs.adrop_index("langchainhnsw") + await vs.adrop_index("hnswl2") + assert output is None + + +# @pytest.mark.asyncio +# class TestSync: +# @pytest_asyncio.fixture(scope="function") +# async def engine(self): +# table_name = "test_table_sync" +# engine = await PostgreSQLEngine.from_instance( +# project_id=PROJECT_ID, +# instance=INSTANCE, +# region=REGION, +# database=DATABASE, +# ) +# await engine.init_vectorstore_table(table_name, ADA_TOKEN_COUNT) +# yield engine +# # return engine +# await engine._aexecute_update(f"DROP TABLE {table_name}") + +# @pytest_asyncio.fixture(scope="function") +# async def engine_custom(self): +# table_name = CUSTOM_TABLE +# engine = await PostgreSQLEngine.from_instance( +# project_id=PROJECT_ID, +# instance=INSTANCE, +# region=REGION, +# database=DATABASE, +# ) +# await engine.init_vectorstore_table( +# table_name, +# ADA_TOKEN_COUNT, +# metadata_columns=[Column("page", TEXT)], +# ) +# yield engine +# # return engine +# await engine._aexecute_update(f"DROP TABLE {table_name}") + +# def test(self, engine) -> None: +# """Test end to end construction and search.""" +# texts = ["foo", "bar", "baz"] +# vs = CloudSQLVectorStore.from_texts( +# texts=texts, +# table_name="test_table_sync", +# embedding_service=FakeEmbeddingsWithAdaDimension(), +# engine=engine, +# ) +# output = vs.similarity_search("foo", k=1) +# assert output == [Document(page_content="foo")] + +# def test_with_metadatas(self, engine_custom) -> None: +# """Test end to end construction and search.""" +# texts = ["foo", "bar", "baz"] +# metadatas = [{"page": str(i)} for i in range(len(texts))] +# vs = CloudSQLVectorStore.from_texts( +# texts=texts, +# metadatas=metadatas, +# table_name=CUSTOM_TABLE, +# embedding_service=FakeEmbeddingsWithAdaDimension(), +# engine=engine_custom, +# metadata_columns=["page"], +# ) +# output = vs.similarity_search("foo", k=1) +# assert output == [Document(page_content="foo", metadata={"page": "0"})] + +# def test_with_metadatas_with_scores( +# self, +# engine_custom, +# ) -> None: +# """Test end to end construction and search.""" +# texts = ["foo", "bar", "baz"] +# metadatas = [{"page": str(i)} for i in range(len(texts))] +# vs = CloudSQLVectorStore.from_texts( +# texts=texts, +# table_name=CUSTOM_TABLE, +# embedding_service=FakeEmbeddingsWithAdaDimension(), +# metadatas=metadatas, +# engine=engine_custom, +# metadata_columns=["page"], +# ) +# output = vs.similarity_search_with_score("foo", k=1) +# assert output == [ +# (Document(page_content="foo", metadata={"page": "0"}), 0.0) +# ] + +# def test_with_filter_match(self, engine_custom) -> None: +# """Test end to end construction and search.""" +# texts = ["foo", "bar", "baz"] +# metadatas = [{"page": str(i)} for i in range(len(texts))] +# vs = 
CloudSQLVectorStore.from_texts( +# texts=texts, +# table_name=CUSTOM_TABLE, +# embedding_service=FakeEmbeddingsWithAdaDimension(), +# metadatas=metadatas, +# engine=engine_custom, +# metadata_columns=["page"], +# ) +# output = vs.similarity_search_with_score( +# "foo", k=1, filter="page = '0'" +# ) +# assert output == [ +# (Document(page_content="foo", metadata={"page": "0"}), 0.0) +# ] + +# def test_with_filter_distant_match( +# self, +# engine_custom, +# ) -> None: +# """Test end to end construction and search.""" +# texts = ["foo", "bar", "baz"] +# metadatas = [{"page": str(i)} for i in range(len(texts))] +# vs = CloudSQLVectorStore.from_texts( +# texts=texts, +# table_name=CUSTOM_TABLE, +# embedding_service=FakeEmbeddingsWithAdaDimension(), +# metadatas=metadatas, +# engine=engine_custom, +# metadata_columns=["page"], +# ) +# output = vs.similarity_search_with_score( +# "foo", k=1, filter="page = '2'" +# ) +# assert output == [ +# ( +# Document(page_content="baz", metadata={"page": "2"}), +# 0.0025974069839586056, +# ) +# ] + +# def test_with_filter_no_match( +# self, +# engine_custom, +# ) -> None: +# """Test end to end construction and search.""" +# texts = ["foo", "bar", "baz"] +# metadatas = [{"page": str(i)} for i in range(len(texts))] +# vs = CloudSQLVectorStore.from_texts( +# texts=texts, +# table_name=CUSTOM_TABLE, +# embedding_service=FakeEmbeddingsWithAdaDimension(), +# metadatas=metadatas, +# engine=engine_custom, +# metadata_columns=["page"], +# ) +# output = vs.similarity_search_with_score( +# "foo", k=1, filter="page = '5'" +# ) +# assert output == [] + +# def test_relevance_score(self, engine_custom) -> None: +# """Test to make sure the relevance score is scaled to 0-1.""" +# texts = ["foo", "bar", "baz"] +# metadatas = [{"page": str(i)} for i in range(len(texts))] +# vs = CloudSQLVectorStore.from_texts( +# texts=texts, +# table_name=CUSTOM_TABLE, +# embedding_service=FakeEmbeddingsWithAdaDimension(), +# metadatas=metadatas, +# engine=engine_custom, +# metadata_columns=["page"], +# ) + +# output = vs.similarity_search_with_relevance_scores("foo", k=3) +# assert output == [ +# (Document(page_content="foo", metadata={"page": "0"}), 1.0), +# ( +# Document(page_content="bar", metadata={"page": "1"}), +# 0.9993487462676214, +# ), +# ( +# Document(page_content="baz", metadata={"page": "2"}), +# 0.9974025930160414, +# ), +# ] + +# def test_max_marginal_relevance_search( +# self, +# engine, +# ) -> None: +# """Test max marginal relevance search.""" +# texts = ["foo", "bar", "baz"] +# vs = CloudSQLVectorStore.from_texts( +# texts=texts, +# table_name="test_table_sync", +# embedding_service=embeddings_service, +# engine=engine, +# ) +# output = vs.max_marginal_relevance_search("foo", k=1, fetch_k=3) +# assert output == [Document(page_content="foo")] + +# def test_max_marginal_relevance_search_with_score( +# self, +# engine, +# ) -> None: +# """Test max marginal relevance search with relevance scores.""" +# texts = ["foo", "bar", "baz"] +# vs = CloudSQLVectorStore.from_texts( +# texts=texts, +# table_name="test_table_sync", +# embedding_service=embeddings_service, +# engine=engine, +# ) +# embedding = embeddings_service.embed_query(text="foo") +# output = vs.max_marginal_relevance_search_with_score_by_vector( +# embedding, k=1, fetch_k=3 +# ) +# assert output[0][0] == Document(page_content="foo") +# assert output[0][1] > 0 + +# def test_max_marginal_relevance_search_amenities( +# self, +# engine_custom, +# ) -> None: +# """Test max marginal relevance search.""" +# vs = 
CloudSQLVectorStore( +# table_name="amenities", +# embedding_service=embeddings_service, +# engine=engine_custom, +# ) +# output = vs.max_marginal_relevance_search( +# "coffee", k=1, fetch_k=3 +# ) +# assert "coffee" in output[0].page_content From 9962fa5768c0d609d05bf119e0004cb12bf1b2e3 Mon Sep 17 00:00:00 2001 From: Averi Kitsch Date: Thu, 8 Feb 2024 20:25:13 -0800 Subject: [PATCH 4/9] fix --- pyproject.toml | 1 + tests/test_cloudsql_vectorstore.py | 14 +++++++++++++- 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 6fd3fb2..5f1b469 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -27,6 +27,7 @@ test = [ "mypy==1.7.1", "pytest-asyncio==0.23.0", "pytest==7.4.4" + "langchain_google_vertexai" ] [build-system] diff --git a/tests/test_cloudsql_vectorstore.py b/tests/test_cloudsql_vectorstore.py index 63c2c64..72248bb 100644 --- a/tests/test_cloudsql_vectorstore.py +++ b/tests/test_cloudsql_vectorstore.py @@ -1,4 +1,16 @@ -"""Test cloudSQLVectorStore functionality.""" +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. import os import uuid From ea87ec3360da173ab5d14ed30b88a73e47981823 Mon Sep 17 00:00:00 2001 From: Averi Kitsch Date: Thu, 8 Feb 2024 20:26:49 -0800 Subject: [PATCH 5/9] lint --- pyproject.toml | 2 +- src/langchain_google_cloud_sql_pg/__init__.py | 4 +-- .../cloudsql_vectorstore.py | 34 +++++-------------- .../postgresql_engine.py | 8 +---- tests/test_cloudsql_vectorstore.py | 31 +++++------------ 5 files changed, 19 insertions(+), 60 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 5f1b469..d9acca3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -26,7 +26,7 @@ test = [ "isort==5.13.2", "mypy==1.7.1", "pytest-asyncio==0.23.0", - "pytest==7.4.4" + "pytest==7.4.4", "langchain_google_vertexai" ] diff --git a/src/langchain_google_cloud_sql_pg/__init__.py b/src/langchain_google_cloud_sql_pg/__init__.py index 7363b32..3ad6dd5 100644 --- a/src/langchain_google_cloud_sql_pg/__init__.py +++ b/src/langchain_google_cloud_sql_pg/__init__.py @@ -12,9 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from langchain_google_cloud_sql_pg.cloudsql_vectorstore import ( - CloudSQLVectorStore, -) +from langchain_google_cloud_sql_pg.cloudsql_vectorstore import CloudSQLVectorStore from langchain_google_cloud_sql_pg.postgresql_engine import PostgreSQLEngine __all__ = ["PostgreSQLEngine", "CloudSQLVectorStore"] diff --git a/src/langchain_google_cloud_sql_pg/cloudsql_vectorstore.py b/src/langchain_google_cloud_sql_pg/cloudsql_vectorstore.py index 829e6fa..96d5683 100644 --- a/src/langchain_google_cloud_sql_pg/cloudsql_vectorstore.py +++ b/src/langchain_google_cloud_sql_pg/cloudsql_vectorstore.py @@ -129,9 +129,7 @@ async def __post_init__(self) -> None: f"SELECT column_name, data_type FROM information_schema.columns WHERE table_name = '{self.table_name}'" ) if self.overwrite_existing: - await self.engine._aexecute_update( - f"TRUNCATE TABLE {self.table_name}" - ) + await self.engine._aexecute_update(f"TRUNCATE TABLE {self.table_name}") # async with self.engine.connect() as conn: results = await self.engine._aexecute_fetch(stmt) # Get field type information @@ -142,9 +140,7 @@ async def __post_init__(self) -> None: if self.id_column not in columns: raise ValueError(f"Id column, {self.id_column}, does not exist.") if self.content_column not in columns: - raise ValueError( - f"Content column, {self.content_column}, does not exist." - ) + raise ValueError(f"Content column, {self.content_column}, does not exist.") if self.embedding_column not in columns: raise ValueError( f"Embedding column, {self.embedding_column}, does not exist." @@ -189,9 +185,7 @@ async def aadd_embeddings( ids = [str(uuid.uuid4()) for _ in texts] if not metadatas: metadatas = [{} for _ in texts] - for id, content, embedding, metadata in zip( - ids, texts, embeddings, metadatas - ): + for id, content, embedding, metadata in zip(ids, texts, embeddings, metadatas): metadata_col_names = ( ", " + ", ".join(self.metadata_columns) if len(self.metadata_columns) > 0 @@ -205,9 +199,7 @@ async def aadd_embeddings( del extra[metadata_column] insert_stmt += ( - f", {self.metadata_json_column})" - if self.store_metadata - else ")" + f", {self.metadata_json_column})" if self.store_metadata else ")" ) values_stmt += f",'{extra}')" if self.store_metadata else ")" query = insert_stmt + values_stmt @@ -411,7 +403,6 @@ async def asimilarity_search_with_score( filter: str = None, **kwargs: Any, ) -> List[Tuple[Document, float]]: - embedding = self.embedding_service.embed_query(query) docs = await self.asimilarity_search_with_score_by_vector( embedding=embedding, k=k, filter=filter @@ -456,9 +447,7 @@ async def asimilarity_search_with_score_by_vector( filter: str = None, **kwargs: Any, ) -> List[Tuple[Document, float]]: - results = await self.__query_collection( - embedding=embedding, k=k, filter=filter - ) + results = await self.__query_collection(embedding=embedding, k=k, filter=filter) documents_with_scores = [] for row in results: @@ -567,9 +556,7 @@ async def amax_marginal_relevance_search_with_score_by_vector( k = self.k if self.k else k fetch_k = self.fetch_k if self.fetch_k else fetch_k lambda_mult = self.lambda_mult if self.lambda_mult else lambda_mult - embedding_list = [ - json.loads(row[self.embedding_column]) for row in results - ] + embedding_list = [json.loads(row[self.embedding_column]) for row in results] mmr_selected = maximal_marginal_relevance( np.array(embedding, dtype=np.float32), embedding_list, @@ -596,9 +583,7 @@ async def amax_marginal_relevance_search_with_score_by_vector( ) ) - return [ - r for i, r in 
enumerate(documents_with_scores) if i in mmr_selected - ] + return [r for i, r in enumerate(documents_with_scores) if i in mmr_selected] def _select_relevance_score_fn(self) -> Callable[[float], float]: if self.distance_strategy == DistanceStrategy.COSINE: @@ -619,13 +604,10 @@ async def aapply_index( index: Union[HNSWIndex, IVFFlatIndex, BruteForce], concurrently=False, ) -> None: - if isinstance(index, BruteForce): return None - filter = ( - f"WHERE ({index.partial_indexes})" if index.partial_indexes else "" - ) + filter = f"WHERE ({index.partial_indexes})" if index.partial_indexes else "" params = "WITH " + index.index_options() concurrently = "CONCURRENTLY" if concurrently else "" diff --git a/src/langchain_google_cloud_sql_pg/postgresql_engine.py b/src/langchain_google_cloud_sql_pg/postgresql_engine.py index c92fbdd..97fe8be 100644 --- a/src/langchain_google_cloud_sql_pg/postgresql_engine.py +++ b/src/langchain_google_cloud_sql_pg/postgresql_engine.py @@ -29,11 +29,7 @@ # from pgvector.asyncpg import register_vector from sqlalchemy import Column, text -from sqlalchemy.ext.asyncio import ( - AsyncConnection, - AsyncEngine, - create_async_engine, -) +from sqlalchemy.ext.asyncio import AsyncConnection, AsyncEngine, create_async_engine # nest_asyncio.apply() @@ -134,11 +130,9 @@ def from_instance( @classmethod def from_engine(cls, engine: AsyncEngine) -> PostgreSQLEngine: - return cls(engine=engine, key=PostgreSQLEngine.__create_key) async def _engine(self) -> AsyncEngine: - if self.engine is not None: return self.engine diff --git a/tests/test_cloudsql_vectorstore.py b/tests/test_cloudsql_vectorstore.py index 72248bb..7f458d0 100644 --- a/tests/test_cloudsql_vectorstore.py +++ b/tests/test_cloudsql_vectorstore.py @@ -52,8 +52,7 @@ class FakeEmbeddingsWithAdaDimension(FakeEmbeddings): def embed_documents(self, texts: List[str]) -> List[List[float]]: """Return simple embeddings.""" return [ - [float(1.0)] * (ADA_TOKEN_COUNT - 1) + [float(i)] - for i in range(len(texts)) + [float(1.0)] * (ADA_TOKEN_COUNT - 1) + [float(i)] for i in range(len(texts)) ] def embed_query(self, text: str = "default") -> List[float]: @@ -233,9 +232,7 @@ async def test_with_metadatas_with_scores( metadata_columns=["page"], ) output = await vs.asimilarity_search_with_score("foo", k=1) - assert output == [ - (Document(page_content="foo", metadata={"page": "0"}), 0.0) - ] + assert output == [(Document(page_content="foo", metadata={"page": "0"}), 0.0)] async def test_with_filter_match(self, engine_custom) -> None: """Test end to end construction and search.""" @@ -249,12 +246,8 @@ async def test_with_filter_match(self, engine_custom) -> None: engine=engine_custom, metadata_columns=["page"], ) - output = await vs.asimilarity_search_with_score( - "foo", k=1, filter="page = '0'" - ) - assert output == [ - (Document(page_content="foo", metadata={"page": "0"}), 0.0) - ] + output = await vs.asimilarity_search_with_score("foo", k=1, filter="page = '0'") + assert output == [(Document(page_content="foo", metadata={"page": "0"}), 0.0)] async def test_with_filter_distant_match( self, @@ -271,9 +264,7 @@ async def test_with_filter_distant_match( engine=engine_custom, metadata_columns=["page"], ) - output = await vs.asimilarity_search_with_score( - "foo", k=1, filter="page = '2'" - ) + output = await vs.asimilarity_search_with_score("foo", k=1, filter="page = '2'") assert output == [ ( Document(page_content="baz", metadata={"page": "2"}), @@ -296,9 +287,7 @@ async def test_with_filter_no_match( engine=engine_custom, 
metadata_columns=["page"], ) - output = await vs.asimilarity_search_with_score( - "foo", k=1, filter="page = '5'" - ) + output = await vs.asimilarity_search_with_score("foo", k=1, filter="page = '5'") assert output == [] async def test_relevance_score(self, engine_custom) -> None: @@ -371,9 +360,7 @@ async def test_max_marginal_relevance_search_amenities( embedding_service=embeddings_service, engine=engine_custom, ) - output = await vs.amax_marginal_relevance_search( - "coffee", k=1, fetch_k=3 - ) + output = await vs.amax_marginal_relevance_search("coffee", k=1, fetch_k=3) assert "coffee" in output[0].page_content @@ -405,9 +392,7 @@ async def test_applyindex(self, vs) -> None: await vs.aapply_index(index) async def test_applyindex_l2(self, vs) -> None: - index = HNSWIndex( - name="hnswl2", distance_strategy=DistanceStrategy.EUCLIDEAN - ) + index = HNSWIndex(name="hnswl2", distance_strategy=DistanceStrategy.EUCLIDEAN) await vs.aapply_index(index) async def test_applyindex_ip(self, vs) -> None: From dc9b9d542ce868b2fd32b27adad8735b054a3dd8 Mon Sep 17 00:00:00 2001 From: Averi Kitsch Date: Thu, 8 Feb 2024 20:32:01 -0800 Subject: [PATCH 6/9] lint --- docs/vector_store.ipynb | 1295 ++++++++++++++-------------- tests/test_cloudsql_vectorstore.py | 39 +- 2 files changed, 673 insertions(+), 661 deletions(-) diff --git a/docs/vector_store.ipynb b/docs/vector_store.ipynb index 70f5b71..8d49e5f 100644 --- a/docs/vector_store.ipynb +++ b/docs/vector_store.ipynb @@ -1,657 +1,654 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "E_RJy7C1bpCT" - }, - "source": [ - "# CloudSQLVectorStore\n", - "> **CloudSQLVectorStore**:\n", - "CloudSQLVectorStore lets you create vector stores on the Cloud SQL for PostgreSQL database. It also allows for semantic search, using vector indexes for fast approximate results, or using brute force for exact results.\n", - "\n", - "\n", - "This tutorial illustrates how to work with an end-to-end data and embedding management system in LangChain, and provide scalable semantic search in CloudSQL for PostgreSQL." 
- ]
- },
- {
- "cell_type": "markdown",
- "source": [
- "### Pre-requisites"
- ],
- "metadata": {
- "id": "xjcxaw6--Xyy"
- }
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "IR54BmgvdHT_"
- },
- "source": [
- "### Install the library"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/",
- "height": 1000
- },
- "id": "0ZITIDE160OD",
- "outputId": "90e0636e-ff34-4e1e-ad37-d2a6db4a317e"
- },
- "outputs": [
- {
- "output_type": "stream",
- "name": "stdout",
- "text": [
- "(pip dependency-resolution log and progress bars trimmed)\n",
- "Successfully installed asyncio-3.4.3 asyncpg-0.29.0 dataclasses-json-0.6.4 google-cloud-0.34.0 google-cloud-aiplatform-1.40.0 jsonpatch-1.33 jsonpointer-2.4 langchain-0.1.5 langchain-community-0.0.18 langchain-core-0.1.19 langsmith-0.0.87 marshmallow-3.20.2 mypy-extensions-1.0.0 typing-inspect-0.9.0\n",
- "Successfully installed cloud-sql-python-connector-1.6.0\n"
- ]
- }
- ],
- "source": [
- "! pip install langchain langchain-community google-cloud google-cloud-aiplatform asyncio asyncpg --upgrade --user\n",
- "! pip install \"cloud-sql-python-connector[asyncpg]\""
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "v40bB_GMcr9f"
- },
- "source": [
- "**Colab only:** Uncomment the following cell to restart the kernel, or use the button to restart the kernel. For Vertex AI Workbench you can restart the terminal using the button on top."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "6o0iGVIdDD6K"
- },
- "outputs": [],
- "source": [
- "# # Automatically restart kernel after installs so that your environment can access the new packages\n",
- "# import IPython\n",
- "\n",
- "# app = IPython.Application.instance()\n",
- "# app.kernel.do_shutdown(True)"
- ]
- },
- {
- "cell_type": "markdown",
- "source": [
- "### Note\n",
- "\n",
- "If you do not have a GCP project, follow the link below to create a new project:\n",
- "\n",
- "[Create a Google Cloud project](https://developers.google.com/workspace/guides/create-project)\n"
- ],
- "metadata": {
- "id": "cTXTbj4UltKf"
- }
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "Uj02bMRAc9_c"
- },
- "source": [
- "#### Set your project ID\n",
- "\n",
- "If you don't know your project ID, try the following:\n",
- "* Run `gcloud config list`.\n",
- "* Run `gcloud projects list`.\n",
- "* See the support page: [Locate the project ID](https://support.google.com/googleapi/answer/7014113)."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/"
- },
- "id": "wnp1R1PYc9_c",
- "outputId": "6502c721-a2fd-451f-b946-9f7b850d5966"
- },
- "outputs": [
- {
- "output_type": "stream",
- "name": "stdout",
- "text": [
- "Updated property [core/project].\n"
- ]
- }
- ],
- "source": [
- "# @title Project { display-mode: \"form\" }\n",
- "PROJECT_ID = \"gcp_project_id\" # @param {type:\"string\"}\n",
- "\n",
- "# Set the project id\n",
- "! gcloud config set project {PROJECT_ID}"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "38OFiUrIc9_c"
- },
- "source": [
- "#### Set the region\n",
- "\n",
- "You can also change the `REGION` variable used by CloudSQL Postgres. Learn more about [CloudSQL Postgres regions](https://cloud.google.com/sql/docs/postgres/locations)."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "DWQxsk80c9_d"
- },
- "outputs": [],
- "source": [
- "# @title Region { display-mode: \"form\" }\n",
- "REGION = \"US\" # @param {type: \"string\"}"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "aG5_tNwHc9_d"
- },
- "source": [
- "#### Set the instance, database, and table names\n",
- "\n",
- "These identify your CloudSQL Postgres vector store."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 1,
- "metadata": {
- "id": "F8gPQnbDc9_d"
- },
- "outputs": [],
- "source": [
- "# @title Instance, Database and Table { display-mode: \"form\" }\n",
- "INSTANCE = \"my_cloudsql_instance\" # @param {type: \"string\"}\n",
- "DATABASE = \"my_langchain_database\" # @param {type: \"string\"}\n",
- "TABLE = \"doc_and_vectors\" # @param {type: \"string\"}"
- ]
- },
- {
- "cell_type": "markdown",
- "source": [
- "### Pre-requisites for connecting to the CloudSQL instance\n",
- "\n",
- "To connect to the PostgreSQL instance, make sure to set up the Cloud SQL Auth Proxy and ensure that IAM users are added to the list of authenticated users allowed to connect to the instance.\n",
- "\n",
- "Refer to this [link](https://github.com/GoogleCloudPlatform/cloud-sql-proxy) to set up the Auth Proxy.\n",
- "\n",
- "Refer to this [link](https://cloud.google.com/sql/docs/postgres/users?_ga=2.165429503.-1722697531.1694071937) to add users to the instance."
- ],
- "metadata": {
- "id": "W6wxYasx_EKB"
- }
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "w7JEEj49c9_d"
- },
- "source": [
- "### Authenticating your notebook environment\n",
- "\n",
- "- If you are using **Colab** to run this notebook, uncomment the cell below and continue.\n",
- "- If you are using **Vertex AI Workbench**, check out the setup instructions [here](https://github.com/GoogleCloudPlatform/generative-ai/tree/main/setup-env)."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "1JZmXFavc9_d"
- },
- "outputs": [],
- "source": [
- "from google.colab import auth as google_auth\n",
- "\n",
- "google_auth.authenticate_user()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "AD3yG49BdLlr"
- },
- "source": [
- "## Demo: CloudSQL Postgres VectorSearch"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "vMi7sXhtc9_e"
- },
- "source": [
- "### Create an embedding class instance\n",
- "\n",
- "You may need to enable the Vertex AI API in your project by running\n",
- "`gcloud services enable aiplatform.googleapis.com --project {PROJECT_ID}`\n",
- "(replace `{PROJECT_ID}` with the name of your project).\n",
- "\n",
- "You can use any [LangChain embeddings model](https://python.langchain.com/docs/integrations/text_embedding/)."
- ]
- },
- {
- "cell_type": "code",
- "source": [
- "# Importing the necessary libraries\n",
- "from langchain_community.vectorstores.cloudSQL import CloudSQLVectorStore\n",
- "from langchain_community.vectorstores.cloudSQL import CloudSQLEngine\n",
- "from langchain_community.vectorstores.cloudSQL import HNSWIndex"
- ],
- "metadata": {
- "id": "TuH9AOl58bAs"
- },
- "execution_count": null,
- "outputs": []
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "Vb2RJocV9_LQ"
- },
- "outputs": [],
- "source": [
- "from langchain_community.embeddings import VertexAIEmbeddings\n",
- "\n",
- "embedding = VertexAIEmbeddings(\n",
- " model_name=\"textembedding-gecko@latest\", project=PROJECT_ID\n",
- ")"
- ]
- },
- {
- "cell_type": "markdown",
- "source": [
- "### Create a CloudSQLEngine to connect to the database"
- ],
- "metadata": {
- "id": "D9Xs2qhm6X56"
- }
- },
- {
- "cell_type": "code",
- "source": [
- "# CloudSQLVectorStore requires an engine created with the CloudSQLEngine class\n",
- "engine = CloudSQLEngine.from_instance(\n",
- " region = \"region_name\",\n",
- " instance = \"instance_name\",\n",
- " database = \"dbname\"\n",
- ")"
- ],
- "metadata": {
- "id": "avlyHEMn6gzU"
- },
- "execution_count": null,
- "outputs": []
- },
- {
- "cell_type": "markdown",
- "source": [
- "### Create a CloudSQLVectorStore to create a table"
- ],
- "metadata": {
- "id": "e1tl0aNx7SWy"
- }
- },
- {
- "cell_type": "code",
- "source": [
- "# Creating a basic CloudSQLVectorStore object\n",
- "db = CloudSQLVectorStore(\n",
- " engine=engine,\n",
- " table_name='table_name',\n",
- " embedding_service=embedding)\n",
- "\n",
- "# Alternatively, a non-default vector store object can be created by tweaking the following args:\n",
- "# vector_size - Defaults to 768. Can be set to the vector size of your choice.\n",
- "# content_column - Defaults to 'content'. Can be set to any column name of choice.\n",
- "# embedding_column - Defaults to 'embedding'. Can be set to any column name of choice.\n",
- "# metadata_columns - Defaults to 'metadata'. Can be set to any name or list of names of choice.\n",
- "# ignore_metadata_columns - Defaults to None. Can be set to any name or list of names of choice.\n",
- "# index_query_options - Defaults to None. Can be set using HNSWIndex.QueryOptions() or IVFFlatIndex.QueryOptions().\n",
- "# index - Defaults to a HNSWIndex object. Can be set to an IVFFlatIndex or BruteForce object.\n",
- "# distance_strategy - Defaults to 'L2'. Can be set to 'INNER PRODUCT' or 'COSINE'.\n",
- "# overwrite_existing - Defaults to False. Can be set to True if the table needs to be overwritten."
- ],
- "metadata": {
- "id": "z-AZyzAQ7bsf"
- },
- "execution_count": null,
- "outputs": []
- },
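For illustration, a non-default store built from the arguments listed in the cell above might look like the following sketch (the values are placeholders; `engine` and `embedding` come from the earlier cells):

db_custom = CloudSQLVectorStore(
    engine=engine,
    table_name='table_name',
    embedding_service=embedding,
    vector_size=768,
    content_column='content',
    embedding_column='embedding',
    distance_strategy='COSINE',
    overwrite_existing=True,
)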
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "PeOMpftjc9_e"
- },
- "source": [
- "### Add texts\n",
- "This method adds texts into the table."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "cwvi_O5Wc9_e"
- },
- "outputs": [],
- "source": [
- "texts = [\"Apples and oranges\", \"Cars and airplanes\", \"Pineapple\", \"Train\", \"Banana\"]\n",
- "metadatas = [{\"len\": len(t)} for t in texts]\n",
- "await db.add_texts(texts=texts, metadatas=metadatas)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "kSkL9l1Hc9_e"
- },
- "source": [
- "### Search for documents\n",
- "The default distance strategy used for querying similar documents is L2."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "Q4pCL2I_c9_f"
- },
- "outputs": [],
- "source": [
- "query = \"I'd like a fruit.\"\n",
- "docs = await db.similarity_search(query)\n",
- "print(docs)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "5R6h0_Cvc9_f"
- },
- "source": [
- "### Search for documents by vector\n",
- "Search for similar documents by passing a list of embeddings as the parameter.\n",
- "\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "NGNdS7cqc9_f"
- },
- "outputs": [],
- "source": [
- "query_vector = embedding.embed_query(query)\n",
- "docs = await db.asimilarity_search_by_vector(query_vector, k=2)\n",
- "print(docs)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "yKw_Lab-c9_f"
- },
- "source": [
- "### Search for documents with a metadata filter\n",
- "Apply additional metadata filtering to the search."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "uyYDfbMKc9_f"
- },
- "outputs": [],
- "source": [
- "# This should only return the \"Banana\" document.\n",
- "docs = await db.asimilarity_search_by_vector(query_vector, filter={\"len\": 6})\n",
- "print(docs)"
- ]
- },
- {
- "cell_type": "markdown",
- "source": [
- "### Maximum Marginal Relevance search (MMR)\n",
- "Maximal marginal relevance optimizes for similarity to the query AND diversity among the selected documents.\n",
- "\n"
- ],
- "metadata": {
- "id": "IPhxeqGr7sOS"
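The MMR code cell itself falls outside this excerpt of the hunk; based on the method exercised by the removed tests above, the call this cell introduces would look roughly like the following sketch (assuming the `db` store and `query` from the earlier cells):

# Fetch a larger candidate set, then keep the most diverse results (a sketch).
docs = await db.max_marginal_relevance_search(query, k=2, fetch_k=4)
print(docs)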
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "E_RJy7C1bpCT"
+ },
+ "source": [
+ "# CloudSQLVectorStore\n",
+ "> **CloudSQLVectorStore**:\n",
+ "CloudSQLVectorStore lets you create vector stores on the Cloud SQL for PostgreSQL database. It also allows for semantic search, using vector indexes for fast approximate results, or using brute force for exact results.\n",
+ "\n",
+ "\n",
+ "This tutorial illustrates how to work with an end-to-end data and embedding management system in LangChain, and how to provide scalable semantic search in CloudSQL for PostgreSQL."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "### Pre-requisites"
+ ],
+ "metadata": {
+ "id": "xjcxaw6--Xyy"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "IR54BmgvdHT_"
+ },
+ "source": [
+ "### Install the library"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 1000
+ },
+ "id": "0ZITIDE160OD",
+ "outputId": "90e0636e-ff34-4e1e-ad37-d2a6db4a317e"
+ },
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "(duplicate pip install log trimmed; see the identical output in the removed cell above)\n"
/usr/local/lib/python3.10/dist-packages (from google-auth<3.0.dev0,>=2.14.1->google-api-core[grpc]!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.*,!=2.4.*,!=2.5.*,!=2.6.*,!=2.7.*,<3.0.0dev,>=1.32.0->google-cloud-aiplatform) (0.3.0)\n", + "Requirement already satisfied: six>=1.9.0 in /usr/local/lib/python3.10/dist-packages (from google-auth<3.0.dev0,>=2.14.1->google-api-core[grpc]!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.*,!=2.4.*,!=2.5.*,!=2.6.*,!=2.7.*,<3.0.0dev,>=1.32.0->google-cloud-aiplatform) (1.16.0)\n", + "Requirement already satisfied: rsa<5,>=3.1.4 in /usr/local/lib/python3.10/dist-packages (from google-auth<3.0.dev0,>=2.14.1->google-api-core[grpc]!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.*,!=2.4.*,!=2.5.*,!=2.6.*,!=2.7.*,<3.0.0dev,>=1.32.0->google-cloud-aiplatform) (4.9)\n", + "Requirement already satisfied: google-crc32c<2.0dev,>=1.0 in /usr/local/lib/python3.10/dist-packages (from google-resumable-media<3.0dev,>=0.6.0->google-cloud-bigquery<4.0.0dev,>=1.15.0->google-cloud-aiplatform) (1.5.0)\n", + "Collecting mypy-extensions>=0.3.0 (from typing-inspect<1,>=0.4.0->dataclasses-json<0.7,>=0.5.7->langchain)\n", + " Downloading mypy_extensions-1.0.0-py3-none-any.whl (4.7 kB)\n", + "Requirement already satisfied: pyasn1<0.6.0,>=0.4.6 in /usr/local/lib/python3.10/dist-packages (from pyasn1-modules>=0.2.1->google-auth<3.0.dev0,>=2.14.1->google-api-core[grpc]!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.*,!=2.4.*,!=2.5.*,!=2.6.*,!=2.7.*,<3.0.0dev,>=1.32.0->google-cloud-aiplatform) (0.5.1)\n", + "Installing collected packages: google-cloud, asyncio, mypy-extensions, marshmallow, jsonpointer, asyncpg, typing-inspect, jsonpatch, langsmith, dataclasses-json, langchain-core, langchain-community, langchain, google-cloud-aiplatform\n", + "\u001b[33m WARNING: The script langsmith is installed in '/root/.local/bin' which is not on PATH.\n", + " Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\u001b[0m\u001b[33m\n", + "\u001b[0m\u001b[33m WARNING: The script langchain-server is installed in '/root/.local/bin' which is not on PATH.\n", + " Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\u001b[0m\u001b[33m\n", + "\u001b[0m\u001b[33m WARNING: The script tb-gcp-uploader is installed in '/root/.local/bin' which is not on PATH.\n", + " Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\u001b[0m\u001b[33m\n", + "\u001b[0mSuccessfully installed asyncio-3.4.3 asyncpg-0.29.0 dataclasses-json-0.6.4 google-cloud-0.34.0 google-cloud-aiplatform-1.40.0 jsonpatch-1.33 jsonpointer-2.4 langchain-0.1.5 langchain-community-0.0.18 langchain-core-0.1.19 langsmith-0.0.87 marshmallow-3.20.2 mypy-extensions-1.0.0 typing-inspect-0.9.0\n" + ] + }, + { + "output_type": "display_data", + "data": { + "application/vnd.colab-display-data+json": { + "pip_warning": { + "packages": [ + "asyncio", + "google" + ] + } } - }, - { - "cell_type": "code", - "source": [ - "# This should return top 4 relevant documents to the given query\n", - "docs = await db.amax_marginal_relevance_search(query)\n", - "print(docs)" - ], - "metadata": { - "id": "zmnGOrTT71BF" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "###Indexing\n", - "Setting custom indexes/ rebuilding indexes" - ], - "metadata": { - "id": "_K68SOsq73Tc" + }, + "metadata": {} + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Collecting cloud-sql-python-connector[asyncpg]\n", 
+ " Downloading cloud_sql_python_connector-1.6.0-py2.py3-none-any.whl (35 kB)\n", + "Requirement already satisfied: aiohttp in /usr/local/lib/python3.10/dist-packages (from cloud-sql-python-connector[asyncpg]) (3.9.3)\n", + "Requirement already satisfied: cryptography>=38.0.3 in /usr/local/lib/python3.10/dist-packages (from cloud-sql-python-connector[asyncpg]) (42.0.2)\n", + "Requirement already satisfied: Requests in /usr/local/lib/python3.10/dist-packages (from cloud-sql-python-connector[asyncpg]) (2.31.0)\n", + "Requirement already satisfied: google-auth in /usr/local/lib/python3.10/dist-packages (from cloud-sql-python-connector[asyncpg]) (2.17.3)\n", + "Requirement already satisfied: asyncpg>=0.29.0 in /root/.local/lib/python3.10/site-packages (from cloud-sql-python-connector[asyncpg]) (0.29.0)\n", + "Requirement already satisfied: async-timeout>=4.0.3 in /usr/local/lib/python3.10/dist-packages (from asyncpg>=0.29.0->cloud-sql-python-connector[asyncpg]) (4.0.3)\n", + "Requirement already satisfied: cffi>=1.12 in /usr/local/lib/python3.10/dist-packages (from cryptography>=38.0.3->cloud-sql-python-connector[asyncpg]) (1.16.0)\n", + "Requirement already satisfied: aiosignal>=1.1.2 in /usr/local/lib/python3.10/dist-packages (from aiohttp->cloud-sql-python-connector[asyncpg]) (1.3.1)\n", + "Requirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->cloud-sql-python-connector[asyncpg]) (23.2.0)\n", + "Requirement already satisfied: frozenlist>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from aiohttp->cloud-sql-python-connector[asyncpg]) (1.4.1)\n", + "Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.10/dist-packages (from aiohttp->cloud-sql-python-connector[asyncpg]) (6.0.5)\n", + "Requirement already satisfied: yarl<2.0,>=1.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->cloud-sql-python-connector[asyncpg]) (1.9.4)\n", + "Requirement already satisfied: cachetools<6.0,>=2.0.0 in /usr/local/lib/python3.10/dist-packages (from google-auth->cloud-sql-python-connector[asyncpg]) (5.3.2)\n", + "Requirement already satisfied: pyasn1-modules>=0.2.1 in /usr/local/lib/python3.10/dist-packages (from google-auth->cloud-sql-python-connector[asyncpg]) (0.3.0)\n", + "Requirement already satisfied: six>=1.9.0 in /usr/local/lib/python3.10/dist-packages (from google-auth->cloud-sql-python-connector[asyncpg]) (1.16.0)\n", + "Requirement already satisfied: rsa<5,>=3.1.4 in /usr/local/lib/python3.10/dist-packages (from google-auth->cloud-sql-python-connector[asyncpg]) (4.9)\n", + "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from Requests->cloud-sql-python-connector[asyncpg]) (3.3.2)\n", + "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from Requests->cloud-sql-python-connector[asyncpg]) (3.6)\n", + "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from Requests->cloud-sql-python-connector[asyncpg]) (2.0.7)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from Requests->cloud-sql-python-connector[asyncpg]) (2024.2.2)\n", + "Requirement already satisfied: pycparser in /usr/local/lib/python3.10/dist-packages (from cffi>=1.12->cryptography>=38.0.3->cloud-sql-python-connector[asyncpg]) (2.21)\n", + "Requirement already satisfied: pyasn1<0.6.0,>=0.4.6 in /usr/local/lib/python3.10/dist-packages (from 
pyasn1-modules>=0.2.1->google-auth->cloud-sql-python-connector[asyncpg]) (0.5.1)\n", + "Installing collected packages: cloud-sql-python-connector\n", + "Successfully installed cloud-sql-python-connector-1.6.0\n" + ] + }, + { + "output_type": "display_data", + "data": { + "application/vnd.colab-display-data+json": { + "pip_warning": { + "packages": [ + "google" + ] + } } - }, - { - "cell_type": "code", - "source": [ - "# This would return None if index is rebuilt or created.\n", - "index = HNSWIndex()\n", - "await db.areindex(index)" - ], - "metadata": { - "id": "aZdo-WVM77I7" - }, - "execution_count": null, - "outputs": [] + }, + "metadata": {} } - ], - "metadata": { + ], + "source": [ + "! pip install langchain langchain-community google-cloud google-cloud-aiplatform asyncio asyncpg --upgrade --user\n", + "! pip install \"cloud-sql-python-connector[asyncpg]\"" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "v40bB_GMcr9f" + }, + "source": [ + "**Colab only:** Uncomment the following cell to restart the kernel, or use the button to restart the kernel. For Vertex AI Workbench, you can restart the terminal using the button on top." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "6o0iGVIdDD6K" + }, + "outputs": [], + "source": [ + "# # Automatically restart kernel after installs so that your environment can access the new packages\n", + "# import IPython\n", + "\n", + "# app = IPython.Application.instance()\n", + "# app.kernel.do_shutdown(True)" + ] + }, + { + "cell_type": "markdown", + "source": [ + "### Note\n", + "\n", + "If you do not have a Google Cloud project, follow the link below to create one.\n", + "\n", + "[Create a Google Cloud project](https://developers.google.com/workspace/guides/create-project)\n" + ], + "metadata": { + "id": "cTXTbj4UltKf" + } + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Uj02bMRAc9_c" + }, + "source": [ + "#### Set your project ID\n", + "\n", + "If you don't know your project ID, try the following:\n", + "* Run `gcloud config list`.\n", + "* Run `gcloud projects list`.\n", + "* See the support page: [Locate the project ID](https://support.google.com/googleapi/answer/7014113)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "wnp1R1PYc9_c", + "outputId": "6502c721-a2fd-451f-b946-9f7b850d5966" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Updated property [core/project].\n" + ] } + ], + "source": [ + "# @title Project { display-mode: \"form\" }\n", + "PROJECT_ID = \"gcp_project_id\" # @param {type:\"string\"}\n", + "\n", + "# Set the project id\n", + "! gcloud config set project {PROJECT_ID}" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "38OFiUrIc9_c" + }, + "source": [ + "#### Set the region\n", + "\n", + "You can also change the `REGION` variable used by CloudSQL Postgres. Learn more about [CloudSQL Postgres regions](https://cloud.google.com/sql/docs/postgres/locations)."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "DWQxsk80c9_d" + }, + "outputs": [], + "source": [ + "# @title Region { display-mode: \"form\" }\n", + "REGION = \"US\" # @param {type: \"string\"}" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "aG5_tNwHc9_d" + }, + "source": [ + "#### Set the instance, database, and table names\n", + "\n", + "These identify the CloudSQL Postgres instance, database, and table that will back your vector store." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "id": "F8gPQnbDc9_d" + }, + "outputs": [], + "source": [ + "# @title Instance, Database and Table { display-mode: \"form\" }\n", + "INSTANCE = \"my_cloudsql_instance\" # @param {type: \"string\"}\n", + "DATABASE = \"my_langchain_database\" # @param {type: \"string\"}\n", + "TABLE = \"doc_and_vectors\" # @param {type: \"string\"}" + ] + }, + { + "cell_type": "markdown", + "source": [ + "### Prerequisites for connecting to the CloudSQL instance\n", + "\n", + "To connect to the PostgreSQL instance, set up the Cloud SQL Auth Proxy and add your IAM users to the list of users authorized to connect to the instance.\n", + "\n", + "Refer to this [link](https://github.com/GoogleCloudPlatform/cloud-sql-proxy) to set up the Auth Proxy.\n", + "\n", + "Refer to this [link](https://cloud.google.com/sql/docs/postgres/users) to add users to the instance." + ], + "metadata": { + "id": "W6wxYasx_EKB" + } + }, + { + "cell_type": "markdown", + "metadata": { + "id": "w7JEEj49c9_d" + }, + "source": [ + "### Authenticating your notebook environment\n", + "\n", + "- If you are using **Colab** to run this notebook, uncomment the cell below and continue.\n", + "- If you are using **Vertex AI Workbench**, check out the setup instructions [here](https://github.com/GoogleCloudPlatform/generative-ai/tree/main/setup-env)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "1JZmXFavc9_d" + }, + "outputs": [], + "source": [ + "from google.colab import auth as google_auth\n", + "\n", + "google_auth.authenticate_user()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "AD3yG49BdLlr" + }, + "source": [ + "## Demo: CloudSQL Postgres VectorSearch" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "vMi7sXhtc9_e" + }, + "source": [ + "### Create an embedding class instance\n", + "\n", + "You may need to enable the Vertex AI API in your project by running\n", + "`gcloud services enable aiplatform.googleapis.com --project {PROJECT_ID}`\n", + "(replace `{PROJECT_ID}` with the name of your project).\n", + "\n", + "You can use any [LangChain embeddings model](https://python.langchain.com/docs/integrations/text_embedding/)."
+ ] + }, + { + "cell_type": "code", + "source": [ + "# Importing the necessary libraries\n", + "from langchain_community.vectorstores.cloudSQL import CloudSQLVectorStore\n", + "from langchain_community.vectorstores.cloudSQL import CloudSQLEngine\n", + "from langchain_community.vectorstores.cloudSQL import HNSWIndex" + ], + "metadata": { + "id": "TuH9AOl58bAs" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Vb2RJocV9_LQ" + }, + "outputs": [], + "source": [ + "from langchain_community.embeddings import VertexAIEmbeddings\n", + "\n", + "embedding = VertexAIEmbeddings(\n", + " model_name=\"textembedding-gecko@latest\", project=PROJECT_ID\n", + ")" + ] + }, + { + "cell_type": "markdown", + "source": [ + "### Create CloudSQLEngine to connect to the database" + ], + "metadata": { + "id": "D9Xs2qhm6X56" + } + }, + { + "cell_type": "code", + "source": [ + "# CloudSQLVectorStore requires an engine created with the CloudSQLEngine class\n", + "engine = CloudSQLEngine.from_instance(\n", + " region=\"region_name\", instance=\"instance_name\", database=\"dbname\"\n", + ")" + ], + "metadata": { + "id": "avlyHEMn6gzU" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "### Create a CloudSQLVectorStore and initialize the table" + ], + "metadata": { + "id": "e1tl0aNx7SWy" + } + }, + { + "cell_type": "code", + "source": [ + "# Creating a basic CloudSQLVectorStore object\n", + "db = CloudSQLVectorStore(\n", + " engine=engine, table_name=\"table_name\", embedding_service=embedding\n", + ")\n", + "\n", + "# Alternatively, a non-default vector store can be created by tweaking the following args:\n", + "# vector_size - Defaults to 768; set it to match the output size of your embedding model.\n", + "# content_column - Defaults to 'content'; can be set to any column name of choice.\n", + "# embedding_column - Defaults to 'embedding'; can be set to any column name of choice.\n", + "# metadata_columns - Defaults to 'metadata'; accepts a single name or a list of names.\n", + "# ignore_metadata_columns - Defaults to None; accepts a single name or a list of names.\n", + "# index_query_options - Defaults to None; set with HNSWIndex.QueryOptions() or IVFFlatIndex.QueryOptions().\n", + "# index - Defaults to an HNSWIndex object; can be an IVFFlatIndex or BruteForce object instead.\n", + "# distance_strategy - Defaults to 'L2'; can be set to 'INNER PRODUCT' or 'COSINE'.\n", + "# overwrite_existing - Defaults to False; set to True to truncate the table before inserting data.\n",
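+ "\n", + "# Illustrative sketch only: the column names, metadata columns, and index choice\n", + "# below are placeholder values, not library defaults, and IVFFlatIndex would need\n", + "# to be imported alongside HNSWIndex above.\n", + "# db_custom = CloudSQLVectorStore(\n", + "#     engine=engine,\n", + "#     table_name=\"table_name\",\n", + "#     embedding_service=embedding,\n", + "#     vector_size=768,\n", + "#     content_column=\"my_content\",\n", + "#     embedding_column=\"my_embedding\",\n", + "#     metadata_columns=[\"len\"],\n", + "#     index=IVFFlatIndex(),\n", + "#     distance_strategy=\"COSINE\",\n", + "#     overwrite_existing=True,\n", + "# )"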
+ ], + "metadata": { + "id": "z-AZyzAQ7bsf" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "PeOMpftjc9_e" + }, + "source": [ + "### Add texts\n", + "This method adds texts and their embeddings to the table." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "cwvi_O5Wc9_e" + }, + "outputs": [], + "source": [ + "texts = [\"Apples and oranges\", \"Cars and airplanes\", \"Pineapple\", \"Train\", \"Banana\"]\n", + "metadatas = [{\"len\": len(t)} for t in texts]\n", + "await db.add_texts(texts=texts, metadatas=metadatas)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "kSkL9l1Hc9_e" + }, + "source": [ + "### Search for documents\n", + "The default distance strategy used for querying similar documents is L2 (Euclidean distance)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Q4pCL2I_c9_f" + }, + "outputs": [], + "source": [ + "query = \"I'd like a fruit.\"\n", + "docs = await db.similarity_search(query)\n", + "print(docs)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "5R6h0_Cvc9_f" + }, + "source": [ + "### Search for documents by vector\n", + "Search for similar documents by passing an embedding vector instead of the query text.\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "NGNdS7cqc9_f" + }, + "outputs": [], + "source": [ + "query_vector = embedding.embed_query(query)\n", + "docs = await db.asimilarity_search_by_vector(query_vector, k=2)\n", + "print(docs)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "yKw_Lab-c9_f" + }, + "source": [ + "### Search for documents with metadata filter\n", + "Narrow the results with an additional filter on document metadata." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "uyYDfbMKc9_f" + }, + "outputs": [], + "source": [ + "# This should only return the \"Banana\" document.\n", + "docs = await db.asimilarity_search_by_vector(query_vector, filter={\"len\": 6})\n", + "print(docs)" + ] + }, + { + "cell_type": "markdown", + "source": [ + "### Maximal marginal relevance search (MMR)\n", + "Maximal marginal relevance optimizes for similarity to the query AND diversity among the selected documents.\n", + "\n" + ], + "metadata": { + "id": "IPhxeqGr7sOS" + } + }, + { + "cell_type": "code", + "source": [ + "# By default this returns the 4 documents most relevant to the query\n", + "docs = await db.amax_marginal_relevance_search(query)\n", + "print(docs)" + ], + "metadata": { + "id": "zmnGOrTT71BF" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "### Indexing\n", + "Set custom indexes or rebuild existing ones." + ], + "metadata": { + "id": "_K68SOsq73Tc" + } + }, + { + "cell_type": "code", + "source": [ + "# areindex returns None once the index has been created or rebuilt.\n", + "index = HNSWIndex()\n", + "await db.areindex(index)" + ], + "metadata": { + "id": "aZdo-WVM77I7" + }, + "execution_count": null, + "outputs": [] + } + ], + "metadata": { + "colab": { + "provenance": [], + "toc_visible": true + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.0" + } + }, + "nbformat": 4, + "nbformat_minor": 0 + } \ No newline at end of file diff --git 
a/tests/test_cloudsql_vectorstore.py b/tests/test_cloudsql_vectorstore.py index 7f458d0..6eb95d3 100644 --- a/tests/test_cloudsql_vectorstore.py +++ b/tests/test_cloudsql_vectorstore.py @@ -32,10 +32,10 @@ IVFFlatIndex, ) -PROJECT_ID = "starter-akitsch" -INSTANCE = "pg15-pgvector-demo" -DATABASE = "assistantdemo" -REGION = "us-west2" +PROJECT_ID = os.environ.get("PROJECT_ID") +INSTANCE = os.environ.get("INSTANCE_ID") +DATABASE = os.environ.get("DATABASE_ID") +REGION = os.environ.get("REGION") ADA_TOKEN_COUNT = 768 DEFAULT_TABLE = "test_table" CUSTOM_COL = "page" @@ -52,7 +52,8 @@ class FakeEmbeddingsWithAdaDimension(FakeEmbeddings): def embed_documents(self, texts: List[str]) -> List[List[float]]: """Return simple embeddings.""" return [ - [float(1.0)] * (ADA_TOKEN_COUNT - 1) + [float(i)] for i in range(len(texts)) + [float(1.0)] * (ADA_TOKEN_COUNT - 1) + [float(i)] + for i in range(len(texts)) ] def embed_query(self, text: str = "default") -> List[float]: @@ -232,7 +233,9 @@ async def test_with_metadatas_with_scores( metadata_columns=["page"], ) output = await vs.asimilarity_search_with_score("foo", k=1) - assert output == [(Document(page_content="foo", metadata={"page": "0"}), 0.0)] + assert output == [ + (Document(page_content="foo", metadata={"page": "0"}), 0.0) + ] async def test_with_filter_match(self, engine_custom) -> None: """Test end to end construction and search.""" @@ -246,8 +249,12 @@ async def test_with_filter_match(self, engine_custom) -> None: engine=engine_custom, metadata_columns=["page"], ) - output = await vs.asimilarity_search_with_score("foo", k=1, filter="page = '0'") - assert output == [(Document(page_content="foo", metadata={"page": "0"}), 0.0)] + output = await vs.asimilarity_search_with_score( + "foo", k=1, filter="page = '0'" + ) + assert output == [ + (Document(page_content="foo", metadata={"page": "0"}), 0.0) + ] async def test_with_filter_distant_match( self, @@ -264,7 +271,9 @@ async def test_with_filter_distant_match( engine=engine_custom, metadata_columns=["page"], ) - output = await vs.asimilarity_search_with_score("foo", k=1, filter="page = '2'") + output = await vs.asimilarity_search_with_score( + "foo", k=1, filter="page = '2'" + ) assert output == [ ( Document(page_content="baz", metadata={"page": "2"}), @@ -287,7 +296,9 @@ async def test_with_filter_no_match( engine=engine_custom, metadata_columns=["page"], ) - output = await vs.asimilarity_search_with_score("foo", k=1, filter="page = '5'") + output = await vs.asimilarity_search_with_score( + "foo", k=1, filter="page = '5'" + ) assert output == [] async def test_relevance_score(self, engine_custom) -> None: @@ -360,7 +371,9 @@ async def test_max_marginal_relevance_search_amenities( embedding_service=embeddings_service, engine=engine_custom, ) - output = await vs.amax_marginal_relevance_search("coffee", k=1, fetch_k=3) + output = await vs.amax_marginal_relevance_search( + "coffee", k=1, fetch_k=3 + ) assert "coffee" in output[0].page_content @@ -392,7 +405,9 @@ async def test_applyindex(self, vs) -> None: await vs.aapply_index(index) async def test_applyindex_l2(self, vs) -> None: - index = HNSWIndex(name="hnswl2", distance_strategy=DistanceStrategy.EUCLIDEAN) + index = HNSWIndex( + name="hnswl2", distance_strategy=DistanceStrategy.EUCLIDEAN + ) await vs.aapply_index(index) async def test_applyindex_ip(self, vs) -> None: From 37a0ec271e93bd2d0d3ebf9adfff540da9fac82c Mon Sep 17 00:00:00 2001 From: Averi Kitsch Date: Thu, 8 Feb 2024 21:02:53 -0800 Subject: [PATCH 7/9] fix --- docs/vector_store.ipynb | 
1302 ++++++++--- integration.cloudbuild.yaml | 10 + .../cloudsql_vectorstore.py | 15 +- tests/test_cloudsql_vectorstore.py | 41 +- 4 files changed, 679 insertions(+), 689 deletions(-) diff --git a/docs/vector_store.ipynb b/docs/vector_store.ipynb index 8d49e5f..2acb762 100644 --- a/docs/vector_store.ipynb +++ b/docs/vector_store.ipynb @@ -1,654 +1,654 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "E_RJy7C1bpCT" - }, - "source": [ - "# CloudSQLVectorStore\n", - "> **CloudSQLVectorStore**:\n", - "CloudSQLVectorStore lets you create vector stores on the Cloud SQL for PostgreSQL database. It also supports semantic search, using vector indexes for fast approximate results or brute force for exact results.\n", - "\n", - "\n", - "This tutorial illustrates how to work with an end-to-end data and embedding management system in LangChain and provides scalable semantic search in Cloud SQL for PostgreSQL." - ] - }, - { - "cell_type": "markdown", - "source": [ - "###Pre-requisites" - ], - "metadata": { - "id": "xjcxaw6--Xyy" - } - }, - { - "cell_type": "markdown", - "metadata": { - "id": "IR54BmgvdHT_" - }, - "source": [ - "### Install the library" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 1000 + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "E_RJy7C1bpCT" + }, + "source": [ + "# CloudSQLVectorStore\n", + "> **CloudSQLVectorStore**:\n", + "CloudSQLVectorStore lets you create vector stores on the Cloud SQL for PostgreSQL database. It also supports semantic search, using vector indexes for fast approximate results or brute force for exact results.\n", + "\n", + "\n", + "This tutorial illustrates how to work with an end-to-end data and embedding management system in LangChain and provides scalable semantic search in Cloud SQL for PostgreSQL."
+ ] + }, + { + "cell_type": "markdown", + "source": [ + "###Pre-requisites" + ], + "metadata": { + "id": "xjcxaw6--Xyy" + } + }, + { + "cell_type": "markdown", + "metadata": { + "id": "IR54BmgvdHT_" + }, + "source": [ + "### Install the library" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000 + }, + "id": "0ZITIDE160OD", + "outputId": "90e0636e-ff34-4e1e-ad37-d2a6db4a317e" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Collecting langchain\n", + " Downloading langchain-0.1.5-py3-none-any.whl (806 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m806.7/806.7 kB\u001b[0m \u001b[31m11.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting langchain-community\n", + " Downloading langchain_community-0.0.18-py3-none-any.whl (1.6 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.6/1.6 MB\u001b[0m \u001b[31m39.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting google-cloud\n", + " Downloading google_cloud-0.34.0-py2.py3-none-any.whl (1.8 kB)\n", + "Requirement already satisfied: google-cloud-aiplatform in /usr/local/lib/python3.10/dist-packages (1.39.0)\n", + "Collecting google-cloud-aiplatform\n", + " Downloading google_cloud_aiplatform-1.40.0-py2.py3-none-any.whl (3.4 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m3.4/3.4 MB\u001b[0m \u001b[31m56.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting asyncio\n", + " Downloading asyncio-3.4.3-py3-none-any.whl (101 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m101.8/101.8 kB\u001b[0m \u001b[31m12.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting asyncpg\n", + " Downloading asyncpg-0.29.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.7 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.7/2.7 MB\u001b[0m \u001b[31m54.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: PyYAML>=5.3 in /usr/local/lib/python3.10/dist-packages (from langchain) (6.0.1)\n", + "Requirement already satisfied: SQLAlchemy<3,>=1.4 in /usr/local/lib/python3.10/dist-packages (from langchain) (2.0.25)\n", + "Requirement already satisfied: aiohttp<4.0.0,>=3.8.3 in /usr/local/lib/python3.10/dist-packages (from langchain) (3.9.3)\n", + "Requirement already satisfied: async-timeout<5.0.0,>=4.0.0 in /usr/local/lib/python3.10/dist-packages (from langchain) (4.0.3)\n", + "Collecting dataclasses-json<0.7,>=0.5.7 (from langchain)\n", + " Downloading dataclasses_json-0.6.4-py3-none-any.whl (28 kB)\n", + "Collecting jsonpatch<2.0,>=1.33 (from langchain)\n", + " Downloading jsonpatch-1.33-py2.py3-none-any.whl (12 kB)\n", + "Collecting langchain-core<0.2,>=0.1.16 (from langchain)\n", + " Downloading langchain_core-0.1.19-py3-none-any.whl (238 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m238.5/238.5 kB\u001b[0m \u001b[31m30.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting langsmith<0.1,>=0.0.83 (from langchain)\n", + " Downloading langsmith-0.0.87-py3-none-any.whl (55 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m55.4/55.4 kB\u001b[0m \u001b[31m7.3 MB/s\u001b[0m eta 
\u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: numpy<2,>=1 in /usr/local/lib/python3.10/dist-packages (from langchain) (1.23.5)\n", + "Requirement already satisfied: pydantic<3,>=1 in /usr/local/lib/python3.10/dist-packages (from langchain) (2.6.0)\n", + "Requirement already satisfied: requests<3,>=2 in /usr/local/lib/python3.10/dist-packages (from langchain) (2.31.0)\n", + "Requirement already satisfied: tenacity<9.0.0,>=8.1.0 in /usr/local/lib/python3.10/dist-packages (from langchain) (8.2.3)\n", + "Requirement already satisfied: google-api-core[grpc]!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.*,!=2.4.*,!=2.5.*,!=2.6.*,!=2.7.*,<3.0.0dev,>=1.32.0 in /usr/local/lib/python3.10/dist-packages (from google-cloud-aiplatform) (2.11.1)\n", + "Requirement already satisfied: proto-plus<2.0.0dev,>=1.22.0 in /usr/local/lib/python3.10/dist-packages (from google-cloud-aiplatform) (1.23.0)\n", + "Requirement already satisfied: protobuf!=3.20.0,!=3.20.1,!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<5.0.0dev,>=3.19.5 in /usr/local/lib/python3.10/dist-packages (from google-cloud-aiplatform) (3.20.3)\n", + "Requirement already satisfied: packaging>=14.3 in /usr/local/lib/python3.10/dist-packages (from google-cloud-aiplatform) (23.2)\n", + "Requirement already satisfied: google-cloud-storage<3.0.0dev,>=1.32.0 in /usr/local/lib/python3.10/dist-packages (from google-cloud-aiplatform) (2.8.0)\n", + "Requirement already satisfied: google-cloud-bigquery<4.0.0dev,>=1.15.0 in /usr/local/lib/python3.10/dist-packages (from google-cloud-aiplatform) (3.12.0)\n", + "Requirement already satisfied: google-cloud-resource-manager<3.0.0dev,>=1.3.3 in /usr/local/lib/python3.10/dist-packages (from google-cloud-aiplatform) (1.12.0)\n", + "Requirement already satisfied: shapely<3.0.0dev in /usr/local/lib/python3.10/dist-packages (from google-cloud-aiplatform) (2.0.2)\n", + "Requirement already satisfied: aiosignal>=1.1.2 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain) (1.3.1)\n", + "Requirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain) (23.2.0)\n", + "Requirement already satisfied: frozenlist>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain) (1.4.1)\n", + "Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain) (6.0.5)\n", + "Requirement already satisfied: yarl<2.0,>=1.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain) (1.9.4)\n", + "Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain)\n", + " Downloading marshmallow-3.20.2-py3-none-any.whl (49 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m49.4/49.4 kB\u001b[0m \u001b[31m5.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting typing-inspect<1,>=0.4.0 (from dataclasses-json<0.7,>=0.5.7->langchain)\n", + " Downloading typing_inspect-0.9.0-py3-none-any.whl (8.8 kB)\n", + "Requirement already satisfied: googleapis-common-protos<2.0.dev0,>=1.56.2 in /usr/local/lib/python3.10/dist-packages (from google-api-core[grpc]!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.*,!=2.4.*,!=2.5.*,!=2.6.*,!=2.7.*,<3.0.0dev,>=1.32.0->google-cloud-aiplatform) (1.62.0)\n", + "Requirement already satisfied: google-auth<3.0.dev0,>=2.14.1 in /usr/local/lib/python3.10/dist-packages (from 
google-api-core[grpc]!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.*,!=2.4.*,!=2.5.*,!=2.6.*,!=2.7.*,<3.0.0dev,>=1.32.0->google-cloud-aiplatform) (2.17.3)\n", + "Requirement already satisfied: grpcio<2.0dev,>=1.33.2 in /usr/local/lib/python3.10/dist-packages (from google-api-core[grpc]!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.*,!=2.4.*,!=2.5.*,!=2.6.*,!=2.7.*,<3.0.0dev,>=1.32.0->google-cloud-aiplatform) (1.60.1)\n", + "Requirement already satisfied: grpcio-status<2.0.dev0,>=1.33.2 in /usr/local/lib/python3.10/dist-packages (from google-api-core[grpc]!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.*,!=2.4.*,!=2.5.*,!=2.6.*,!=2.7.*,<3.0.0dev,>=1.32.0->google-cloud-aiplatform) (1.48.2)\n", + "Requirement already satisfied: google-cloud-core<3.0.0dev,>=1.6.0 in /usr/local/lib/python3.10/dist-packages (from google-cloud-bigquery<4.0.0dev,>=1.15.0->google-cloud-aiplatform) (2.3.3)\n", + "Requirement already satisfied: google-resumable-media<3.0dev,>=0.6.0 in /usr/local/lib/python3.10/dist-packages (from google-cloud-bigquery<4.0.0dev,>=1.15.0->google-cloud-aiplatform) (2.7.0)\n", + "Requirement already satisfied: python-dateutil<3.0dev,>=2.7.2 in /usr/local/lib/python3.10/dist-packages (from google-cloud-bigquery<4.0.0dev,>=1.15.0->google-cloud-aiplatform) (2.8.2)\n", + "Requirement already satisfied: grpc-google-iam-v1<1.0.0dev,>=0.12.4 in /usr/local/lib/python3.10/dist-packages (from google-cloud-resource-manager<3.0.0dev,>=1.3.3->google-cloud-aiplatform) (0.13.0)\n", + "Collecting jsonpointer>=1.9 (from jsonpatch<2.0,>=1.33->langchain)\n", + " Downloading jsonpointer-2.4-py2.py3-none-any.whl (7.8 kB)\n", + "Requirement already satisfied: anyio<5,>=3 in /usr/local/lib/python3.10/dist-packages (from langchain-core<0.2,>=0.1.16->langchain) (3.7.1)\n", + "Requirement already satisfied: annotated-types>=0.4.0 in /usr/local/lib/python3.10/dist-packages (from pydantic<3,>=1->langchain) (0.6.0)\n", + "Requirement already satisfied: pydantic-core==2.16.1 in /usr/local/lib/python3.10/dist-packages (from pydantic<3,>=1->langchain) (2.16.1)\n", + "Requirement already satisfied: typing-extensions>=4.6.1 in /usr/local/lib/python3.10/dist-packages (from pydantic<3,>=1->langchain) (4.9.0)\n", + "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests<3,>=2->langchain) (3.3.2)\n", + "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests<3,>=2->langchain) (3.6)\n", + "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests<3,>=2->langchain) (2.0.7)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests<3,>=2->langchain) (2024.2.2)\n", + "Requirement already satisfied: greenlet!=0.4.17 in /usr/local/lib/python3.10/dist-packages (from SQLAlchemy<3,>=1.4->langchain) (3.0.3)\n", + "Requirement already satisfied: sniffio>=1.1 in /usr/local/lib/python3.10/dist-packages (from anyio<5,>=3->langchain-core<0.2,>=0.1.16->langchain) (1.3.0)\n", + "Requirement already satisfied: exceptiongroup in /usr/local/lib/python3.10/dist-packages (from anyio<5,>=3->langchain-core<0.2,>=0.1.16->langchain) (1.2.0)\n", + "Requirement already satisfied: cachetools<6.0,>=2.0.0 in /usr/local/lib/python3.10/dist-packages (from google-auth<3.0.dev0,>=2.14.1->google-api-core[grpc]!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.*,!=2.4.*,!=2.5.*,!=2.6.*,!=2.7.*,<3.0.0dev,>=1.32.0->google-cloud-aiplatform) (5.3.2)\n", + "Requirement already satisfied: pyasn1-modules>=0.2.1 in 
/usr/local/lib/python3.10/dist-packages (from google-auth<3.0.dev0,>=2.14.1->google-api-core[grpc]!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.*,!=2.4.*,!=2.5.*,!=2.6.*,!=2.7.*,<3.0.0dev,>=1.32.0->google-cloud-aiplatform) (0.3.0)\n", + "Requirement already satisfied: six>=1.9.0 in /usr/local/lib/python3.10/dist-packages (from google-auth<3.0.dev0,>=2.14.1->google-api-core[grpc]!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.*,!=2.4.*,!=2.5.*,!=2.6.*,!=2.7.*,<3.0.0dev,>=1.32.0->google-cloud-aiplatform) (1.16.0)\n", + "Requirement already satisfied: rsa<5,>=3.1.4 in /usr/local/lib/python3.10/dist-packages (from google-auth<3.0.dev0,>=2.14.1->google-api-core[grpc]!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.*,!=2.4.*,!=2.5.*,!=2.6.*,!=2.7.*,<3.0.0dev,>=1.32.0->google-cloud-aiplatform) (4.9)\n", + "Requirement already satisfied: google-crc32c<2.0dev,>=1.0 in /usr/local/lib/python3.10/dist-packages (from google-resumable-media<3.0dev,>=0.6.0->google-cloud-bigquery<4.0.0dev,>=1.15.0->google-cloud-aiplatform) (1.5.0)\n", + "Collecting mypy-extensions>=0.3.0 (from typing-inspect<1,>=0.4.0->dataclasses-json<0.7,>=0.5.7->langchain)\n", + " Downloading mypy_extensions-1.0.0-py3-none-any.whl (4.7 kB)\n", + "Requirement already satisfied: pyasn1<0.6.0,>=0.4.6 in /usr/local/lib/python3.10/dist-packages (from pyasn1-modules>=0.2.1->google-auth<3.0.dev0,>=2.14.1->google-api-core[grpc]!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.*,!=2.4.*,!=2.5.*,!=2.6.*,!=2.7.*,<3.0.0dev,>=1.32.0->google-cloud-aiplatform) (0.5.1)\n", + "Installing collected packages: google-cloud, asyncio, mypy-extensions, marshmallow, jsonpointer, asyncpg, typing-inspect, jsonpatch, langsmith, dataclasses-json, langchain-core, langchain-community, langchain, google-cloud-aiplatform\n", + "\u001b[33m WARNING: The script langsmith is installed in '/root/.local/bin' which is not on PATH.\n", + " Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\u001b[0m\u001b[33m\n", + "\u001b[0m\u001b[33m WARNING: The script langchain-server is installed in '/root/.local/bin' which is not on PATH.\n", + " Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\u001b[0m\u001b[33m\n", + "\u001b[0m\u001b[33m WARNING: The script tb-gcp-uploader is installed in '/root/.local/bin' which is not on PATH.\n", + " Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\u001b[0m\u001b[33m\n", + "\u001b[0mSuccessfully installed asyncio-3.4.3 asyncpg-0.29.0 dataclasses-json-0.6.4 google-cloud-0.34.0 google-cloud-aiplatform-1.40.0 jsonpatch-1.33 jsonpointer-2.4 langchain-0.1.5 langchain-community-0.0.18 langchain-core-0.1.19 langsmith-0.0.87 marshmallow-3.20.2 mypy-extensions-1.0.0 typing-inspect-0.9.0\n" + ] + }, + { + "output_type": "display_data", + "data": { + "application/vnd.colab-display-data+json": { + "pip_warning": { + "packages": [ + "asyncio", + "google" + ] + } + } + }, + "metadata": {} + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Collecting cloud-sql-python-connector[asyncpg]\n", + " Downloading cloud_sql_python_connector-1.6.0-py2.py3-none-any.whl (35 kB)\n", + "Requirement already satisfied: aiohttp in /usr/local/lib/python3.10/dist-packages (from cloud-sql-python-connector[asyncpg]) (3.9.3)\n", + "Requirement already satisfied: cryptography>=38.0.3 in /usr/local/lib/python3.10/dist-packages (from cloud-sql-python-connector[asyncpg]) (42.0.2)\n", + "Requirement already satisfied: Requests in 
/usr/local/lib/python3.10/dist-packages (from cloud-sql-python-connector[asyncpg]) (2.31.0)\n", + "Requirement already satisfied: google-auth in /usr/local/lib/python3.10/dist-packages (from cloud-sql-python-connector[asyncpg]) (2.17.3)\n", + "Requirement already satisfied: asyncpg>=0.29.0 in /root/.local/lib/python3.10/site-packages (from cloud-sql-python-connector[asyncpg]) (0.29.0)\n", + "Requirement already satisfied: async-timeout>=4.0.3 in /usr/local/lib/python3.10/dist-packages (from asyncpg>=0.29.0->cloud-sql-python-connector[asyncpg]) (4.0.3)\n", + "Requirement already satisfied: cffi>=1.12 in /usr/local/lib/python3.10/dist-packages (from cryptography>=38.0.3->cloud-sql-python-connector[asyncpg]) (1.16.0)\n", + "Requirement already satisfied: aiosignal>=1.1.2 in /usr/local/lib/python3.10/dist-packages (from aiohttp->cloud-sql-python-connector[asyncpg]) (1.3.1)\n", + "Requirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->cloud-sql-python-connector[asyncpg]) (23.2.0)\n", + "Requirement already satisfied: frozenlist>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from aiohttp->cloud-sql-python-connector[asyncpg]) (1.4.1)\n", + "Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.10/dist-packages (from aiohttp->cloud-sql-python-connector[asyncpg]) (6.0.5)\n", + "Requirement already satisfied: yarl<2.0,>=1.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->cloud-sql-python-connector[asyncpg]) (1.9.4)\n", + "Requirement already satisfied: cachetools<6.0,>=2.0.0 in /usr/local/lib/python3.10/dist-packages (from google-auth->cloud-sql-python-connector[asyncpg]) (5.3.2)\n", + "Requirement already satisfied: pyasn1-modules>=0.2.1 in /usr/local/lib/python3.10/dist-packages (from google-auth->cloud-sql-python-connector[asyncpg]) (0.3.0)\n", + "Requirement already satisfied: six>=1.9.0 in /usr/local/lib/python3.10/dist-packages (from google-auth->cloud-sql-python-connector[asyncpg]) (1.16.0)\n", + "Requirement already satisfied: rsa<5,>=3.1.4 in /usr/local/lib/python3.10/dist-packages (from google-auth->cloud-sql-python-connector[asyncpg]) (4.9)\n", + "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from Requests->cloud-sql-python-connector[asyncpg]) (3.3.2)\n", + "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from Requests->cloud-sql-python-connector[asyncpg]) (3.6)\n", + "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from Requests->cloud-sql-python-connector[asyncpg]) (2.0.7)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from Requests->cloud-sql-python-connector[asyncpg]) (2024.2.2)\n", + "Requirement already satisfied: pycparser in /usr/local/lib/python3.10/dist-packages (from cffi>=1.12->cryptography>=38.0.3->cloud-sql-python-connector[asyncpg]) (2.21)\n", + "Requirement already satisfied: pyasn1<0.6.0,>=0.4.6 in /usr/local/lib/python3.10/dist-packages (from pyasn1-modules>=0.2.1->google-auth->cloud-sql-python-connector[asyncpg]) (0.5.1)\n", + "Installing collected packages: cloud-sql-python-connector\n", + "Successfully installed cloud-sql-python-connector-1.6.0\n" + ] + }, + { + "output_type": "display_data", + "data": { + "application/vnd.colab-display-data+json": { + "pip_warning": { + "packages": [ + "google" + ] + } + } + }, + "metadata": {} + } + ], + "source": [ + "! 
pip install langchain langchain-community google-cloud google-cloud-aiplatform asyncio asyncpg --upgrade --user\n", + "! pip install \"cloud-sql-python-connector[asyncpg]\"" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "v40bB_GMcr9f" + }, + "source": [ + "**Colab only:** Uncomment the following cell to restart the kernel, or use the button to restart the kernel. For Vertex AI Workbench, you can restart the terminal using the button on top." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "6o0iGVIdDD6K" + }, + "outputs": [], + "source": [ + "# # Automatically restart kernel after installs so that your environment can access the new packages\n", + "# import IPython\n", + "\n", + "# app = IPython.Application.instance()\n", + "# app.kernel.do_shutdown(True)" + ] + }, + { + "cell_type": "markdown", + "source": [ + "### Note\n", + "\n", + "If you do not have a Google Cloud project, follow the link below to create one.\n", + "\n", + "[Create a Google Cloud project](https://developers.google.com/workspace/guides/create-project)\n" + ], + "metadata": { + "id": "cTXTbj4UltKf" + } + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Uj02bMRAc9_c" + }, + "source": [ + "#### Set your project ID\n", + "\n", + "If you don't know your project ID, try the following:\n", + "* Run `gcloud config list`.\n", + "* Run `gcloud projects list`.\n", + "* See the support page: [Locate the project ID](https://support.google.com/googleapi/answer/7014113)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "wnp1R1PYc9_c", + "outputId": "6502c721-a2fd-451f-b946-9f7b850d5966" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Updated property [core/project].\n" + ] + } + ], + "source": [ + "# @title Project { display-mode: \"form\" }\n", + "PROJECT_ID = \"gcp_project_id\" # @param {type:\"string\"}\n", + "\n", + "# Set the project id\n", + "! gcloud config set project {PROJECT_ID}" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "38OFiUrIc9_c" + }, + "source": [ + "#### Set the region\n", + "\n", + "You can also change the `REGION` variable used by CloudSQL Postgres. Learn more about [CloudSQL Postgres regions](https://cloud.google.com/sql/docs/postgres/locations)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "DWQxsk80c9_d" + }, + "outputs": [], + "source": [ + "# @title Region { display-mode: \"form\" }\n", + "REGION = \"US\" # @param {type: \"string\"}" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "aG5_tNwHc9_d" + }, + "source": [ + "#### Set the instance, database, and table names\n", + "\n", + "These identify the CloudSQL Postgres instance, database, and table that will back your vector store."
+ ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "id": "F8gPQnbDc9_d" + }, + "outputs": [], + "source": [ + "# @title Instance, Database and Table { display-mode: \"form\" }\n", + "INSTANCE = \"my_cloudsql_instance\" # @param {type: \"string\"}\n", + "DATABASE = \"my_langchain_database\" # @param {type: \"string\"}\n", + "TABLE = \"doc_and_vectors\" # @param {type: \"string\"}" + ] + }, + { + "cell_type": "markdown", + "source": [ + "### Prerequisites for connecting to the CloudSQL instance\n", + "\n", + "To connect to the PostgreSQL instance, set up the Cloud SQL Auth Proxy and add your IAM users to the list of users authorized to connect to the instance.\n", + "\n", + "Refer to this [link](https://github.com/GoogleCloudPlatform/cloud-sql-proxy) to set up the Auth Proxy.\n", + "\n", + "Refer to this [link](https://cloud.google.com/sql/docs/postgres/users) to add users to the instance." + ], + "metadata": { + "id": "W6wxYasx_EKB" + } + }, + { + "cell_type": "markdown", + "metadata": { + "id": "w7JEEj49c9_d" + }, + "source": [ + "### Authenticating your notebook environment\n", + "\n", + "- If you are using **Colab** to run this notebook, uncomment the cell below and continue.\n", + "- If you are using **Vertex AI Workbench**, check out the setup instructions [here](https://github.com/GoogleCloudPlatform/generative-ai/tree/main/setup-env)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "1JZmXFavc9_d" + }, + "outputs": [], + "source": [ + "from google.colab import auth as google_auth\n", + "\n", + "google_auth.authenticate_user()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "AD3yG49BdLlr" + }, + "source": [ + "## Demo: CloudSQL Postgres VectorSearch" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "vMi7sXhtc9_e" + }, + "source": [ + "### Create an embedding class instance\n", + "\n", + "You may need to enable the Vertex AI API in your project by running\n", + "`gcloud services enable aiplatform.googleapis.com --project {PROJECT_ID}`\n", + "(replace `{PROJECT_ID}` with the name of your project).\n", + "\n", + "You can use any [LangChain embeddings model](https://python.langchain.com/docs/integrations/text_embedding/)."
+ ] + }, + { + "cell_type": "code", + "source": [ + "# Importing the necessary libraries\n", + "from langchain_community.vectorstores.cloudSQL import CloudSQLVectorStore\n", + "from langchain_community.vectorstores.cloudSQL import PostgreSQLEngine\n", + "from langchain_community.vectorstores.cloudSQL import HNSWIndex" + ], + "metadata": { + "id": "TuH9AOl58bAs" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Vb2RJocV9_LQ" + }, + "outputs": [], + "source": [ + "from langchain_community.embeddings import VertexAIEmbeddings\n", + "\n", + "embedding = VertexAIEmbeddings(\n", + " model_name=\"textembedding-gecko@latest\", project=PROJECT_ID\n", + ")" + ] + }, + { + "cell_type": "markdown", + "source": [ + "### Create PostgreSQLEngine to connect to the database" + ], + "metadata": { + "id": "D9Xs2qhm6X56" + } + }, + { + "cell_type": "code", + "source": [ + "# CloudSQLVectorStore requires an engine created with the PostgreSQLEngine class\n", + "engine = PostgreSQLEngine.from_instance(\n", + " region=\"region_name\", instance=\"instance_name\", database=\"dbname\"\n", + ")" + ], + "metadata": { + "id": "avlyHEMn6gzU" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "### Create a CloudSQLVectorStore and initialize the table" + ], + "metadata": { + "id": "e1tl0aNx7SWy" + } + }, + { + "cell_type": "code", + "source": [ + "# Creating a basic CloudSQLVectorStore object\n", + "db = CloudSQLVectorStore(\n", + " engine=engine, table_name=\"table_name\", embedding_service=embedding\n", + ")\n", + "\n", + "# Alternatively, a non-default vector store can be created by tweaking the following args:\n", + "# vector_size - Defaults to 768; set it to match the output size of your embedding model.\n", + "# content_column - Defaults to 'content'; can be set to any column name of choice.\n", + "# embedding_column - Defaults to 'embedding'; can be set to any column name of choice.\n", + "# metadata_columns - Defaults to 'metadata'; accepts a single name or a list of names.\n", + "# ignore_metadata_columns - Defaults to None; accepts a single name or a list of names.\n", + "# index_query_options - Defaults to None; set with HNSWIndex.QueryOptions() or IVFFlatIndex.QueryOptions().\n", + "# index - Defaults to an HNSWIndex object; can be an IVFFlatIndex or BruteForce object instead.\n", + "# distance_strategy - Defaults to 'L2'; can be set to 'INNER PRODUCT' or 'COSINE'.\n", + "# overwrite_existing - Defaults to False; set to True to truncate the table before inserting data.\n",
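+ "\n", + "# Illustrative sketch only: the column names, metadata columns, and index choice\n", + "# below are placeholder values, not library defaults, and IVFFlatIndex would need\n", + "# to be imported alongside HNSWIndex above.\n", + "# db_custom = CloudSQLVectorStore(\n", + "#     engine=engine,\n", + "#     table_name=\"table_name\",\n", + "#     embedding_service=embedding,\n", + "#     vector_size=768,\n", + "#     content_column=\"my_content\",\n", + "#     embedding_column=\"my_embedding\",\n", + "#     metadata_columns=[\"len\"],\n", + "#     index=IVFFlatIndex(),\n", + "#     distance_strategy=\"COSINE\",\n", + "#     overwrite_existing=True,\n", + "# )"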
+ ],
 "metadata": {
 "id": "z-AZyzAQ7bsf"
 },
 "execution_count": null,
 "outputs": []
 },
 {
 "cell_type": "markdown",
 "metadata": {
 "id": "PeOMpftjc9_e"
 },
 "source": [
 "### Add texts\n",
 "This method embeds the given texts and inserts them into the table."
 ]
 },
 {
 "cell_type": "code",
 "execution_count": null,
 "metadata": {
 "id": "cwvi_O5Wc9_e"
 },
 "outputs": [],
 "source": [
 "texts = [\"Apples and oranges\", \"Cars and airplanes\", \"Pineapple\", \"Train\", \"Banana\"]\n",
 "metadatas = [{\"len\": len(t)} for t in texts]\n",
 "await db.add_texts(texts=texts, metadatas=metadatas)"
 ]
 },
 {
 "cell_type": "markdown",
 "metadata": {
 "id": "kSkL9l1Hc9_e"
 },
 "source": [
 "### Search for documents\n",
 "The default distance strategy used for querying similar documents is L2."
 ]
 },
 {
 "cell_type": "code",
 "execution_count": null,
 "metadata": {
 "id": "Q4pCL2I_c9_f"
 },
 "outputs": [],
 "source": [
 "query = \"I'd like a fruit.\"\n",
 "docs = await db.asimilarity_search(query)\n",
 "print(docs)"
 ]
 },
 {
 "cell_type": "markdown",
 "metadata": {
 "id": "5R6h0_Cvc9_f"
 },
 "source": [
 "### Search for documents by vector\n",
 "Search for similar documents by passing an embedding vector instead of a text query."
 ]
 },
 {
 "cell_type": "code",
 "execution_count": null,
 "metadata": {
 "id": "NGNdS7cqc9_f"
 },
 "outputs": [],
 "source": [
 "query_vector = embedding.embed_query(query)\n",
 "docs = await db.asimilarity_search_by_vector(query_vector, k=2)\n",
 "print(docs)"
 ]
 },
 {
 "cell_type": "markdown",
 "metadata": {
 "id": "yKw_Lab-c9_f"
 },
 "source": [
 "### Search for documents with a metadata filter\n",
 "Apply an additional filter on document metadata."
 ]
 },
 {
 "cell_type": "code",
 "execution_count": null,
 "metadata": {
 "id": "uyYDfbMKc9_f"
 },
 "outputs": [],
 "source": [
 "# This should only return the \"Banana\" document.\n",
 "docs = await db.asimilarity_search_by_vector(query_vector, filter={\"len\": 6})\n",
 "print(docs)"
 ]
 },
 {
 "cell_type": "markdown",
 "source": [
 "### Maximal Marginal Relevance search (MMR)\n",
 "Maximal marginal relevance optimizes for similarity to the query AND diversity among the selected documents."
 ],
 "metadata": {
 "id": "IPhxeqGr7sOS"
 }
 },
 {
 "cell_type": "code",
 "source": [
 "# This should return the top 4 documents relevant to the given query,\n",
 "# reranked for diversity.\n",
 "docs = await db.amax_marginal_relevance_search(query)\n",
 "print(docs)"
 ],
 "metadata": {
 "id": "zmnGOrTT71BF"
 },
 "execution_count": null,
 "outputs": []
 },
 {
 "cell_type": "markdown",
 "source": [
 "### Indexing\n",
 "Set a custom index or rebuild an existing one."
 ],
 "metadata": {
 "id": "_K68SOsq73Tc"
 }
 },
 {
 "cell_type": "code",
 "source": [
 "# areindex returns None once the index has been created or rebuilt.\n",
 "index = HNSWIndex()\n",
 "await db.areindex(index)"
 ],
 "metadata": {
 "id": "aZdo-WVM77I7"
 },
 "execution_count": null,
 "outputs": []
 }
 ],
 "metadata": {
 "colab": {
 "provenance": [],
 "toc_visible": true
 },
 "kernelspec": {
 "display_name": "Python 3",
 "name": "python3"
 },
 "language_info": {
 "codemirror_mode": {
 "name": "ipython",
 "version": 3
 },
 "file_extension": ".py",
 "mimetype": "text/x-python",
 "name": "python",
 "nbconvert_exporter": "python",
 "pygments_lexer": "ipython3",
 "version": "3.11.0"
 }
 },
-    "id": "0ZITIDE160OD",
-    "outputId": "90e0636e-ff34-4e1e-ad37-d2a6db4a317e"
-   },
-   "outputs": [
-    {
-     "output_type": "stream",
-
"name": "stdout", - "text": [ - "Collecting langchain\n", - " Downloading langchain-0.1.5-py3-none-any.whl (806 kB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m806.7/806.7 kB\u001b[0m \u001b[31m11.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hCollecting langchain-community\n", - " Downloading langchain_community-0.0.18-py3-none-any.whl (1.6 MB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.6/1.6 MB\u001b[0m \u001b[31m39.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hCollecting google-cloud\n", - " Downloading google_cloud-0.34.0-py2.py3-none-any.whl (1.8 kB)\n", - "Requirement already satisfied: google-cloud-aiplatform in /usr/local/lib/python3.10/dist-packages (1.39.0)\n", - "Collecting google-cloud-aiplatform\n", - " Downloading google_cloud_aiplatform-1.40.0-py2.py3-none-any.whl (3.4 MB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m3.4/3.4 MB\u001b[0m \u001b[31m56.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hCollecting asyncio\n", - " Downloading asyncio-3.4.3-py3-none-any.whl (101 kB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m101.8/101.8 kB\u001b[0m \u001b[31m12.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hCollecting asyncpg\n", - " Downloading asyncpg-0.29.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.7 MB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.7/2.7 MB\u001b[0m \u001b[31m54.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hRequirement already satisfied: PyYAML>=5.3 in /usr/local/lib/python3.10/dist-packages (from langchain) (6.0.1)\n", - "Requirement already satisfied: SQLAlchemy<3,>=1.4 in /usr/local/lib/python3.10/dist-packages (from langchain) (2.0.25)\n", - "Requirement already satisfied: aiohttp<4.0.0,>=3.8.3 in /usr/local/lib/python3.10/dist-packages (from langchain) (3.9.3)\n", - "Requirement already satisfied: async-timeout<5.0.0,>=4.0.0 in /usr/local/lib/python3.10/dist-packages (from langchain) (4.0.3)\n", - "Collecting dataclasses-json<0.7,>=0.5.7 (from langchain)\n", - " Downloading dataclasses_json-0.6.4-py3-none-any.whl (28 kB)\n", - "Collecting jsonpatch<2.0,>=1.33 (from langchain)\n", - " Downloading jsonpatch-1.33-py2.py3-none-any.whl (12 kB)\n", - "Collecting langchain-core<0.2,>=0.1.16 (from langchain)\n", - " Downloading langchain_core-0.1.19-py3-none-any.whl (238 kB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m238.5/238.5 kB\u001b[0m \u001b[31m30.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hCollecting langsmith<0.1,>=0.0.83 (from langchain)\n", - " Downloading langsmith-0.0.87-py3-none-any.whl (55 kB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m55.4/55.4 kB\u001b[0m \u001b[31m7.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hRequirement already satisfied: numpy<2,>=1 in /usr/local/lib/python3.10/dist-packages (from langchain) (1.23.5)\n", - "Requirement already satisfied: pydantic<3,>=1 in /usr/local/lib/python3.10/dist-packages (from langchain) (2.6.0)\n", - "Requirement already satisfied: requests<3,>=2 in /usr/local/lib/python3.10/dist-packages (from langchain) (2.31.0)\n", - "Requirement already satisfied: tenacity<9.0.0,>=8.1.0 in /usr/local/lib/python3.10/dist-packages (from langchain) (8.2.3)\n", - 
"Requirement already satisfied: google-api-core[grpc]!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.*,!=2.4.*,!=2.5.*,!=2.6.*,!=2.7.*,<3.0.0dev,>=1.32.0 in /usr/local/lib/python3.10/dist-packages (from google-cloud-aiplatform) (2.11.1)\n", - "Requirement already satisfied: proto-plus<2.0.0dev,>=1.22.0 in /usr/local/lib/python3.10/dist-packages (from google-cloud-aiplatform) (1.23.0)\n", - "Requirement already satisfied: protobuf!=3.20.0,!=3.20.1,!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<5.0.0dev,>=3.19.5 in /usr/local/lib/python3.10/dist-packages (from google-cloud-aiplatform) (3.20.3)\n", - "Requirement already satisfied: packaging>=14.3 in /usr/local/lib/python3.10/dist-packages (from google-cloud-aiplatform) (23.2)\n", - "Requirement already satisfied: google-cloud-storage<3.0.0dev,>=1.32.0 in /usr/local/lib/python3.10/dist-packages (from google-cloud-aiplatform) (2.8.0)\n", - "Requirement already satisfied: google-cloud-bigquery<4.0.0dev,>=1.15.0 in /usr/local/lib/python3.10/dist-packages (from google-cloud-aiplatform) (3.12.0)\n", - "Requirement already satisfied: google-cloud-resource-manager<3.0.0dev,>=1.3.3 in /usr/local/lib/python3.10/dist-packages (from google-cloud-aiplatform) (1.12.0)\n", - "Requirement already satisfied: shapely<3.0.0dev in /usr/local/lib/python3.10/dist-packages (from google-cloud-aiplatform) (2.0.2)\n", - "Requirement already satisfied: aiosignal>=1.1.2 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain) (1.3.1)\n", - "Requirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain) (23.2.0)\n", - "Requirement already satisfied: frozenlist>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain) (1.4.1)\n", - "Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain) (6.0.5)\n", - "Requirement already satisfied: yarl<2.0,>=1.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain) (1.9.4)\n", - "Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain)\n", - " Downloading marshmallow-3.20.2-py3-none-any.whl (49 kB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m49.4/49.4 kB\u001b[0m \u001b[31m5.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hCollecting typing-inspect<1,>=0.4.0 (from dataclasses-json<0.7,>=0.5.7->langchain)\n", - " Downloading typing_inspect-0.9.0-py3-none-any.whl (8.8 kB)\n", - "Requirement already satisfied: googleapis-common-protos<2.0.dev0,>=1.56.2 in /usr/local/lib/python3.10/dist-packages (from google-api-core[grpc]!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.*,!=2.4.*,!=2.5.*,!=2.6.*,!=2.7.*,<3.0.0dev,>=1.32.0->google-cloud-aiplatform) (1.62.0)\n", - "Requirement already satisfied: google-auth<3.0.dev0,>=2.14.1 in /usr/local/lib/python3.10/dist-packages (from google-api-core[grpc]!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.*,!=2.4.*,!=2.5.*,!=2.6.*,!=2.7.*,<3.0.0dev,>=1.32.0->google-cloud-aiplatform) (2.17.3)\n", - "Requirement already satisfied: grpcio<2.0dev,>=1.33.2 in /usr/local/lib/python3.10/dist-packages (from google-api-core[grpc]!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.*,!=2.4.*,!=2.5.*,!=2.6.*,!=2.7.*,<3.0.0dev,>=1.32.0->google-cloud-aiplatform) (1.60.1)\n", - "Requirement already satisfied: grpcio-status<2.0.dev0,>=1.33.2 in /usr/local/lib/python3.10/dist-packages (from 
google-api-core[grpc]!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.*,!=2.4.*,!=2.5.*,!=2.6.*,!=2.7.*,<3.0.0dev,>=1.32.0->google-cloud-aiplatform) (1.48.2)\n", - "Requirement already satisfied: google-cloud-core<3.0.0dev,>=1.6.0 in /usr/local/lib/python3.10/dist-packages (from google-cloud-bigquery<4.0.0dev,>=1.15.0->google-cloud-aiplatform) (2.3.3)\n", - "Requirement already satisfied: google-resumable-media<3.0dev,>=0.6.0 in /usr/local/lib/python3.10/dist-packages (from google-cloud-bigquery<4.0.0dev,>=1.15.0->google-cloud-aiplatform) (2.7.0)\n", - "Requirement already satisfied: python-dateutil<3.0dev,>=2.7.2 in /usr/local/lib/python3.10/dist-packages (from google-cloud-bigquery<4.0.0dev,>=1.15.0->google-cloud-aiplatform) (2.8.2)\n", - "Requirement already satisfied: grpc-google-iam-v1<1.0.0dev,>=0.12.4 in /usr/local/lib/python3.10/dist-packages (from google-cloud-resource-manager<3.0.0dev,>=1.3.3->google-cloud-aiplatform) (0.13.0)\n", - "Collecting jsonpointer>=1.9 (from jsonpatch<2.0,>=1.33->langchain)\n", - " Downloading jsonpointer-2.4-py2.py3-none-any.whl (7.8 kB)\n", - "Requirement already satisfied: anyio<5,>=3 in /usr/local/lib/python3.10/dist-packages (from langchain-core<0.2,>=0.1.16->langchain) (3.7.1)\n", - "Requirement already satisfied: annotated-types>=0.4.0 in /usr/local/lib/python3.10/dist-packages (from pydantic<3,>=1->langchain) (0.6.0)\n", - "Requirement already satisfied: pydantic-core==2.16.1 in /usr/local/lib/python3.10/dist-packages (from pydantic<3,>=1->langchain) (2.16.1)\n", - "Requirement already satisfied: typing-extensions>=4.6.1 in /usr/local/lib/python3.10/dist-packages (from pydantic<3,>=1->langchain) (4.9.0)\n", - "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests<3,>=2->langchain) (3.3.2)\n", - "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests<3,>=2->langchain) (3.6)\n", - "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests<3,>=2->langchain) (2.0.7)\n", - "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests<3,>=2->langchain) (2024.2.2)\n", - "Requirement already satisfied: greenlet!=0.4.17 in /usr/local/lib/python3.10/dist-packages (from SQLAlchemy<3,>=1.4->langchain) (3.0.3)\n", - "Requirement already satisfied: sniffio>=1.1 in /usr/local/lib/python3.10/dist-packages (from anyio<5,>=3->langchain-core<0.2,>=0.1.16->langchain) (1.3.0)\n", - "Requirement already satisfied: exceptiongroup in /usr/local/lib/python3.10/dist-packages (from anyio<5,>=3->langchain-core<0.2,>=0.1.16->langchain) (1.2.0)\n", - "Requirement already satisfied: cachetools<6.0,>=2.0.0 in /usr/local/lib/python3.10/dist-packages (from google-auth<3.0.dev0,>=2.14.1->google-api-core[grpc]!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.*,!=2.4.*,!=2.5.*,!=2.6.*,!=2.7.*,<3.0.0dev,>=1.32.0->google-cloud-aiplatform) (5.3.2)\n", - "Requirement already satisfied: pyasn1-modules>=0.2.1 in /usr/local/lib/python3.10/dist-packages (from google-auth<3.0.dev0,>=2.14.1->google-api-core[grpc]!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.*,!=2.4.*,!=2.5.*,!=2.6.*,!=2.7.*,<3.0.0dev,>=1.32.0->google-cloud-aiplatform) (0.3.0)\n", - "Requirement already satisfied: six>=1.9.0 in /usr/local/lib/python3.10/dist-packages (from google-auth<3.0.dev0,>=2.14.1->google-api-core[grpc]!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.*,!=2.4.*,!=2.5.*,!=2.6.*,!=2.7.*,<3.0.0dev,>=1.32.0->google-cloud-aiplatform) (1.16.0)\n", - "Requirement already satisfied: 
rsa<5,>=3.1.4 in /usr/local/lib/python3.10/dist-packages (from google-auth<3.0.dev0,>=2.14.1->google-api-core[grpc]!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.*,!=2.4.*,!=2.5.*,!=2.6.*,!=2.7.*,<3.0.0dev,>=1.32.0->google-cloud-aiplatform) (4.9)\n", - "Requirement already satisfied: google-crc32c<2.0dev,>=1.0 in /usr/local/lib/python3.10/dist-packages (from google-resumable-media<3.0dev,>=0.6.0->google-cloud-bigquery<4.0.0dev,>=1.15.0->google-cloud-aiplatform) (1.5.0)\n", - "Collecting mypy-extensions>=0.3.0 (from typing-inspect<1,>=0.4.0->dataclasses-json<0.7,>=0.5.7->langchain)\n", - " Downloading mypy_extensions-1.0.0-py3-none-any.whl (4.7 kB)\n", - "Requirement already satisfied: pyasn1<0.6.0,>=0.4.6 in /usr/local/lib/python3.10/dist-packages (from pyasn1-modules>=0.2.1->google-auth<3.0.dev0,>=2.14.1->google-api-core[grpc]!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.*,!=2.4.*,!=2.5.*,!=2.6.*,!=2.7.*,<3.0.0dev,>=1.32.0->google-cloud-aiplatform) (0.5.1)\n", - "Installing collected packages: google-cloud, asyncio, mypy-extensions, marshmallow, jsonpointer, asyncpg, typing-inspect, jsonpatch, langsmith, dataclasses-json, langchain-core, langchain-community, langchain, google-cloud-aiplatform\n", - "\u001b[33m WARNING: The script langsmith is installed in '/root/.local/bin' which is not on PATH.\n", - " Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\u001b[0m\u001b[33m\n", - "\u001b[0m\u001b[33m WARNING: The script langchain-server is installed in '/root/.local/bin' which is not on PATH.\n", - " Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\u001b[0m\u001b[33m\n", - "\u001b[0m\u001b[33m WARNING: The script tb-gcp-uploader is installed in '/root/.local/bin' which is not on PATH.\n", - " Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\u001b[0m\u001b[33m\n", - "\u001b[0mSuccessfully installed asyncio-3.4.3 asyncpg-0.29.0 dataclasses-json-0.6.4 google-cloud-0.34.0 google-cloud-aiplatform-1.40.0 jsonpatch-1.33 jsonpointer-2.4 langchain-0.1.5 langchain-community-0.0.18 langchain-core-0.1.19 langsmith-0.0.87 marshmallow-3.20.2 mypy-extensions-1.0.0 typing-inspect-0.9.0\n" - ] - }, - { - "output_type": "display_data", - "data": { - "application/vnd.colab-display-data+json": { - "pip_warning": { - "packages": [ - "asyncio", - "google" - ] - } - } - }, - "metadata": {} - }, - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Collecting cloud-sql-python-connector[asyncpg]\n", - " Downloading cloud_sql_python_connector-1.6.0-py2.py3-none-any.whl (35 kB)\n", - "Requirement already satisfied: aiohttp in /usr/local/lib/python3.10/dist-packages (from cloud-sql-python-connector[asyncpg]) (3.9.3)\n", - "Requirement already satisfied: cryptography>=38.0.3 in /usr/local/lib/python3.10/dist-packages (from cloud-sql-python-connector[asyncpg]) (42.0.2)\n", - "Requirement already satisfied: Requests in /usr/local/lib/python3.10/dist-packages (from cloud-sql-python-connector[asyncpg]) (2.31.0)\n", - "Requirement already satisfied: google-auth in /usr/local/lib/python3.10/dist-packages (from cloud-sql-python-connector[asyncpg]) (2.17.3)\n", - "Requirement already satisfied: asyncpg>=0.29.0 in /root/.local/lib/python3.10/site-packages (from cloud-sql-python-connector[asyncpg]) (0.29.0)\n", - "Requirement already satisfied: async-timeout>=4.0.3 in /usr/local/lib/python3.10/dist-packages (from 
asyncpg>=0.29.0->cloud-sql-python-connector[asyncpg]) (4.0.3)\n", - "Requirement already satisfied: cffi>=1.12 in /usr/local/lib/python3.10/dist-packages (from cryptography>=38.0.3->cloud-sql-python-connector[asyncpg]) (1.16.0)\n", - "Requirement already satisfied: aiosignal>=1.1.2 in /usr/local/lib/python3.10/dist-packages (from aiohttp->cloud-sql-python-connector[asyncpg]) (1.3.1)\n", - "Requirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->cloud-sql-python-connector[asyncpg]) (23.2.0)\n", - "Requirement already satisfied: frozenlist>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from aiohttp->cloud-sql-python-connector[asyncpg]) (1.4.1)\n", - "Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.10/dist-packages (from aiohttp->cloud-sql-python-connector[asyncpg]) (6.0.5)\n", - "Requirement already satisfied: yarl<2.0,>=1.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->cloud-sql-python-connector[asyncpg]) (1.9.4)\n", - "Requirement already satisfied: cachetools<6.0,>=2.0.0 in /usr/local/lib/python3.10/dist-packages (from google-auth->cloud-sql-python-connector[asyncpg]) (5.3.2)\n", - "Requirement already satisfied: pyasn1-modules>=0.2.1 in /usr/local/lib/python3.10/dist-packages (from google-auth->cloud-sql-python-connector[asyncpg]) (0.3.0)\n", - "Requirement already satisfied: six>=1.9.0 in /usr/local/lib/python3.10/dist-packages (from google-auth->cloud-sql-python-connector[asyncpg]) (1.16.0)\n", - "Requirement already satisfied: rsa<5,>=3.1.4 in /usr/local/lib/python3.10/dist-packages (from google-auth->cloud-sql-python-connector[asyncpg]) (4.9)\n", - "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from Requests->cloud-sql-python-connector[asyncpg]) (3.3.2)\n", - "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from Requests->cloud-sql-python-connector[asyncpg]) (3.6)\n", - "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from Requests->cloud-sql-python-connector[asyncpg]) (2.0.7)\n", - "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from Requests->cloud-sql-python-connector[asyncpg]) (2024.2.2)\n", - "Requirement already satisfied: pycparser in /usr/local/lib/python3.10/dist-packages (from cffi>=1.12->cryptography>=38.0.3->cloud-sql-python-connector[asyncpg]) (2.21)\n", - "Requirement already satisfied: pyasn1<0.6.0,>=0.4.6 in /usr/local/lib/python3.10/dist-packages (from pyasn1-modules>=0.2.1->google-auth->cloud-sql-python-connector[asyncpg]) (0.5.1)\n", - "Installing collected packages: cloud-sql-python-connector\n", - "Successfully installed cloud-sql-python-connector-1.6.0\n" - ] - }, - { - "output_type": "display_data", - "data": { - "application/vnd.colab-display-data+json": { - "pip_warning": { - "packages": [ - "google" - ] - } - } - }, - "metadata": {} - } - ], - "source": [ - "! pip install langchain langchain-community google-cloud google-cloud-aiplatform asyncio asyncpg --upgrade --user\n", - "! pip install \"cloud-sql-python-connector[asyncpg]\"" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "v40bB_GMcr9f" - }, - "source": [ - ":**Colab only:** Uncomment the following cell to restart the kernel or use the button to restart the kernel. For Vertex AI Workbench you can restart the terminal using the button on top." 
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "6o0iGVIdDD6K"
- },
- "outputs": [],
- "source": [
- "# # Automatically restart kernel after installs so that your environment can access the new packages\n",
- "# import IPython\n",
- "\n",
- "# app = IPython.Application.instance()\n",
- "# app.kernel.do_shutdown(True)"
- ]
- },
- {
- "cell_type": "markdown",
- "source": [
- "### Note\n",
- "\n",
- "If you do not have a GCP project, follow the link below to create a new project.\n",
- "\n",
- "[Create a Google Cloud project](https://developers.google.com/workspace/guides/create-project)\n"
- ],
- "metadata": {
- "id": "cTXTbj4UltKf"
- }
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "Uj02bMRAc9_c"
- },
- "source": [
- "#### Set your project ID\n",
- "\n",
- "If you don't know your project ID, try the following:\n",
- "* Run `gcloud config list`.\n",
- "* Run `gcloud projects list`.\n",
- "* See the support page: [Locate the project ID](https://support.google.com/googleapi/answer/7014113)."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/"
- },
- "id": "wnp1R1PYc9_c",
- "outputId": "6502c721-a2fd-451f-b946-9f7b850d5966"
- },
- "outputs": [
- {
- "output_type": "stream",
- "name": "stdout",
- "text": [
- "Updated property [core/project].\n"
- ]
- }
- ],
- "source": [
- "# @title Project { display-mode: \"form\" }\n",
- "PROJECT_ID = \"gcp_project_id\" # @param {type:\"string\"}\n",
- "\n",
- "# Set the project id\n",
- "! gcloud config set project {PROJECT_ID}"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "38OFiUrIc9_c"
- },
- "source": [
- "#### Set the region\n",
- "\n",
- "You can also change the `REGION` variable used by CloudSQL Postgres. Learn more about [CloudSQL Postgres regions](https://cloud.google.com/sql/docs/postgres/locations)."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "DWQxsk80c9_d"
- },
- "outputs": [],
- "source": [
- "# @title Region { display-mode: \"form\" }\n",
- "REGION = \"US\" # @param {type: \"string\"}"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "aG5_tNwHc9_d"
- },
- "source": [
- "#### Set the dataset and table names\n",
- "\n",
- "They will be your CloudSQL Postgres Vector Store."
- ]
- },
- [Old notebook cells omitted: they are identical to the new cells above, except that the engine class was imported and created as CloudSQLEngine where the new notebook uses PostgreSQLEngine.]
- ],
- "metadata": {
- "colab": {
- "provenance": [],
- "toc_visible": true
- },
- "kernelspec": {
- "display_name": "Python 3",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.11.0"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 0
+ "nbformat": 4,
+ "nbformat_minor": 0 }
\ No newline at end of file
diff --git
a/integration.cloudbuild.yaml b/integration.cloudbuild.yaml index c78148e..39ccd0e 100644 --- a/integration.cloudbuild.yaml +++ b/integration.cloudbuild.yaml @@ -22,3 +22,13 @@ steps: name: python:3.11 entrypoint: python args: ["-m", "pytest"] + env: + - "PROJECT_ID=$PROJECT_ID" + - "INSTANCE_ID=$_INSTANCE_ID" + - "DATABASE_ID=$_DATABASE_ID" + - "REGION=$_REGION" + +substitutions: + _DATABASE_USER: test-instance + _DATABASE_ID: test-database + _REGION: us-central1 diff --git a/src/langchain_google_cloud_sql_pg/cloudsql_vectorstore.py b/src/langchain_google_cloud_sql_pg/cloudsql_vectorstore.py index 96d5683..99e34f4 100644 --- a/src/langchain_google_cloud_sql_pg/cloudsql_vectorstore.py +++ b/src/langchain_google_cloud_sql_pg/cloudsql_vectorstore.py @@ -52,7 +52,7 @@ def __init__( content_column: str = "content", embedding_column: str = "embedding", metadata_columns: List[str] = [], - ignore_metadata_columns: List[str] = None, + ignore_metadata_columns: Optional[List[str]] = None, id_column: str = "langchain_id", metadata_json_column: str = "langchain_metadata", index_query_options: Optional[ @@ -60,10 +60,10 @@ def __init__( ] = None, distance_strategy: DistanceStrategy = DEFAULT_DISTANCE_STRATEGY, overwrite_existing: bool = False, - k: int = None, - score_threshold: float = None, - fetch_k: int = None, - lambda_mult: float = None, + k: Optional[int] = None, + score_threshold: Optional[float] = None, + fetch_k: Optional[int] = None, + lambda_mult: Optional[float] = None, ): """_summary_ @@ -155,7 +155,6 @@ async def __post_init__(self) -> None: # if column_types[content_column] is not "String": # raise ValueError(f"Content column, {content_column}, does not exist.") if self.metadata_json_column in columns: - print("found") self.store_metadata = True all_columns = columns # .keys() @@ -166,7 +165,6 @@ async def __post_init__(self) -> None: del all_columns[self.id_column] del all_columns[self.content_column] del all_columns[self.embedding_column] - # print("key", self.metadata_columns) self.metadata_columns = [k for k, v in all_columns.keys()] @property @@ -203,9 +201,6 @@ async def aadd_embeddings( ) values_stmt += f",'{extra}')" if self.store_metadata else ")" query = insert_stmt + values_stmt - print(query) - print(extra) - print(self.metadata_columns) await self.engine._aexecute_update(query) return ids diff --git a/tests/test_cloudsql_vectorstore.py b/tests/test_cloudsql_vectorstore.py index 6eb95d3..0e49c87 100644 --- a/tests/test_cloudsql_vectorstore.py +++ b/tests/test_cloudsql_vectorstore.py @@ -52,8 +52,7 @@ class FakeEmbeddingsWithAdaDimension(FakeEmbeddings): def embed_documents(self, texts: List[str]) -> List[List[float]]: """Return simple embeddings.""" return [ - [float(1.0)] * (ADA_TOKEN_COUNT - 1) + [float(i)] - for i in range(len(texts)) + [float(1.0)] * (ADA_TOKEN_COUNT - 1) + [float(i)] for i in range(len(texts)) ] def embed_query(self, text: str = "default") -> List[float]: @@ -145,7 +144,7 @@ async def test_override(self, engine): class TestAsync: @pytest_asyncio.fixture # (scope="function") async def engine(self): - engine = await PostgreSQLEngine.from_instance( + engine = PostgreSQLEngine.from_instance( project_id=PROJECT_ID, instance=INSTANCE, region=REGION, @@ -158,7 +157,7 @@ async def engine(self): @pytest_asyncio.fixture # (scope="function") async def engine_custom(self): table_name = CUSTOM_TABLE - engine = await PostgreSQLEngine.from_instance( + engine = PostgreSQLEngine.from_instance( project_id=PROJECT_ID, instance=INSTANCE, region=REGION, @@ -233,9 +232,7 @@ 
async def test_with_metadatas_with_scores( metadata_columns=["page"], ) output = await vs.asimilarity_search_with_score("foo", k=1) - assert output == [ - (Document(page_content="foo", metadata={"page": "0"}), 0.0) - ] + assert output == [(Document(page_content="foo", metadata={"page": "0"}), 0.0)] async def test_with_filter_match(self, engine_custom) -> None: """Test end to end construction and search.""" @@ -249,12 +246,8 @@ async def test_with_filter_match(self, engine_custom) -> None: engine=engine_custom, metadata_columns=["page"], ) - output = await vs.asimilarity_search_with_score( - "foo", k=1, filter="page = '0'" - ) - assert output == [ - (Document(page_content="foo", metadata={"page": "0"}), 0.0) - ] + output = await vs.asimilarity_search_with_score("foo", k=1, filter="page = '0'") + assert output == [(Document(page_content="foo", metadata={"page": "0"}), 0.0)] async def test_with_filter_distant_match( self, @@ -271,9 +264,7 @@ async def test_with_filter_distant_match( engine=engine_custom, metadata_columns=["page"], ) - output = await vs.asimilarity_search_with_score( - "foo", k=1, filter="page = '2'" - ) + output = await vs.asimilarity_search_with_score("foo", k=1, filter="page = '2'") assert output == [ ( Document(page_content="baz", metadata={"page": "2"}), @@ -296,9 +287,7 @@ async def test_with_filter_no_match( engine=engine_custom, metadata_columns=["page"], ) - output = await vs.asimilarity_search_with_score( - "foo", k=1, filter="page = '5'" - ) + output = await vs.asimilarity_search_with_score("foo", k=1, filter="page = '5'") assert output == [] async def test_relevance_score(self, engine_custom) -> None: @@ -371,9 +360,7 @@ async def test_max_marginal_relevance_search_amenities( embedding_service=embeddings_service, engine=engine_custom, ) - output = await vs.amax_marginal_relevance_search( - "coffee", k=1, fetch_k=3 - ) + output = await vs.amax_marginal_relevance_search("coffee", k=1, fetch_k=3) assert "coffee" in output[0].page_content @@ -382,7 +369,7 @@ class TestIndex: @pytest_asyncio.fixture() async def vs(self): table_name = "test_table2" - engine = await PostgreSQLEngine.from_instance( + engine = PostgreSQLEngine.from_instance( project_id=PROJECT_ID, instance=INSTANCE, region=REGION, @@ -405,9 +392,7 @@ async def test_applyindex(self, vs) -> None: await vs.aapply_index(index) async def test_applyindex_l2(self, vs) -> None: - index = HNSWIndex( - name="hnswl2", distance_strategy=DistanceStrategy.EUCLIDEAN - ) + index = HNSWIndex(name="hnswl2", distance_strategy=DistanceStrategy.EUCLIDEAN) await vs.aapply_index(index) async def test_applyindex_ip(self, vs) -> None: @@ -432,7 +417,7 @@ async def test_dropindex(self, vs) -> None: # @pytest_asyncio.fixture(scope="function") # async def engine(self): # table_name = "test_table_sync" -# engine = await PostgreSQLEngine.from_instance( +# engine = PostgreSQLEngine.from_instance( # project_id=PROJECT_ID, # instance=INSTANCE, # region=REGION, @@ -446,7 +431,7 @@ async def test_dropindex(self, vs) -> None: # @pytest_asyncio.fixture(scope="function") # async def engine_custom(self): # table_name = CUSTOM_TABLE -# engine = await PostgreSQLEngine.from_instance( +# engine = PostgreSQLEngine.from_instance( # project_id=PROJECT_ID, # instance=INSTANCE, # region=REGION, From 4f9474d020e116afec80ff6db8e0d8ed5dc6522b Mon Sep 17 00:00:00 2001 From: Averi Kitsch Date: Thu, 8 Feb 2024 21:21:42 -0800 Subject: [PATCH 8/9] mypy --- .../cloudsql_vectorstore.py | 26 +++++++++--------- .../postgresql_engine.py | 27 +++++++------------ 2 
files changed, 24 insertions(+), 29 deletions(-) diff --git a/src/langchain_google_cloud_sql_pg/cloudsql_vectorstore.py b/src/langchain_google_cloud_sql_pg/cloudsql_vectorstore.py index 99e34f4..310ad0d 100644 --- a/src/langchain_google_cloud_sql_pg/cloudsql_vectorstore.py +++ b/src/langchain_google_cloud_sql_pg/cloudsql_vectorstore.py @@ -20,13 +20,12 @@ import uuid from typing import Any, Callable, Iterable, List, Optional, Tuple, Type, Union -import nest_asyncio +import nest_asyncio # type: ignore import numpy as np from langchain_community.vectorstores.utils import maximal_marginal_relevance from langchain_core.documents import Document from langchain_core.embeddings import Embeddings from langchain_core.vectorstores import VectorStore -from pgvector.sqlalchemy import Vector from sqlalchemy import text from .indexes import ( @@ -218,7 +217,10 @@ async def aadd_embeddings( # ) async def aadd_documents( - self, documents: List[Document], ids: List[str] = None, **kwargs: Any + self, + documents: List[Document], + ids: Optional[List[str]] = None, + **kwargs: Any, ) -> List[str]: texts = [doc.page_content for doc in documents] metadatas = [doc.metadata for doc in documents] @@ -349,7 +351,7 @@ async def __query_collection( self, embedding: List[float], k: int = 4, - filter: str = None, + filter: Optional[str] = None, ) -> List[Any]: k = self.k if self.k else k if self.distance_strategy == DistanceStrategy.EUCLIDEAN: @@ -371,7 +373,7 @@ async def asimilarity_search( self, query: str, k: int = 4, - filter: str = None, + filter: Optional[str] = None, **kwargs: Any, ) -> List[Document]: embedding = self.embedding_service.embed_query(text=query) @@ -384,7 +386,7 @@ def similarity_search( self, query: str, k: int = 4, - filter: str = None, + filter: Optional[str] = None, **kwargs: Any, ) -> List[Document]: return self.loop.create_task( @@ -395,7 +397,7 @@ async def asimilarity_search_with_score( self, query: str, k: int = 4, - filter: str = None, + filter: Optional[str] = None, **kwargs: Any, ) -> List[Tuple[Document, float]]: embedding = self.embedding_service.embed_query(query) @@ -417,7 +419,7 @@ async def asimilarity_search_by_vector( self, embedding: List[float], k: int = 4, - filter: str = None, + filter: Optional[str] = None, **kwargs: Any, ) -> List[Document]: docs_and_scores = await self.asimilarity_search_with_score_by_vector( @@ -439,7 +441,7 @@ async def asimilarity_search_with_score_by_vector( self, embedding: List[float], k: int = 4, - filter: str = None, + filter: Optional[str] = None, **kwargs: Any, ) -> List[Tuple[Document, float]]: results = await self.__query_collection(embedding=embedding, k=k, filter=filter) @@ -484,7 +486,7 @@ async def amax_marginal_relevance_search( k: int = 4, fetch_k: int = 20, lambda_mult: float = 0.5, - filter: str = None, + filter: Optional[str] = None, **kwargs: Any, ) -> List[Document]: embedding = self.embedding_service.embed_query(text=query) @@ -519,7 +521,7 @@ async def amax_marginal_relevance_search_by_vector( k: int = 4, fetch_k: int = 20, lambda_mult: float = 0.5, - filter: str = None, + filter: Optional[str] = None, **kwargs: Any, ) -> List[Document]: """Return docs selected using the maximal marginal relevance.""" @@ -542,7 +544,7 @@ async def amax_marginal_relevance_search_with_score_by_vector( k: int = 4, fetch_k: int = 20, lambda_mult: float = 0.5, - filter: str = None, + filter: Optional[str] = None, ) -> List[Tuple[Document, float]]: results = await self.__query_collection( embedding=embedding, k=fetch_k, filter=filter diff --git 
a/src/langchain_google_cloud_sql_pg/postgresql_engine.py b/src/langchain_google_cloud_sql_pg/postgresql_engine.py index 97fe8be..79d7a1c 100644 --- a/src/langchain_google_cloud_sql_pg/postgresql_engine.py +++ b/src/langchain_google_cloud_sql_pg/postgresql_engine.py @@ -17,21 +17,22 @@ # import requests # import sqlalchemy -import asyncio +import asyncio # type: ignore from threading import Thread from typing import TYPE_CHECKING, Dict, List, Optional, Type import aiohttp -import google.auth -import google.auth.transport.requests -import nest_asyncio +import google.auth # type: ignore +import google.auth.transport.requests # type: ignore from google.cloud.sql.connector import Connector, create_async_connector # from pgvector.asyncpg import register_vector from sqlalchemy import Column, text -from sqlalchemy.ext.asyncio import AsyncConnection, AsyncEngine, create_async_engine - -# nest_asyncio.apply() +from sqlalchemy.ext.asyncio import ( + AsyncConnection, + AsyncEngine, + create_async_engine, +) if TYPE_CHECKING: import asyncpg @@ -109,7 +110,7 @@ def from_instance( region: str, instance: str, database: str, - project_id: str = None, + project_id: Optional[str] = None, ) -> PostgreSQLEngine: """Create PostgreSQLEngine connection to the postgres database in the CloudSQL instance. Args: @@ -148,9 +149,7 @@ async def get_conn(): conn = await connector.connect_async( f"{self.project_id}:{self.region}:{self.instance}", "asyncpg", - # user=await _get_iam_principal_email(credentials), - user="postgres", - password="my-pg-pass", + user=await _get_iam_principal_email(credentials), enable_iam_auth=True, db=self.database, ) @@ -189,18 +188,12 @@ async def init_vectorstore_table( overwrite_existing: bool = False, store_metadata: bool = True, ) -> None: - # async with self.engine.connect() as conn: - # Enable pgvector - # await conn.execute(text("CREATE EXTENSION IF NOT EXISTS vector")) await self._aexecute_update("CREATE EXTENSION IF NOT EXISTS vector") # Register the vector type # await register_vector(conn) if overwrite_existing: await self._aexecute_update(f"DROP TABLE {table_name}") - # await conn.execute( - # text(f"TRUNCATE TABLE {table_name} RESET IDENTITY") - # ) # TODO? query = f"""CREATE TABLE IF NOT EXISTS {table_name}( {id_column} UUID PRIMARY KEY, From 8c96f1cc89236cf2ee71a318748c317c71325c23 Mon Sep 17 00:00:00 2001 From: Averi Kitsch Date: Fri, 9 Feb 2024 08:02:03 -0800 Subject: [PATCH 9/9] clean up --- src/langchain_google_cloud_sql_pg/__init__.py | 4 +++- src/langchain_google_cloud_sql_pg/indexes.py | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/src/langchain_google_cloud_sql_pg/__init__.py b/src/langchain_google_cloud_sql_pg/__init__.py index 3ad6dd5..7363b32 100644 --- a/src/langchain_google_cloud_sql_pg/__init__.py +++ b/src/langchain_google_cloud_sql_pg/__init__.py @@ -12,7 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from langchain_google_cloud_sql_pg.cloudsql_vectorstore import CloudSQLVectorStore +from langchain_google_cloud_sql_pg.cloudsql_vectorstore import ( + CloudSQLVectorStore, +) from langchain_google_cloud_sql_pg.postgresql_engine import PostgreSQLEngine __all__ = ["PostgreSQLEngine", "CloudSQLVectorStore"] diff --git a/src/langchain_google_cloud_sql_pg/indexes.py b/src/langchain_google_cloud_sql_pg/indexes.py index 75c6711..690d5dd 100644 --- a/src/langchain_google_cloud_sql_pg/indexes.py +++ b/src/langchain_google_cloud_sql_pg/indexes.py @@ -43,7 +43,7 @@ class HNSWIndex: def __init__( self, name: str = "langchainhnsw", - m: int = 16, # TODO! + m: int = 16, ef_construction: int = 64, partial_indexes: List = [], distance_strategy: DistanceStrategy = DEFAULT_DISTANCE_STRATEGY,