From 1172bd39a4035760dc0cd15e5eae7179be521ed4 Mon Sep 17 00:00:00 2001 From: Andrew Blum Date: Fri, 10 May 2024 17:04:03 -0700 Subject: [PATCH 1/5] initial Nomic embed function --- chromadb/test/ef/test_nomic_ef.py | 32 +++++++++++++++++ chromadb/utils/embedding_functions.py | 52 +++++++++++++++++++++++++++ 2 files changed, 84 insertions(+) create mode 100644 chromadb/test/ef/test_nomic_ef.py diff --git a/chromadb/test/ef/test_nomic_ef.py b/chromadb/test/ef/test_nomic_ef.py new file mode 100644 index 00000000000..875db6976ad --- /dev/null +++ b/chromadb/test/ef/test_nomic_ef.py @@ -0,0 +1,32 @@ +import os + +import pytest + +# import requests +# from requests import HTTPError +# from requests.exceptions import ConnectionError + +from chromadb.utils.embedding_functions import NomicEmbeddingFunction + + +def test_nomic() -> None: + """ + To learn more about the Nomic API: https://docs.nomic.ai/reference/endpoints/nomic-embed-text + Export the NOMIC_API_KEY and optionally the NOMIC_MODEL environment variables. + """ + if os.environ.get("NOMIC_API_KEY") is None: + pytest.skip("NOMIC_API_KEY environment variable not set. Skipping test.") + # try: + # response = requests.get(os.environ.get(???, "")) + # # If the response was successful, no Exception will be raised + # response.raise_for_status() + # except (HTTPError, ConnectionError): + # pytest.skip("Nomic API server can't be reached. 
Skipping test.") + ef = NomicEmbeddingFunction( + api_key=os.environ.get("NOMIC_API_KEY") or "", + model_name=os.environ.get("NOMIC_MODEL") or "nomic-embed-text-v1.5", + ) + embeddings = ef( + ["Henceforth, it is the map that precedes the territory", "nom nom Nomic"] + ) + assert len(embeddings) == 2 diff --git a/chromadb/utils/embedding_functions.py b/chromadb/utils/embedding_functions.py index cc779865675..34637dba997 100644 --- a/chromadb/utils/embedding_functions.py +++ b/chromadb/utils/embedding_functions.py @@ -1017,6 +1017,58 @@ def __call__(self, input: Documents) -> Embeddings: ) +class NomicEmbeddingFunction(EmbeddingFunction[Documents]): + """ + This class is used to generate embeddings for a list of texts using the Nomic Embedding API (https://docs.nomic.ai/reference/endpoints/nomic-embed-text). + """ + + def __init__(self, api_key: str, model_name: str) -> None: + """ + Initialize the Nomic Embedding Function. + + Args: + model_name (str): The name of the model to use for text embeddings. E.g. "nomic-embed-text-v1.5" (see https://docs.nomic.ai/atlas/models/text-embedding for available models). + """ + try: + import requests + except ImportError: + raise ValueError( + "The requests python package is not installed. Please install it with `pip install requests`" + ) + self._api_url = "https://api-atlas.nomic.ai/v1/embedding/text" + self._api_key = api_key + self._model_name = model_name + self._session = requests.Session() + + def __call__(self, input: Documents) -> Embeddings: + """ + Get the embeddings for a list of texts. + + Args: + input (Documents): A list of texts to get embeddings for. + + Returns: + Embeddings: The embeddings for the texts. 
+ + Example: + >>> nomic_ef = NomicEmbeddingFunction(api_key="YOUR_API_KEY", model_name="nomic-embed-text-v1.5") + >>> texts = ["Hello, world!", "How are you?"] + >>> embeddings = nomic_ef(texts) + """ + texts = input if isinstance(input, list) else [input] + headers = { + "Content-Type": "application/json", + "Authorization": f"Bearer {self._api_key}", + } + embeddings = self._session.post( + self._api_url, + headers=headers, + json={"model": self._model_name, "texts": texts}, + ).json() + + return cast(Embeddings, embeddings["embeddings"]) + + # List of all classes in this module _classes = [ name From af22d2785d798e9106825f2b0689b2a5afe81dc2 Mon Sep 17 00:00:00 2001 From: Andrew Blum Date: Mon, 13 May 2024 16:14:20 -0700 Subject: [PATCH 2/5] add more tests and error checking --- chromadb/test/ef/test_nomic_ef.py | 83 +++++++++++++++++++++++---- chromadb/utils/embedding_functions.py | 19 ++++-- 2 files changed, 87 insertions(+), 15 deletions(-) diff --git a/chromadb/test/ef/test_nomic_ef.py b/chromadb/test/ef/test_nomic_ef.py index 875db6976ad..f92903ed4f8 100644 --- a/chromadb/test/ef/test_nomic_ef.py +++ b/chromadb/test/ef/test_nomic_ef.py @@ -2,26 +2,31 @@ import pytest -# import requests -# from requests import HTTPError -# from requests.exceptions import ConnectionError +import requests +from requests import HTTPError +from requests.exceptions import ConnectionError +from pytest_httpserver import HTTPServer +import json +from unittest.mock import patch from chromadb.utils.embedding_functions import NomicEmbeddingFunction +@pytest.mark.skipif( + "NOMIC_API_KEY" not in os.environ, + reason="NOMIC_API_KEY environment variable not set, skipping test.", +) def test_nomic() -> None: """ To learn more about the Nomic API: https://docs.nomic.ai/reference/endpoints/nomic-embed-text Export the NOMIC_API_KEY and optionally the NOMIC_MODEL environment variables. """ - if os.environ.get("NOMIC_API_KEY") is None: - pytest.skip("NOMIC_API_KEY environment variable not set. 
Skipping test.") - # try: - # response = requests.get(os.environ.get(???, "")) - # # If the response was successful, no Exception will be raised - # response.raise_for_status() - # except (HTTPError, ConnectionError): - # pytest.skip("Nomic API server can't be reached. Skipping test.") + try: + response = requests.get("https://api-atlas.nomic.ai/v1/health", timeout=10) + # If the response was successful, no Exception will be raised + response.raise_for_status() + except (HTTPError, ConnectionError): + pytest.skip("Nomic API server can't be reached. Skipping test.") ef = NomicEmbeddingFunction( api_key=os.environ.get("NOMIC_API_KEY") or "", model_name=os.environ.get("NOMIC_MODEL") or "nomic-embed-text-v1.5", @@ -30,3 +35,59 @@ def test_nomic() -> None: ["Henceforth, it is the map that precedes the territory", "nom nom Nomic"] ) assert len(embeddings) == 2 + + +def test_nomic_no_api_key() -> None: + """ + To learn more about the Nomic API: https://docs.nomic.ai/reference/endpoints/nomic-embed-text + Test intentionally excludes the NOMIC_API_KEY. + """ + with pytest.raises(ValueError, match="No Nomic API key provided"): + NomicEmbeddingFunction( + api_key="", + model_name=os.environ.get("NOMIC_MODEL") or "nomic-embed-text-v1.5", + ) + + +@pytest.mark.skipif( + "NOMIC_API_KEY" not in os.environ, + reason="NOMIC_API_KEY environment variable not set, skipping test.", +) +def test_nomic_no_model() -> None: + """ + To learn more about the Nomic API: https://docs.nomic.ai/reference/endpoints/nomic-embed-text + Test intentionaly excludes the NOMIC_MODEL. 
+ """ + with pytest.raises(ValueError, match="No Nomic embedding model provided"): + NomicEmbeddingFunction( + api_key=os.environ.get("NOMIC_API_KEY") or "", + model_name="", + ) + + +@pytest.mark.skipif( + "NOMIC_API_KEY" not in os.environ, + reason="NOMIC_API_KEY environment variable not set, skipping test.", +) +def test_handle_nomic_api_returns_error() -> None: + """ + To learn more about the Nomic API: https://docs.nomic.ai/reference/endpoints/nomic-embed-text + """ + with HTTPServer() as httpserver: + httpserver.expect_oneshot_request( + "/embedding/text", method="POST" + ).respond_with_data( + json.dumps({"detail": "error"}), + status=400, + ) + nomic_ef = NomicEmbeddingFunction( + api_key=os.environ.get("NOMIC_API_KEY") or "", + model_name=os.environ.get("NOMIC_MODEL") or "nomic-embed-text-v1.5", + ) + with patch.object( + nomic_ef, + "_api_url", + f"http://{httpserver.host}:{httpserver.port}/embedding/text", + ): + with pytest.raises(Exception): + nomic_ef(["test text"]) diff --git a/chromadb/utils/embedding_functions.py b/chromadb/utils/embedding_functions.py index 34637dba997..7275b07fd79 100644 --- a/chromadb/utils/embedding_functions.py +++ b/chromadb/utils/embedding_functions.py @@ -907,7 +907,8 @@ def create_langchain_embedding(langchain_embdding_fn: Any): # type: ignore ) class ChromaLangchainEmbeddingFunction( - LangchainEmbeddings, EmbeddingFunction[Union[Documents, Images]] # type: ignore + LangchainEmbeddings, + EmbeddingFunction[Union[Documents, Images]], # type: ignore ): """ This class is used as bridge between langchain embedding functions and custom chroma embedding functions. @@ -1035,6 +1036,12 @@ def __init__(self, api_key: str, model_name: str) -> None: raise ValueError( "The requests python package is not installed. 
Please install it with `pip install requests`" ) + + if not api_key: + raise ValueError("No Nomic API key provided") + if not model_name: + raise ValueError("No Nomic embedding model provided") + self._api_url = "https://api-atlas.nomic.ai/v1/embedding/text" self._api_key = api_key self._model_name = model_name @@ -1060,13 +1067,17 @@ def __call__(self, input: Documents) -> Embeddings: "Content-Type": "application/json", "Authorization": f"Bearer {self._api_key}", } - embeddings = self._session.post( + response = self._session.post( self._api_url, headers=headers, json={"model": self._model_name, "texts": texts}, - ).json() + ) + response.raise_for_status() + response_json = response.json() + if "embeddings" not in response_json: + raise RuntimeError("Nomic API did not return embeddings") - return cast(Embeddings, embeddings["embeddings"]) + return cast(Embeddings, response_json["embeddings"]) # List of all classes in this module From 6ba0864019d85c306a0ebf7de3a533b1408727c1 Mon Sep 17 00:00:00 2001 From: Andrew Blum Date: Tue, 14 May 2024 13:22:31 -0700 Subject: [PATCH 3/5] unskip tests and add pytest mock to dev_reqs --- chromadb/test/ef/test_nomic_ef.py | 20 +++++--------------- requirements_dev.txt | 1 + 2 files changed, 6 insertions(+), 15 deletions(-) diff --git a/chromadb/test/ef/test_nomic_ef.py b/chromadb/test/ef/test_nomic_ef.py index f92903ed4f8..7e8aa8bc801 100644 --- a/chromadb/test/ef/test_nomic_ef.py +++ b/chromadb/test/ef/test_nomic_ef.py @@ -1,14 +1,11 @@ import os - import pytest - import requests from requests import HTTPError from requests.exceptions import ConnectionError from pytest_httpserver import HTTPServer import json from unittest.mock import patch - from chromadb.utils.embedding_functions import NomicEmbeddingFunction @@ -49,29 +46,22 @@ def test_nomic_no_api_key() -> None: ) -@pytest.mark.skipif( - "NOMIC_API_KEY" not in os.environ, - reason="NOMIC_API_KEY environment variable not set, skipping test.", -) def test_nomic_no_model() -> 
None: """ To learn more about the Nomic API: https://docs.nomic.ai/reference/endpoints/nomic-embed-text - Test intentionaly excludes the NOMIC_MODEL. + Test intentionally excludes the NOMIC_MODEL. api_key does not matter since we expect an error before hitting API. """ with pytest.raises(ValueError, match="No Nomic embedding model provided"): NomicEmbeddingFunction( - api_key=os.environ.get("NOMIC_API_KEY") or "", + api_key="does-not-matter", model_name="", ) -@pytest.mark.skipif( - "NOMIC_API_KEY" not in os.environ, - reason="NOMIC_API_KEY environment variable not set, skipping test.", -) def test_handle_nomic_api_returns_error() -> None: """ To learn more about the Nomic API: https://docs.nomic.ai/reference/endpoints/nomic-embed-text + Mocks an error from the Nomic API, so model and api key don't matter. """ with HTTPServer() as httpserver: httpserver.expect_oneshot_request( @@ -81,8 +71,8 @@ def test_handle_nomic_api_returns_error() -> None: status=400, ) nomic_ef = NomicEmbeddingFunction( - api_key=os.environ.get("NOMIC_API_KEY") or "", - model_name=os.environ.get("NOMIC_MODEL") or "nomic-embed-text-v1.5", + api_key="does-not-matter", + model_name="does-not-matter", ) with patch.object( nomic_ef, diff --git a/requirements_dev.txt b/requirements_dev.txt index 4df73fb8c56..34b36f323cc 100644 --- a/requirements_dev.txt +++ b/requirements_dev.txt @@ -8,6 +8,7 @@ mypy-protobuf pre-commit pytest pytest-asyncio +pytest_httpserver==1.0.10 setuptools_scm types-protobuf types-requests==2.30.0.0 From 718344b0197801ed47bdcff8ea58069de6fd2061 Mon Sep 17 00:00:00 2001 From: Andrew Blum Date: Wed, 15 May 2024 01:57:56 -0700 Subject: [PATCH 4/5] Add ending blank line to requirements_dev.txt From 1a1d30dc951edaf97cdc6a8e72fffdd48282764b Mon Sep 17 00:00:00 2001 From: Andrew Blum Date: Wed, 15 May 2024 02:04:14 -0700 Subject: [PATCH 5/5] sort requirements_dev.txt --- requirements_dev.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements_dev.txt 
b/requirements_dev.txt index 9855bc53572..8f6a9a9d83f 100644 --- a/requirements_dev.txt +++ b/requirements_dev.txt @@ -8,8 +8,8 @@ mypy-protobuf pre-commit pytest pytest-asyncio -pytest_httpserver==1.0.10 pytest-xdist +pytest_httpserver==1.0.10 setuptools_scm types-protobuf types-requests==2.30.0.0