feat: Support embedding dimensions on DeepsetCloudDocumentStore (#2995)

* Add embedding_dim to dc store
* Remove similarity from query params, it is not used
* Remove unused `return_embedding` parameter
* Remove unused param
* Update the documentation
* Update schemas
* Revert openapi changes
* Revert openapi changes
* Fix openapi
* Fix json schema
* Improve docstrings

Co-authored-by: Agnieszka Marzec <97166305+agnieszka-m@users.noreply.github.com>

* Improve logs

Co-authored-by: Agnieszka Marzec <97166305+agnieszka-m@users.noreply.github.com>

* Update the docs
* Fix similarity

Co-authored-by: Agnieszka Marzec <97166305+agnieszka-m@users.noreply.github.com>
deepset-ai · Aug 12, 2022 · da7836a · da7836a
1 parent c0fbe45
commit da7836a
Show file tree

Hide file tree

Showing 5 changed files with 15 additions and 25 deletions.
diff --git a/docs/_src/api/api/document_store.md b/docs/_src/api/api/document_store.md
@@ -4265,7 +4265,7 @@ class DeepsetCloudDocumentStore(KeywordDocumentStore)
 #### DeepsetCloudDocumentStore.\_\_init\_\_
 
 ```python
-def __init__(api_key: str = None, workspace: str = "default", index: Optional[str] = None, duplicate_documents: str = "overwrite", api_endpoint: Optional[str] = None, similarity: str = "dot_product", return_embedding: bool = False, label_index: str = "default")
+def __init__(api_key: str = None, workspace: str = "default", index: Optional[str] = None, duplicate_documents: str = "overwrite", api_endpoint: Optional[str] = None, similarity: str = "dot_product", return_embedding: bool = False, label_index: str = "default", embedding_dim: int = 768)
 ```
 
 A DocumentStore facade enabling you to interact with the documents stored in deepset Cloud.
@@ -4308,6 +4308,7 @@ If DEEPSET_CLOUD_API_ENDPOINT environment variable is not specified either, defa
 more performant with DPR embeddings. 'cosine' is recommended if you are using a Sentence Transformer model.
 - `label_index`: index for the evaluation set interface
 - `return_embedding`: To return document embedding.
+- `embedding_dim`: Specifies the dimensionality of the embedding vector (only needed when using a dense retriever, for example, DensePassageRetriever or EmbeddingRetriever, on top).
 
 <a id="deepsetcloud.DeepsetCloudDocumentStore.get_all_documents"></a>
 

diff --git a/haystack/document_stores/deepsetcloud.py b/haystack/document_stores/deepsetcloud.py
@@ -45,6 +45,7 @@ def __init__(
         similarity: str = "dot_product",
         return_embedding: bool = False,
         label_index: str = "default",
+        embedding_dim: int = 768,
     ):
         """
         A DocumentStore facade enabling you to interact with the documents stored in deepset Cloud.
@@ -83,15 +84,15 @@ def __init__(
         :param similarity: The similarity function used to compare document vectors. 'dot_product' is the default since it is
                            more performant with DPR embeddings. 'cosine' is recommended if you are using a Sentence Transformer model.
         :param label_index: index for the evaluation set interface
-
         :param return_embedding: To return document embedding.
-
+        :param embedding_dim: Specifies the dimensionality of the embedding vector (only needed when using a dense retriever, for example, DensePassageRetriever or EmbeddingRetriever, on top).
         """
         self.index = index
         self.label_index = label_index
         self.duplicate_documents = duplicate_documents
         self.similarity = similarity
         self.return_embedding = return_embedding
+        self.embedding_dim = embedding_dim
         self.client = DeepsetCloud.get_index_client(
             api_key=api_key, api_endpoint=api_endpoint, workspace=workspace, index=index
         )
@@ -128,7 +129,7 @@ def __init__(
         else:
             logger.info(
                 f"You are using a DeepsetCloudDocumentStore with an index that does not exist on deepset Cloud. "
-                f"This document store will always return empty responses. This is especially useful if you want to "
+                f"This document store always returns empty responses. This can be useful if you want to "
                 f"create a new pipeline within deepset Cloud.\n"
                 f"In order to create a new pipeline on deepset Cloud, take the following steps: \n"
                 f"  - create query and indexing pipelines using this DocumentStore\n"
@@ -271,7 +272,7 @@ def get_document_by_id(
         if index is None:
             index = self.index
 
-        doc_dict = self.client.get_document(id=id, return_embedding=self.return_embedding, index=index, headers=headers)
+        doc_dict = self.client.get_document(id=id, index=index, headers=headers)
         doc: Optional[Document] = None
         if doc_dict:
             doc = Document.from_dict(doc_dict)
@@ -407,7 +408,6 @@ def query_by_embedding(
         doc_dicts = self.client.query(
             query_emb=query_emb.tolist(),
             filters=filters,
-            similarity=self.similarity,
             top_k=top_k,
             return_embedding=return_embedding,
             index=index,

diff --git a/haystack/json-schemas/haystack-pipeline-master.schema.json b/haystack/json-schemas/haystack-pipeline-master.schema.json
@@ -394,6 +394,11 @@
               "title": "Label Index",
               "default": "default",
               "type": "string"
+            },
+            "embedding_dim": {
+              "title": "Embedding Dim",
+              "default": 768,
+              "type": "integer"
             }
           },
           "additionalProperties": false,

diff --git a/haystack/utils/deepsetcloud.py b/haystack/utils/deepsetcloud.py
@@ -371,7 +371,6 @@ def query(
         custom_query: Optional[str] = None,
         query_emb: Optional[List[float]] = None,
         return_embedding: Optional[bool] = None,
-        similarity: Optional[str] = None,
         workspace: Optional[str] = None,
         index: Optional[str] = None,
         all_terms_must_match: Optional[bool] = None,
@@ -386,7 +385,6 @@ def query(
             "top_k": top_k,
             "custom_query": custom_query,
             "query_emb": query_emb,
-            "similarity": similarity,
             "return_embedding": return_embedding,
             "all_terms_must_match": all_terms_must_match,
             "scale_score": scale_score,
@@ -408,18 +406,10 @@ def stream_documents(
         response = self.client.post(url=query_url, json=request, headers=headers, stream=True)
         return response.iter_lines()
 
-    def get_document(
-        self,
-        id: str,
-        return_embedding: Optional[bool] = False,
-        workspace: Optional[str] = None,
-        index: Optional[str] = None,
-        headers: dict = None,
-    ):
+    def get_document(self, id: str, workspace: Optional[str] = None, index: Optional[str] = None, headers: dict = None):
         index_url = self._build_index_url(workspace=workspace, index=index)
         document_url = f"{index_url}/documents/{id}"
-        query_params = {"return_embedding": return_embedding}
-        response = self.client.get(url=document_url, headers=headers, query_params=query_params, raise_on_error=False)
+        response = self.client.get(url=document_url, headers=headers, raise_on_error=False)
         doc: Optional[dict] = None
         if response.status_code == 200:
             doc = response.json()

diff --git a/test/document_stores/test_document_store.py b/test/document_stores/test_document_store.py
@@ -1880,13 +1880,7 @@ def test_DeepsetCloudDocumentStore_query_by_embedding(deepset_cloud_document_sto
             url=f"{DC_API_ENDPOINT}/workspaces/default/indexes/{DC_TEST_INDEX}/documents-query",
             match=[
                 matchers.json_params_matcher(
-                    {
-                        "query_emb": query_emb.tolist(),
-                        "top_k": 10,
-                        "return_embedding": False,
-                        "similarity": "dot_product",
-                        "scale_score": True,
-                    }
+                    {"query_emb": query_emb.tolist(), "top_k": 10, "return_embedding": False, "scale_score": True}
                 )
             ],
             json=[],