Skip to content

Commit

Permalink
feat: Support embedding dimensions on DeepsetCloudDocumentStore (#2995)
Browse files Browse the repository at this point in the history
* Add embedding_dim to dc store

* Remove similarity from query params, it is not used

* Remove unused `return_embedding` parameter

* Remove unused param

* Update the documentation

* Update schemas

* Revert openapi changes

* Revert openapi changes

* Fix openapi

* Fix json schema

* Improve docstrings

Co-authored-by: Agnieszka Marzec <97166305+agnieszka-m@users.noreply.github.com>

* Improve logs

Co-authored-by: Agnieszka Marzec <97166305+agnieszka-m@users.noreply.github.com>

* Update the docs

* Fix similarity

Co-authored-by: Agnieszka Marzec <97166305+agnieszka-m@users.noreply.github.com>
  • Loading branch information
dmigo and agnieszka-m committed Aug 12, 2022
1 parent c0fbe45 commit da7836a
Show file tree
Hide file tree
Showing 5 changed files with 15 additions and 25 deletions.
3 changes: 2 additions & 1 deletion docs/_src/api/api/document_store.md
Original file line number Diff line number Diff line change
Expand Up @@ -4265,7 +4265,7 @@ class DeepsetCloudDocumentStore(KeywordDocumentStore)
#### DeepsetCloudDocumentStore.\_\_init\_\_

```python
def __init__(api_key: str = None, workspace: str = "default", index: Optional[str] = None, duplicate_documents: str = "overwrite", api_endpoint: Optional[str] = None, similarity: str = "dot_product", return_embedding: bool = False, label_index: str = "default")
def __init__(api_key: str = None, workspace: str = "default", index: Optional[str] = None, duplicate_documents: str = "overwrite", api_endpoint: Optional[str] = None, similarity: str = "dot_product", return_embedding: bool = False, label_index: str = "default", embedding_dim: int = 768)
```

A DocumentStore facade enabling you to interact with the documents stored in deepset Cloud.
Expand Down Expand Up @@ -4308,6 +4308,7 @@ If DEEPSET_CLOUD_API_ENDPOINT environment variable is not specified either, defa
more performant with DPR embeddings. 'cosine' is recommended if you are using a Sentence Transformer model.
- `label_index`: index for the evaluation set interface
- `return_embedding`: To return document embedding.
- `embedding_dim`: Specifies the dimensionality of the embedding vector (only needed when using a dense retriever, for example, DensePassageRetriever or EmbeddingRetriever, on top).

<a id="deepsetcloud.DeepsetCloudDocumentStore.get_all_documents"></a>

Expand Down
10 changes: 5 additions & 5 deletions haystack/document_stores/deepsetcloud.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ def __init__(
similarity: str = "dot_product",
return_embedding: bool = False,
label_index: str = "default",
embedding_dim: int = 768,
):
"""
A DocumentStore facade enabling you to interact with the documents stored in deepset Cloud.
Expand Down Expand Up @@ -83,15 +84,15 @@ def __init__(
:param similarity: The similarity function used to compare document vectors. 'dot_product' is the default since it is
more performant with DPR embeddings. 'cosine' is recommended if you are using a Sentence Transformer model.
:param label_index: index for the evaluation set interface
:param return_embedding: To return document embedding.
:param embedding_dim: Specifies the dimensionality of the embedding vector (only needed when using a dense retriever, for example, DensePassageRetriever or EmbeddingRetriever, on top).
"""
self.index = index
self.label_index = label_index
self.duplicate_documents = duplicate_documents
self.similarity = similarity
self.return_embedding = return_embedding
self.embedding_dim = embedding_dim
self.client = DeepsetCloud.get_index_client(
api_key=api_key, api_endpoint=api_endpoint, workspace=workspace, index=index
)
Expand Down Expand Up @@ -128,7 +129,7 @@ def __init__(
else:
logger.info(
f"You are using a DeepsetCloudDocumentStore with an index that does not exist on deepset Cloud. "
f"This document store will always return empty responses. This is especially useful if you want to "
f"This document store always returns empty responses. This can be useful if you want to "
f"create a new pipeline within deepset Cloud.\n"
f"In order to create a new pipeline on deepset Cloud, take the following steps: \n"
f" - create query and indexing pipelines using this DocumentStore\n"
Expand Down Expand Up @@ -271,7 +272,7 @@ def get_document_by_id(
if index is None:
index = self.index

doc_dict = self.client.get_document(id=id, return_embedding=self.return_embedding, index=index, headers=headers)
doc_dict = self.client.get_document(id=id, index=index, headers=headers)
doc: Optional[Document] = None
if doc_dict:
doc = Document.from_dict(doc_dict)
Expand Down Expand Up @@ -407,7 +408,6 @@ def query_by_embedding(
doc_dicts = self.client.query(
query_emb=query_emb.tolist(),
filters=filters,
similarity=self.similarity,
top_k=top_k,
return_embedding=return_embedding,
index=index,
Expand Down
5 changes: 5 additions & 0 deletions haystack/json-schemas/haystack-pipeline-master.schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -394,6 +394,11 @@
"title": "Label Index",
"default": "default",
"type": "string"
},
"embedding_dim": {
"title": "Embedding Dim",
"default": 768,
"type": "integer"
}
},
"additionalProperties": false,
Expand Down
14 changes: 2 additions & 12 deletions haystack/utils/deepsetcloud.py
Original file line number Diff line number Diff line change
Expand Up @@ -371,7 +371,6 @@ def query(
custom_query: Optional[str] = None,
query_emb: Optional[List[float]] = None,
return_embedding: Optional[bool] = None,
similarity: Optional[str] = None,
workspace: Optional[str] = None,
index: Optional[str] = None,
all_terms_must_match: Optional[bool] = None,
Expand All @@ -386,7 +385,6 @@ def query(
"top_k": top_k,
"custom_query": custom_query,
"query_emb": query_emb,
"similarity": similarity,
"return_embedding": return_embedding,
"all_terms_must_match": all_terms_must_match,
"scale_score": scale_score,
Expand All @@ -408,18 +406,10 @@ def stream_documents(
response = self.client.post(url=query_url, json=request, headers=headers, stream=True)
return response.iter_lines()

def get_document(
self,
id: str,
return_embedding: Optional[bool] = False,
workspace: Optional[str] = None,
index: Optional[str] = None,
headers: dict = None,
):
def get_document(self, id: str, workspace: Optional[str] = None, index: Optional[str] = None, headers: dict = None):
index_url = self._build_index_url(workspace=workspace, index=index)
document_url = f"{index_url}/documents/{id}"
query_params = {"return_embedding": return_embedding}
response = self.client.get(url=document_url, headers=headers, query_params=query_params, raise_on_error=False)
response = self.client.get(url=document_url, headers=headers, raise_on_error=False)
doc: Optional[dict] = None
if response.status_code == 200:
doc = response.json()
Expand Down
8 changes: 1 addition & 7 deletions test/document_stores/test_document_store.py
Original file line number Diff line number Diff line change
Expand Up @@ -1880,13 +1880,7 @@ def test_DeepsetCloudDocumentStore_query_by_embedding(deepset_cloud_document_sto
url=f"{DC_API_ENDPOINT}/workspaces/default/indexes/{DC_TEST_INDEX}/documents-query",
match=[
matchers.json_params_matcher(
{
"query_emb": query_emb.tolist(),
"top_k": 10,
"return_embedding": False,
"similarity": "dot_product",
"scale_score": True,
}
{"query_emb": query_emb.tolist(), "top_k": 10, "return_embedding": False, "scale_score": True}
)
],
json=[],
Expand Down

0 comments on commit da7836a

Please sign in to comment.