From adb2b2c312fff55754a4778ad2eb67c8556bdfca Mon Sep 17 00:00:00 2001 From: Zoltan Fedor Date: Wed, 27 Jul 2022 04:07:13 -0400 Subject: [PATCH] Add support for BM25 with the Weaviate document store (#2860) * Upgrading Weaviate used for testing to 1.14.1 from 1.11.0 This has also brought up an issue with one of the tests, which filters for the value "a". This test has started to fail, as "a" is a default stopword in Weaviate, so I have changed this test to look for value "c" instead of value "a" to get around the stopword issue. * Weaviate client upgrade From v3.3.3 to v3.6.0 * Adding BM25 Retrieval to Weaviate Weaviate now supports BM25 retrieval in experimental mode and with some limitations (like it cannot be combined with filters). This commit adds support for inverted index (BM25) querying against Weaviate. * Running Black on the recent code changes * Update Documentation & Code Style * Fixing linting issues after code changes by black * The BM25 query needs to be in all lowercase for now The BM25 query needs to be provided in all lowercase while the functionality is in experimental mode in Weaviate. 
See https://app.slack.com/client/T0181DYT9KN/C017EG2SL3H/thread/C017EG2SL3H-1658790227.208119 * Fixing method parameter docstring to highlight that they are not supported in Weaviate * Update Documentation & Code Style Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> --- .github/workflows/tests.yml | 6 +- CONTRIBUTING.md | 2 +- docs/_src/api/api/document_store.md | 4 +- haystack/document_stores/weaviate.py | 91 +++++++++++++++++---- setup.cfg | 2 +- test/document_stores/test_document_store.py | 4 +- test/document_stores/test_weaviate.py | 6 +- 7 files changed, 91 insertions(+), 24 deletions(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 8c561d893f..02e0191a88 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -369,7 +369,7 @@ jobs: uses: ./.github/actions/python_cache/ - name: Setup Weaviate - run: docker run -d -p 8080:8080 --name haystack_test_weaviate --env AUTHENTICATION_ANONYMOUS_ACCESS_ENABLED='true' --env PERSISTENCE_DATA_PATH='/var/lib/weaviate' semitechnologies/weaviate:1.11.0 + run: docker run -d -p 8080:8080 --name haystack_test_weaviate --env AUTHENTICATION_ANONYMOUS_ACCESS_ENABLED='true' --env PERSISTENCE_DATA_PATH='/var/lib/weaviate' --env ENABLE_EXPERIMENTAL_BM25='true' semitechnologies/weaviate:1.14.1 # TODO Let's try to remove this one from the unit tests - name: Install pdftotext @@ -401,7 +401,7 @@ jobs: # prefix: windows # - name: Setup Weaviate - # run: docker run -d -p 8080:8080 --name haystack_test_weaviate --env AUTHENTICATION_ANONYMOUS_ACCESS_ENABLED='true' --env PERSISTENCE_DATA_PATH='/var/lib/weaviate' semitechnologies/weaviate:1.11.0 + # run: docker run -d -p 8080:8080 --name haystack_test_weaviate --env AUTHENTICATION_ANONYMOUS_ACCESS_ENABLED='true' --env PERSISTENCE_DATA_PATH='/var/lib/weaviate' --env ENABLE_EXPERIMENTAL_BM25='true' semitechnologies/weaviate:1.14.1 # - name: Install pdftotext # run: | @@ -540,7 +540,7 @@ jobs: sudo 
docker-compose ps - name: Run Weaviate - run: docker run -d -p 8080:8080 --name haystack_test_weaviate --env AUTHENTICATION_ANONYMOUS_ACCESS_ENABLED='true' --env PERSISTENCE_DATA_PATH='/var/lib/weaviate' semitechnologies/weaviate:1.11.0 + run: docker run -d -p 8080:8080 --name haystack_test_weaviate --env AUTHENTICATION_ANONYMOUS_ACCESS_ENABLED='true' --env PERSISTENCE_DATA_PATH='/var/lib/weaviate' --env ENABLE_EXPERIMENTAL_BM25='true' semitechnologies/weaviate:1.14.1 - name: Run GraphDB run: docker run -d -p 7200:7200 --name haystack_test_graphdb deepset/graphdb-free:9.4.1-adoptopenjdk11 diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index f12e24fc15..92a7853fea 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -170,7 +170,7 @@ wget https://github.com/milvus-io/milvus/releases/download/v2.0.0/milvus-standal docker-compose up -d # Weaviate -docker run -d -p 8080:8080 --name haystack_test_weaviate --env AUTHENTICATION_ANONYMOUS_ACCESS_ENABLED='true' --env PERSISTENCE_DATA_PATH='/var/lib/weaviate' semitechnologies/weaviate:1.11.0 +docker run -d -p 8080:8080 --name haystack_test_weaviate --env AUTHENTICATION_ANONYMOUS_ACCESS_ENABLED='true' --env PERSISTENCE_DATA_PATH='/var/lib/weaviate' --env ENABLE_EXPERIMENTAL_BM25='true' semitechnologies/weaviate:1.14.1 # GraphDB docker run -d -p 7200:7200 --name haystack_test_graphdb deepset/graphdb-free:9.4.1-adoptopenjdk11 diff --git a/docs/_src/api/api/document_store.md b/docs/_src/api/api/document_store.md index e7d9bba9ef..4c71fac88e 100644 --- a/docs/_src/api/api/document_store.md +++ b/docs/_src/api/api/document_store.md @@ -3689,7 +3689,7 @@ operation. 
#### WeaviateDocumentStore.query ```python -def query(query: Optional[str] = None, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, top_k: int = 10, custom_query: Optional[str] = None, index: Optional[str] = None, scale_score: bool = True) -> List[Document] +def query(query: Optional[str] = None, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, top_k: int = 10, all_terms_must_match: bool = False, custom_query: Optional[str] = None, index: Optional[str] = None, headers: Optional[Dict[str, str]] = None, scale_score: bool = True) -> List[Document] ``` Scan through documents in DocumentStore and return a small number documents @@ -3763,9 +3763,11 @@ operation. } ``` - `top_k`: How many documents to return per query. +- `all_terms_must_match`: Not used in Weaviate. - `custom_query`: Custom query that will executed using query.raw method, for more details refer https://weaviate.io/developers/weaviate/current/graphql-references/filters.html - `index`: The name of the index in the DocumentStore from which to retrieve documents +- `headers`: Not used in Weaviate. - `scale_score`: Whether to scale the similarity score to the unit interval (range of [0,1]). If true (default) similarity scores (e.g. cosine or dot_product) which naturally have a different value range will be scaled to a range of [0,1], where 1 means extremely relevant. Otherwise raw similarity scores (e.g. cosine or dot_product) will be used. 
diff --git a/haystack/document_stores/weaviate.py b/haystack/document_stores/weaviate.py index f24c83ce60..c845a59259 100644 --- a/haystack/document_stores/weaviate.py +++ b/haystack/document_stores/weaviate.py @@ -11,7 +11,7 @@ try: import weaviate - from weaviate import client, AuthClientPassword + from weaviate import client, AuthClientPassword, gql except (ImportError, ModuleNotFoundError) as ie: from haystack.utils.import_utils import _optional_component_not_installed @@ -814,8 +814,10 @@ def query( query: Optional[str] = None, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, top_k: int = 10, + all_terms_must_match: bool = False, custom_query: Optional[str] = None, index: Optional[str] = None, + headers: Optional[Dict[str, str]] = None, scale_score: bool = True, ) -> List[Document]: """ @@ -887,36 +889,95 @@ def query( } ``` :param top_k: How many documents to return per query. + :param all_terms_must_match: Not used in Weaviate. :param custom_query: Custom query that will executed using query.raw method, for more details refer https://weaviate.io/developers/weaviate/current/graphql-references/filters.html :param index: The name of the index in the DocumentStore from which to retrieve documents + :param headers: Not used in Weaviate. :param scale_score: Whether to scale the similarity score to the unit interval (range of [0,1]). If true (default) similarity scores (e.g. cosine or dot_product) which naturally have a different value range will be scaled to a range of [0,1], where 1 means extremely relevant. Otherwise raw similarity scores (e.g. cosine or dot_product) will be used. 
""" + if headers: + raise NotImplementedError("Weaviate does not support Custom HTTP headers!") + + if all_terms_must_match: + raise NotImplementedError("The `all_terms_must_match` option is not supported in Weaviate!") + index = self._sanitize_index_name(index) or self.index # Build the properties to retrieve from Weaviate properties = self._get_current_properties(index) properties.append("_additional {id, certainty, vector}") - if custom_query: - query_output = self.weaviate_client.query.raw(custom_query) - elif filters: - filter_dict = LogicalFilterClause.parse(filters).convert_to_weaviate() - query_output = ( - self.weaviate_client.query.get(class_name=index, properties=properties) - .with_where(filter_dict) - .with_limit(top_k) - .do() - ) + if query is None: + + # Retrieval via custom query, no BM25 + if custom_query: + query_output = self.weaviate_client.query.raw(custom_query) + + # Naive retrieval without BM25, only filtering + elif filters: + filter_dict = LogicalFilterClause.parse(filters).convert_to_weaviate() + query_output = ( + self.weaviate_client.query.get(class_name=index, properties=properties) + .with_where(filter_dict) + .with_limit(top_k) + .do() + ) + else: + raise NotImplementedError( + "Weaviate does not support the retrieval of records without specifying a query or a filter!" + ) + + # Default Retrieval via BM25 using the user's query on `self.content_field` else: - raise NotImplementedError( - "Weaviate does not support inverted index text query. However, " - "it allows to search by filters example : {'content': 'some text'} or " - "use a custom GraphQL query in text format!" + logger.warning( + "As of v1.14.1 Weaviate's BM25 retrieval is still in experimental phase, " + "so use it with care! To turn on the BM25 experimental feature in Weaviate " + "you need to start it with the `ENABLE_EXPERIMENTAL_BM25='true'` " + "environmental variable." 
+ ) + + # Retrieval with BM25 AND filtering + if filters: + raise NotImplementedError( + "Weaviate currently (v1.14.1) does not support filters WITH inverted index text query (eg BM25)!" + ) + + # Once Weaviate starts supporting filters with BM25: + # filter_dict = LogicalFilterClause.parse(filters).convert_to_weaviate() + # gql_query = weaviate.gql.get.GetBuilder(class_name=index, + # properties=properties, + # connection=self.weaviate_client) \ + # .with_near_vector({'vector': [0, 0]}) \ + # .with_where(filter_dict) \ + # .with_limit(top_k) \ + # .build() + + # BM25 retrieval without filtering + gql_query = ( + gql.get.GetBuilder(class_name=index, properties=properties, connection=self.weaviate_client) + .with_near_vector({"vector": [0, 0]}) + .with_limit(top_k) + .build() ) + # Build the BM25 part of the GQL manually. + # Currently the GetBuilder of the Weaviate-client (v3.6.0) + # does not support the BM25 part of GQL building, so + # the BM25 part needs to be added manually. + # The BM25 query needs to be provided all lowercase while + # the functionality is in experimental mode in Weaviate, + # see https://app.slack.com/client/T0181DYT9KN/C017EG2SL3H/thread/C017EG2SL3H-1658790227.208119 + bm25_gql_query = f"""bm25: {{ + query: "{query.replace('"', ' ').lower()}", + properties: ["{self.content_field}"] + }}""" + gql_query = gql_query.replace("nearVector: {vector: [0, 0]}", bm25_gql_query) + + query_output = self.weaviate_client.query.raw(gql_query) + results = [] if query_output and "data" in query_output and "Get" in query_output.get("data"): if query_output.get("data").get("Get").get(index): diff --git a/setup.cfg b/setup.cfg index 6b40c8d2f1..8c035d7d65 100644 --- a/setup.cfg +++ b/setup.cfg @@ -141,7 +141,7 @@ only-milvus = milvus = farm-haystack[sql,only-milvus] weaviate = - weaviate-client==3.3.3 + weaviate-client==3.6.0 only-pinecone = pinecone-client pinecone = diff --git a/test/document_stores/test_document_store.py 
b/test/document_stores/test_document_store.py index fbdcb3cb3f..79361f065e 100644 --- a/test/document_stores/test_document_store.py +++ b/test/document_stores/test_document_store.py @@ -389,14 +389,14 @@ def test_get_documents_by_id(document_store: BaseDocumentStore): def test_get_document_count(document_store: BaseDocumentStore): documents = [ - {"content": "text1", "id": "1", "meta_field_for_count": "a"}, + {"content": "text1", "id": "1", "meta_field_for_count": "c"}, {"content": "text2", "id": "2", "meta_field_for_count": "b"}, {"content": "text3", "id": "3", "meta_field_for_count": "b"}, {"content": "text4", "id": "4", "meta_field_for_count": "b"}, ] document_store.write_documents(documents) assert document_store.get_document_count() == 4 - assert document_store.get_document_count(filters={"meta_field_for_count": ["a"]}) == 1 + assert document_store.get_document_count(filters={"meta_field_for_count": ["c"]}) == 1 assert document_store.get_document_count(filters={"meta_field_for_count": ["b"]}) == 3 diff --git a/test/document_stores/test_weaviate.py b/test/document_stores/test_weaviate.py index 1c020ce85f..2ad144fa17 100644 --- a/test/document_stores/test_weaviate.py +++ b/test/document_stores/test_weaviate.py @@ -97,8 +97,12 @@ def test_query_by_embedding(document_store_with_docs): @pytest.mark.parametrize("document_store_with_docs", ["weaviate"], indirect=True) def test_query(document_store_with_docs): query_text = "My name is Carla and I live in Berlin" + docs = document_store_with_docs.query(query_text) + assert len(docs) == 3 + + # BM25 retrieval WITH filters is not yet supported as of Weaviate v1.14.1 with pytest.raises(Exception): - docs = document_store_with_docs.query(query_text) + docs = document_store_with_docs.query(query_text, filters={"name": ["filename2"]}) docs = document_store_with_docs.query(filters={"name": ["filename2"]}) assert len(docs) == 1