From 83724b74e30707c1bf8ab8c9758e8e52a1c59eed Mon Sep 17 00:00:00 2001 From: ZanSara Date: Thu, 28 Sep 2023 14:42:19 +0200 Subject: [PATCH 01/33] feat: Make `metadata` optional in AnswerBuilder (#5909) * optional metadata * improve docstring --- .../components/builders/answer_builder.py | 10 +++++--- .../builders/test_answer_builder.py | 24 ++++++++++++++++++- 2 files changed, 30 insertions(+), 4 deletions(-) diff --git a/haystack/preview/components/builders/answer_builder.py b/haystack/preview/components/builders/answer_builder.py index af644d22a0..201d43b9c7 100644 --- a/haystack/preview/components/builders/answer_builder.py +++ b/haystack/preview/components/builders/answer_builder.py @@ -42,7 +42,7 @@ def run( self, query: str, replies: List[str], - metadata: List[Dict[str, Any]], + metadata: Optional[List[Dict[str, Any]]] = None, documents: Optional[List[Document]] = None, pattern: Optional[str] = None, reference_pattern: Optional[str] = None, @@ -52,7 +52,8 @@ def run( :param query: The query used in the prompts for the Generator. A strings. :param replies: The output of the Generator. A list of strings. - :param metadata: The metadata returned by the Generator. A list of dictionaries. + :param metadata: The metadata returned by the Generator. An optional list of dictionaries. If not specified, + the generated answer will contain no metadata. :param documents: The documents used as input to the Generator. A list of `Document` objects. If `documents` are specified, they are added to the `Answer` objects. If both `documents` and `reference_pattern` are specified, the documents referenced in the @@ -73,8 +74,11 @@ def run( If not specified, no parsing is done, and all documents are referenced. Default: `None`. """ - if len(replies) != len(metadata): + if not metadata: + metadata = [{}] * len(replies) + elif len(replies) != len(metadata): raise ValueError(f"Number of replies ({len(replies)}), and metadata ({len(metadata)}) must match.") + if pattern: AnswerBuilder._check_num_groups_in_regex(pattern) diff --git a/test/preview/components/builders/test_answer_builder.py b/test/preview/components/builders/test_answer_builder.py index 03b4a42f9a..e93ae2f030 100644 --- a/test/preview/components/builders/test_answer_builder.py +++ b/test/preview/components/builders/test_answer_builder.py @@ -36,7 +36,29 @@ def test_from_dict(self): def test_run_unmatching_input_len(self): component = AnswerBuilder() with pytest.raises(ValueError): - component.run(query="query", replies=["reply1", "reply2"], metadata=[]) + component.run(query="query", replies=["reply1"], metadata=[{"test": "meta"}, {"test": "meta2"}]) + + @pytest.mark.unit + def test_run_without_meta(self): + component = AnswerBuilder() + output = component.run(query="query", replies=["reply1"]) + answers = output["answers"] + assert answers[0].data == "reply1" + assert answers[0].metadata == {} + assert answers[0].query == "query" + assert answers[0].documents == [] + assert isinstance(answers[0], GeneratedAnswer) + + @pytest.mark.unit + def test_run_meta_is_an_empty_list(self): + component = AnswerBuilder() + output = component.run(query="query", replies=["reply1"], metadata=[]) + answers = output["answers"] + assert answers[0].data == "reply1" + assert answers[0].metadata == {} + assert answers[0].query == "query" + assert answers[0].documents == [] + assert isinstance(answers[0], GeneratedAnswer) def test_run_without_pattern(self): component = AnswerBuilder() From d4aacad5f9ebf141ad663d8620626871b85734f7 Mon Sep 17 00:00:00 2001 From: Stefano 
Fiorucci <44616784+anakin87@users.noreply.github.com> Date: Thu, 28 Sep 2023 15:42:51 +0200 Subject: [PATCH 02/33] feat: `OpenAIDocumentEmbedder` (#5822) * first draft * release note * mypy fix * fix test * corrections * pr feedback * better secrets handling and new tests * missing imports in embedders/__init__.py * better format condition * address feedback --- .../preview/components/embedders/__init__.py | 9 +- .../embedders/openai_document_embedder.py | 164 +++++++++ ...ai-document-embedder-d2f59ba1f21babcb.yaml | 6 + .../test_openai_document_embedder.py | 334 ++++++++++++++++++ 4 files changed, 512 insertions(+), 1 deletion(-) create mode 100644 haystack/preview/components/embedders/openai_document_embedder.py create mode 100644 releasenotes/notes/openai-document-embedder-d2f59ba1f21babcb.yaml create mode 100644 test/preview/components/embedders/test_openai_document_embedder.py diff --git a/haystack/preview/components/embedders/__init__.py b/haystack/preview/components/embedders/__init__.py index de8b93958f..a0840d7e0a 100644 --- a/haystack/preview/components/embedders/__init__.py +++ b/haystack/preview/components/embedders/__init__.py @@ -2,5 +2,12 @@ from haystack.preview.components.embedders.sentence_transformers_document_embedder import ( SentenceTransformersDocumentEmbedder, ) +from haystack.preview.components.embedders.openai_document_embedder import OpenAIDocumentEmbedder +from haystack.preview.components.embedders.openai_text_embedder import OpenAITextEmbedder -__all__ = ["SentenceTransformersTextEmbedder", "SentenceTransformersDocumentEmbedder"] +__all__ = [ + "SentenceTransformersTextEmbedder", + "SentenceTransformersDocumentEmbedder", + "OpenAITextEmbedder", + "OpenAIDocumentEmbedder", +] diff --git a/haystack/preview/components/embedders/openai_document_embedder.py b/haystack/preview/components/embedders/openai_document_embedder.py new file mode 100644 index 0000000000..a2e51530bd --- /dev/null +++ b/haystack/preview/components/embedders/openai_document_embedder.py @@ -0,0 +1,164 @@ +from typing import List, Optional, Dict, Any, Tuple +import os + +import openai +from tqdm import tqdm + + +from haystack.preview import component, Document, default_to_dict, default_from_dict + + +@component +class OpenAIDocumentEmbedder: + """ + A component for computing Document embeddings using OpenAI models. + The embedding of each Document is stored in the `embedding` field of the Document. + """ + + def __init__( + self, + api_key: Optional[str] = None, + model_name: str = "text-embedding-ada-002", + organization: Optional[str] = None, + prefix: str = "", + suffix: str = "", + batch_size: int = 32, + progress_bar: bool = True, + metadata_fields_to_embed: Optional[List[str]] = None, + embedding_separator: str = "\n", + ): + """ + Create a OpenAIDocumentEmbedder component. + :param api_key: The OpenAI API key. It can be explicitly provided or automatically read from the + environment variable OPENAI_API_KEY (recommended). + :param model_name: The name of the model to use. + :param api_base_url: The OpenAI API Base url, defaults to `https://api.openai.com/v1`. + :param organization: The OpenAI-Organization ID, defaults to `None`. For more details, see OpenAI + [documentation](https://platform.openai.com/docs/api-reference/requesting-organization). + :param prefix: A string to add to the beginning of each text. + :param suffix: A string to add to the end of each text. + :param batch_size: Number of Documents to encode at once. + :param progress_bar: Whether to show a progress bar or not. 
Can be helpful to disable in production deployments + to keep the logs clean. + :param metadata_fields_to_embed: List of meta fields that should be embedded along with the Document text. + :param embedding_separator: Separator used to concatenate the meta fields to the Document text. + """ + + if api_key is None: + try: + api_key = os.environ["OPENAI_API_KEY"] + except KeyError as e: + raise ValueError( + "OpenAIDocumentEmbedder expects an OpenAI API key. " + "Set the OPENAI_API_KEY environment variable (recommended) or pass it explicitly." + ) from e + + self.model_name = model_name + self.organization = organization + self.prefix = prefix + self.suffix = suffix + self.batch_size = batch_size + self.progress_bar = progress_bar + self.metadata_fields_to_embed = metadata_fields_to_embed or [] + self.embedding_separator = embedding_separator + + openai.api_key = api_key + if organization is not None: + openai.organization = organization + + def to_dict(self) -> Dict[str, Any]: + """ + This method overrides the default serializer in order to avoid leaking the `api_key` value passed + to the constructor. + """ + return default_to_dict( + self, + model_name=self.model_name, + organization=self.organization, + prefix=self.prefix, + suffix=self.suffix, + batch_size=self.batch_size, + progress_bar=self.progress_bar, + metadata_fields_to_embed=self.metadata_fields_to_embed, + embedding_separator=self.embedding_separator, + ) + + @classmethod + def from_dict(cls, data: Dict[str, Any]) -> "OpenAIDocumentEmbedder": + """ + Deserialize this component from a dictionary. + """ + return default_from_dict(cls, data) + + def _prepare_texts_to_embed(self, documents: List[Document]) -> List[str]: + """ + Prepare the texts to embed by concatenating the Document text with the metadata fields to embed. + """ + texts_to_embed = [] + for doc in documents: + meta_values_to_embed = [ + str(doc.metadata[key]) + for key in self.metadata_fields_to_embed + if key in doc.metadata and doc.metadata[key] is not None + ] + + text_to_embed = ( + self.prefix + self.embedding_separator.join(meta_values_to_embed + [doc.text or ""]) + self.suffix + ) + + # copied from OpenAI embedding_utils (https://github.com/openai/openai-python/blob/main/openai/embeddings_utils.py) + # replace newlines, which can negatively affect performance. + text_to_embed = text_to_embed.replace("\n", " ") + texts_to_embed.append(text_to_embed) + return texts_to_embed + + def _embed_batch(self, texts_to_embed: List[str], batch_size: int) -> Tuple[List[str], Dict[str, Any]]: + """ + Embed a list of texts in batches. + """ + + all_embeddings = [] + metadata = {} + for i in tqdm( + range(0, len(texts_to_embed), batch_size), disable=not self.progress_bar, desc="Calculating embeddings" + ): + batch = texts_to_embed[i : i + batch_size] + response = openai.Embedding.create(model=self.model_name, input=batch) + embeddings = [el["embedding"] for el in response.data] + all_embeddings.extend(embeddings) + + if "model" not in metadata: + metadata["model"] = response.model + if "usage" not in metadata: + metadata["usage"] = dict(response.usage.items()) + else: + metadata["usage"]["prompt_tokens"] += response.usage.prompt_tokens + metadata["usage"]["total_tokens"] += response.usage.total_tokens + + return all_embeddings, metadata + + @component.output_types(documents=List[Document], metadata=Dict[str, Any]) + def run(self, documents: List[Document]): + """ + Embed a list of Documents. + The embedding of each Document is stored in the `embedding` field of the Document. 
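+
+        Usage example (an illustrative sketch, assuming the OPENAI_API_KEY environment variable is set):
+
+            from haystack.preview import Document
+            from haystack.preview.components.embedders import OpenAIDocumentEmbedder
+
+            embedder = OpenAIDocumentEmbedder()
+            result = embedder.run(documents=[Document(text="Berlin is the capital of Germany.")])
+            embedding = result["documents"][0].embedding  # per-Document embedding vector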
+ + :param documents: A list of Documents to embed. + """ + if not isinstance(documents, list) or documents and not isinstance(documents[0], Document): + raise TypeError( + "OpenAIDocumentEmbedder expects a list of Documents as input." + "In case you want to embed a string, please use the OpenAITextEmbedder." + ) + + texts_to_embed = self._prepare_texts_to_embed(documents=documents) + + embeddings, metadata = self._embed_batch(texts_to_embed=texts_to_embed, batch_size=self.batch_size) + + documents_with_embeddings = [] + for doc, emb in zip(documents, embeddings): + doc_as_dict = doc.to_dict() + doc_as_dict["embedding"] = emb + documents_with_embeddings.append(Document.from_dict(doc_as_dict)) + + return {"documents": documents_with_embeddings, "metadata": metadata} diff --git a/releasenotes/notes/openai-document-embedder-d2f59ba1f21babcb.yaml b/releasenotes/notes/openai-document-embedder-d2f59ba1f21babcb.yaml new file mode 100644 index 0000000000..eaf44199c6 --- /dev/null +++ b/releasenotes/notes/openai-document-embedder-d2f59ba1f21babcb.yaml @@ -0,0 +1,6 @@ +--- +preview: + - | + Add OpenAI Document Embedder. + It computes embeddings of Documents using OpenAI models. + The embedding of each Document is stored in the `embedding` field of the Document. diff --git a/test/preview/components/embedders/test_openai_document_embedder.py b/test/preview/components/embedders/test_openai_document_embedder.py new file mode 100644 index 0000000000..b6f3ea9b37 --- /dev/null +++ b/test/preview/components/embedders/test_openai_document_embedder.py @@ -0,0 +1,334 @@ +from unittest.mock import patch +import pytest +from typing import List +import numpy as np +import openai +from openai.util import convert_to_openai_object + +from haystack.preview import Document +from haystack.preview.components.embedders.openai_document_embedder import OpenAIDocumentEmbedder + + +def mock_openai_response( + input: List[str], model: str = "text-embedding-ada-002", **kwargs +) -> openai.openai_object.OpenAIObject: + dict_response = { + "object": "list", + "data": [ + {"object": "embedding", "index": i, "embedding": np.random.rand(1536).tolist()} for i in range(len(input)) + ], + "model": model, + "usage": {"prompt_tokens": 4, "total_tokens": 4}, + } + + return convert_to_openai_object(dict_response) + + +class TestOpenAIDocumentEmbedder: + @pytest.mark.unit + def test_init_default(self, monkeypatch): + monkeypatch.setenv("OPENAI_API_KEY", "fake-api-key") + embedder = OpenAIDocumentEmbedder() + + assert openai.api_key == "fake-api-key" + + assert embedder.model_name == "text-embedding-ada-002" + assert embedder.organization is None + assert embedder.prefix == "" + assert embedder.suffix == "" + assert embedder.batch_size == 32 + assert embedder.progress_bar is True + assert embedder.metadata_fields_to_embed == [] + assert embedder.embedding_separator == "\n" + + @pytest.mark.unit + def test_init_with_parameters(self): + embedder = OpenAIDocumentEmbedder( + api_key="fake-api-key", + model_name="model", + organization="my-org", + prefix="prefix", + suffix="suffix", + batch_size=64, + progress_bar=False, + metadata_fields_to_embed=["test_field"], + embedding_separator=" | ", + ) + assert openai.api_key == "fake-api-key" + assert openai.organization == "my-org" + + assert embedder.organization == "my-org" + assert embedder.model_name == "model" + assert embedder.prefix == "prefix" + assert embedder.suffix == "suffix" + assert embedder.batch_size == 64 + assert embedder.progress_bar is False + assert 
embedder.metadata_fields_to_embed == ["test_field"] + assert embedder.embedding_separator == " | " + + @pytest.mark.unit + def test_init_fail_wo_api_key(self, monkeypatch): + monkeypatch.delenv("OPENAI_API_KEY", raising=False) + with pytest.raises(ValueError, match="OpenAIDocumentEmbedder expects an OpenAI API key"): + OpenAIDocumentEmbedder() + + @pytest.mark.unit + def test_to_dict(self): + component = OpenAIDocumentEmbedder(api_key="fake-api-key") + data = component.to_dict() + assert data == { + "type": "OpenAIDocumentEmbedder", + "init_parameters": { + "model_name": "text-embedding-ada-002", + "organization": None, + "prefix": "", + "suffix": "", + "batch_size": 32, + "progress_bar": True, + "metadata_fields_to_embed": [], + "embedding_separator": "\n", + }, + } + + @pytest.mark.unit + def test_to_dict_with_custom_init_parameters(self): + component = OpenAIDocumentEmbedder( + api_key="fake-api-key", + model_name="model", + organization="my-org", + prefix="prefix", + suffix="suffix", + batch_size=64, + progress_bar=False, + metadata_fields_to_embed=["test_field"], + embedding_separator=" | ", + ) + data = component.to_dict() + assert data == { + "type": "OpenAIDocumentEmbedder", + "init_parameters": { + "model_name": "model", + "organization": "my-org", + "prefix": "prefix", + "suffix": "suffix", + "batch_size": 64, + "progress_bar": False, + "metadata_fields_to_embed": ["test_field"], + "embedding_separator": " | ", + }, + } + + @pytest.mark.unit + def test_from_dict(self, monkeypatch): + monkeypatch.setenv("OPENAI_API_KEY", "fake-api-key") + data = { + "type": "OpenAIDocumentEmbedder", + "init_parameters": { + "model_name": "model", + "organization": "my-org", + "prefix": "prefix", + "suffix": "suffix", + "batch_size": 64, + "progress_bar": False, + "metadata_fields_to_embed": ["test_field"], + "embedding_separator": " | ", + }, + } + component = OpenAIDocumentEmbedder.from_dict(data) + assert openai.api_key == "fake-api-key" + assert component.model_name == "model" + assert component.organization == "my-org" + assert openai.organization == "my-org" + assert component.prefix == "prefix" + assert component.suffix == "suffix" + assert component.batch_size == 64 + assert component.progress_bar is False + assert component.metadata_fields_to_embed == ["test_field"] + assert component.embedding_separator == " | " + + @pytest.mark.unit + def test_from_dict_fail_wo_env_var(self, monkeypatch): + monkeypatch.delenv("OPENAI_API_KEY", raising=False) + data = { + "type": "OpenAIDocumentEmbedder", + "init_parameters": { + "model_name": "model", + "organization": "my-org", + "prefix": "prefix", + "suffix": "suffix", + "batch_size": 64, + "progress_bar": False, + "metadata_fields_to_embed": ["test_field"], + "embedding_separator": " | ", + }, + } + with pytest.raises(ValueError, match="OpenAIDocumentEmbedder expects an OpenAI API key"): + OpenAIDocumentEmbedder.from_dict(data) + + @pytest.mark.unit + def test_prepare_texts_to_embed_w_metadata(self): + documents = [ + Document(text=f"document number {i}:\ncontent", metadata={"meta_field": f"meta_value {i}"}) + for i in range(5) + ] + + embedder = OpenAIDocumentEmbedder( + api_key="fake-api-key", metadata_fields_to_embed=["meta_field"], embedding_separator=" | " + ) + + prepared_texts = embedder._prepare_texts_to_embed(documents) + + # note that newline is replaced by space + assert prepared_texts == [ + "meta_value 0 | document number 0: content", + "meta_value 1 | document number 1: content", + "meta_value 2 | document number 2: content", + 
"meta_value 3 | document number 3: content", + "meta_value 4 | document number 4: content", + ] + + @pytest.mark.unit + def test_prepare_texts_to_embed_w_suffix(self): + documents = [Document(text=f"document number {i}") for i in range(5)] + + embedder = OpenAIDocumentEmbedder(api_key="fake-api-key", prefix="my_prefix ", suffix=" my_suffix") + + prepared_texts = embedder._prepare_texts_to_embed(documents) + + assert prepared_texts == [ + "my_prefix document number 0 my_suffix", + "my_prefix document number 1 my_suffix", + "my_prefix document number 2 my_suffix", + "my_prefix document number 3 my_suffix", + "my_prefix document number 4 my_suffix", + ] + + @pytest.mark.unit + def test_embed_batch(self): + texts = ["text 1", "text 2", "text 3", "text 4", "text 5"] + + with patch( + "haystack.preview.components.embedders.openai_document_embedder.openai.Embedding" + ) as openai_embedding_patch: + openai_embedding_patch.create.side_effect = mock_openai_response + embedder = OpenAIDocumentEmbedder(api_key="fake-api-key", model_name="model") + + embeddings, metadata = embedder._embed_batch(texts_to_embed=texts, batch_size=2) + + assert openai_embedding_patch.create.call_count == 3 + + assert isinstance(embeddings, list) + assert len(embeddings) == len(texts) + for embedding in embeddings: + assert isinstance(embedding, list) + assert len(embedding) == 1536 + assert all(isinstance(x, float) for x in embedding) + + # openai.Embedding.create is called 3 times + assert metadata == {"model": "model", "usage": {"prompt_tokens": 3 * 4, "total_tokens": 3 * 4}} + + @pytest.mark.unit + def test_run(self): + docs = [ + Document(text="I love cheese", metadata={"topic": "Cuisine"}), + Document(text="A transformer is a deep learning architecture", metadata={"topic": "ML"}), + ] + + model = "text-similarity-ada-001" + with patch( + "haystack.preview.components.embedders.openai_document_embedder.openai.Embedding" + ) as openai_embedding_patch: + openai_embedding_patch.create.side_effect = mock_openai_response + embedder = OpenAIDocumentEmbedder( + api_key="fake-api-key", + model_name=model, + prefix="prefix ", + suffix=" suffix", + metadata_fields_to_embed=["topic"], + embedding_separator=" | ", + ) + + result = embedder.run(documents=docs) + + openai_embedding_patch.create.assert_called_once_with( + model=model, + input=[ + "prefix Cuisine | I love cheese suffix", + "prefix ML | A transformer is a deep learning architecture suffix", + ], + ) + documents_with_embeddings = result["documents"] + metadata = result["metadata"] + + assert isinstance(documents_with_embeddings, list) + assert len(documents_with_embeddings) == len(docs) + for doc in documents_with_embeddings: + assert isinstance(doc, Document) + assert isinstance(doc.embedding, list) + assert len(doc.embedding) == 1536 + assert all(isinstance(x, float) for x in doc.embedding) + assert metadata == {"model": model, "usage": {"prompt_tokens": 4, "total_tokens": 4}} + + @pytest.mark.unit + def test_run_custom_batch_size(self): + docs = [ + Document(text="I love cheese", metadata={"topic": "Cuisine"}), + Document(text="A transformer is a deep learning architecture", metadata={"topic": "ML"}), + ] + + model = "text-similarity-ada-001" + with patch( + "haystack.preview.components.embedders.openai_document_embedder.openai.Embedding" + ) as openai_embedding_patch: + openai_embedding_patch.create.side_effect = mock_openai_response + embedder = OpenAIDocumentEmbedder( + api_key="fake-api-key", + model_name=model, + prefix="prefix ", + suffix=" suffix", + 
metadata_fields_to_embed=["topic"], + embedding_separator=" | ", + batch_size=1, + ) + + result = embedder.run(documents=docs) + + assert openai_embedding_patch.create.call_count == 2 + + documents_with_embeddings = result["documents"] + metadata = result["metadata"] + + assert isinstance(documents_with_embeddings, list) + assert len(documents_with_embeddings) == len(docs) + for doc in documents_with_embeddings: + assert isinstance(doc, Document) + assert isinstance(doc.embedding, list) + assert len(doc.embedding) == 1536 + assert all(isinstance(x, float) for x in doc.embedding) + + # openai.Embedding.create is called 2 times + assert metadata == {"model": model, "usage": {"prompt_tokens": 2 * 4, "total_tokens": 2 * 4}} + + @pytest.mark.unit + def test_run_wrong_input_format(self): + embedder = OpenAIDocumentEmbedder(api_key="fake-api-key") + + # wrong formats + string_input = "text" + list_integers_input = [1, 2, 3] + + with pytest.raises(TypeError, match="OpenAIDocumentEmbedder expects a list of Documents as input"): + embedder.run(documents=string_input) + + with pytest.raises(TypeError, match="OpenAIDocumentEmbedder expects a list of Documents as input"): + embedder.run(documents=list_integers_input) + + @pytest.mark.unit + def test_run_on_empty_list(self): + embedder = OpenAIDocumentEmbedder(api_key="fake-api-key") + + empty_list_input = [] + result = embedder.run(documents=empty_list_input) + + assert result["documents"] is not None + assert not result["documents"] # empty list From dfa48eece90938e25e4759435d48faa7f1e5f857 Mon Sep 17 00:00:00 2001 From: Massimiliano Pippi Date: Thu, 28 Sep 2023 15:49:19 +0200 Subject: [PATCH 03/33] clean up the Slack integrations (#5908) --- .github/workflows/examples_tests.yml | 2 -- .github/workflows/pypi_release.yml | 2 +- .github/workflows/rest_api_tests.yml | 2 -- .github/workflows/tests.yml | 2 -- 4 files changed, 1 insertion(+), 7 deletions(-) diff --git a/.github/workflows/examples_tests.yml b/.github/workflows/examples_tests.yml index 241306571d..ac1acb0951 100644 --- a/.github/workflows/examples_tests.yml +++ b/.github/workflows/examples_tests.yml @@ -15,8 +15,6 @@ on: - ready_for_review env: - SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL }} - SLACK_WEBHOOK_TYPE: INCOMING_WEBHOOK OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} COHERE_API_KEY: ${{ secrets.COHERE_API_KEY }} ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} diff --git a/.github/workflows/pypi_release.yml b/.github/workflows/pypi_release.yml index ee52541410..3118198e65 100644 --- a/.github/workflows/pypi_release.yml +++ b/.github/workflows/pypi_release.yml @@ -33,5 +33,5 @@ jobs: uses: act10ns/slack@v2 with: status: ${{ job.status }} - channel: "#haystack" + channel: "#haystack-notifications" config: .github/config/pypi-release-slack-notification.yml diff --git a/.github/workflows/rest_api_tests.yml b/.github/workflows/rest_api_tests.yml index 33e8a5ff7a..e3fc5a5971 100644 --- a/.github/workflows/rest_api_tests.yml +++ b/.github/workflows/rest_api_tests.yml @@ -18,8 +18,6 @@ on: - "rest_api/pyproject.toml" env: - SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL }} - SLACK_WEBHOOK_TYPE: INCOMING_WEBHOOK PYTHON_VERSION: "3.8" jobs: diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 98845da2d4..35ce7995cc 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -22,8 +22,6 @@ on: - "!docs/**/*.py" env: - SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL }} - SLACK_WEBHOOK_TYPE: INCOMING_WEBHOOK OPENAI_API_KEY: ${{ 
secrets.OPENAI_API_KEY }} COHERE_API_KEY: ${{ secrets.COHERE_API_KEY }} PYTHON_VERSION: "3.8" From e882a7d5c812befc2b74798c10c4b9212d4beba1 Mon Sep 17 00:00:00 2001 From: Vladimir Blagojevic Date: Thu, 28 Sep 2023 17:22:28 +0200 Subject: [PATCH 04/33] feat: Add HTMLToDocument component (v2) (#5907) --- .../components/file_converters/__init__.py | 10 +- .../components/file_converters/html.py | 70 + ...add-html-to-document-21fe38b244388f4d.yaml | 4 + .../file_converters/test_html_to_document.py | 63 + .../test_files/html/what_is_haystack.html | 1634 +++++++++++++++++ 5 files changed, 1780 insertions(+), 1 deletion(-) create mode 100644 haystack/preview/components/file_converters/html.py create mode 100644 releasenotes/notes/add-html-to-document-21fe38b244388f4d.yaml create mode 100644 test/preview/components/file_converters/test_html_to_document.py create mode 100644 test/preview/test_files/html/what_is_haystack.html diff --git a/haystack/preview/components/file_converters/__init__.py b/haystack/preview/components/file_converters/__init__.py index 08f7a46132..ac75560757 100644 --- a/haystack/preview/components/file_converters/__init__.py +++ b/haystack/preview/components/file_converters/__init__.py @@ -1,5 +1,13 @@ from haystack.preview.components.file_converters.txt import TextFileToDocument from haystack.preview.components.file_converters.tika import TikaDocumentConverter from haystack.preview.components.file_converters.azure import AzureOCRDocumentConverter +from haystack.preview.components.file_converters.pypdf import PyPDFToDocument +from haystack.preview.components.file_converters.html import HTMLToDocument -__all__ = ["TextFileToDocument", "TikaDocumentConverter", "AzureOCRDocumentConverter"] +__all__ = [ + "TextFileToDocument", + "TikaDocumentConverter", + "AzureOCRDocumentConverter", + "PyPDFToDocument", + "HTMLToDocument", +] diff --git a/haystack/preview/components/file_converters/html.py b/haystack/preview/components/file_converters/html.py new file mode 100644 index 0000000000..3cb7c03377 --- /dev/null +++ b/haystack/preview/components/file_converters/html.py @@ -0,0 +1,70 @@ +import logging +from typing import List, Optional, Dict, Any, Union +from pathlib import Path + +from haystack.preview.lazy_imports import LazyImport +from haystack.preview import Document, component, default_to_dict, default_from_dict + +with LazyImport("Run 'pip install boilerpy3'") as boilerpy3_import: + from boilerpy3 import extractors + + +logger = logging.getLogger(__name__) + + +@component +class HTMLToDocument: + """ + A component for converting an HTML file to a Document. + """ + + def __init__(self, id_hash_keys: Optional[List[str]] = None): + """ + Create a HTMLToDocument component. + + :param id_hash_keys: Generate the Document ID from a custom list of strings that refer to the Document's + attributes. Default: `None` + """ + boilerpy3_import.check() + self.id_hash_keys = id_hash_keys or [] + + def to_dict(self) -> Dict[str, Any]: + """ + Serialize this component to a dictionary. + """ + return default_to_dict(self, id_hash_keys=self.id_hash_keys) + + @classmethod + def from_dict(cls, data: Dict[str, Any]) -> "HTMLToDocument": + """ + Deserialize this component from a dictionary. + """ + return default_from_dict(cls, data) + + @component.output_types(documents=List[Document]) + def run(self, paths: List[Union[str, Path]]): + """ + Convert HTML files to Documents. + + :param paths: A list of paths to HTML files. + :return: A list of Documents. 
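+
+        Usage example (an illustrative sketch; the path below is a placeholder for any local HTML file):
+
+            from haystack.preview.components.file_converters import HTMLToDocument
+
+            converter = HTMLToDocument()
+            results = converter.run(paths=["what_is_haystack.html"])
+            documents = results["documents"]  # one Document per successfully converted file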
+ """ + documents = [] + extractor = extractors.ArticleExtractor(raise_on_failure=False) + for path in paths: + try: + file_content = extractor.read_from_file(path) + except Exception as e: + logger.warning("Could not read file %s. Skipping it. Error message: %s", path, e) + continue + # although raise_on_failure is set to False, the extractor can still raise an exception + try: + text = extractor.get_content(file_content) + except Exception as conversion_e: + logger.warning("Could not extract raw txt from %s. Skipping it. Error message: %s", path, conversion_e) + continue + + document = Document(text=text, id_hash_keys=self.id_hash_keys) + documents.append(document) + + return {"documents": documents} diff --git a/releasenotes/notes/add-html-to-document-21fe38b244388f4d.yaml b/releasenotes/notes/add-html-to-document-21fe38b244388f4d.yaml new file mode 100644 index 0000000000..1ff9b25b6b --- /dev/null +++ b/releasenotes/notes/add-html-to-document-21fe38b244388f4d.yaml @@ -0,0 +1,4 @@ +--- +preview: + - | + Adds HTMLToDocument component to convert HTML to a Document. diff --git a/test/preview/components/file_converters/test_html_to_document.py b/test/preview/components/file_converters/test_html_to_document.py new file mode 100644 index 0000000000..b46f841282 --- /dev/null +++ b/test/preview/components/file_converters/test_html_to_document.py @@ -0,0 +1,63 @@ +import logging + +import pytest + +from haystack.preview.components.file_converters import HTMLToDocument + + +class TestHTMLToDocument: + @pytest.mark.unit + def test_to_dict(self): + component = HTMLToDocument() + data = component.to_dict() + assert data == {"type": "HTMLToDocument", "init_parameters": {"id_hash_keys": []}} + + @pytest.mark.unit + def test_to_dict_with_custom_init_parameters(self): + component = HTMLToDocument(id_hash_keys=["name"]) + data = component.to_dict() + assert data == {"type": "HTMLToDocument", "init_parameters": {"id_hash_keys": ["name"]}} + + @pytest.mark.unit + def test_from_dict(self): + data = {"type": "HTMLToDocument", "init_parameters": {"id_hash_keys": ["name"]}} + component = HTMLToDocument.from_dict(data) + assert component.id_hash_keys == ["name"] + + @pytest.mark.unit + def test_run(self, preview_samples_path): + """ + Test if the component runs correctly. + """ + paths = [preview_samples_path / "html" / "what_is_haystack.html"] + converter = HTMLToDocument() + output = converter.run(paths=paths) + docs = output["documents"] + assert len(docs) == 1 + assert "Haystack" in docs[0].text + + @pytest.mark.unit + def test_run_wrong_file_type(self, preview_samples_path, caplog): + """ + Test if the component runs correctly when an input file is not of the expected type. + """ + paths = [preview_samples_path / "audio" / "answer.wav"] + converter = HTMLToDocument() + with caplog.at_level(logging.WARNING): + output = converter.run(paths=paths) + assert "codec can't decode byte" in caplog.text + + docs = output["documents"] + assert docs == [] + + @pytest.mark.unit + def test_run_error_handling(self, preview_samples_path, caplog): + """ + Test if the component correctly handles errors. 
+ """ + paths = ["non_existing_file.html"] + converter = HTMLToDocument() + with caplog.at_level(logging.WARNING): + result = converter.run(paths=paths) + assert "Could not read file non_existing_file.html" in caplog.text + assert result["documents"] == [] diff --git a/test/preview/test_files/html/what_is_haystack.html b/test/preview/test_files/html/what_is_haystack.html new file mode 100644 index 0000000000..2d62b206c0 --- /dev/null +++ b/test/preview/test_files/html/what_is_haystack.html @@ -0,0 +1,1634 @@ + + + + + + + + + + What is Haystack? | Haystack + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
What is Haystack?

Haystack is the open source Python framework by deepset for building custom apps with large language models (LLMs). It lets you quickly try out the latest models in natural language processing (NLP) while being flexible and easy to use. Our inspiring community of users and builders has helped shape Haystack into what it is today: a complete framework for building production-ready NLP apps.

Building with Haystack

Haystack offers comprehensive tooling for developing state-of-the-art NLP systems that use LLMs (such as GPT-4, Falcon and similar) and Transformer models. With Haystack, you can effortlessly experiment with various models hosted on platforms like Hugging Face, OpenAI, Cohere, or even models deployed on SageMaker and your local models to find the perfect fit for your use case.

[Image: Model Providers]

Some examples of what you can build include:

  • Semantic search on a large collection of documents in any language
  • Generative question answering on a knowledge base containing mixed types of information: images, text, and tables.
  • Natural language chatbots powered by cutting-edge generative models like GPT-4
  • An LLM-based Haystack Agent capable of resolving complex queries
  • Information extraction from documents to populate your database or build a knowledge graph

This is just a small subset of the kinds of systems that can be created in Haystack.

Functionality for all stages of an NLP project

A successful NLP project requires more than just the language models. As an end-to-end framework, Haystack assists you in building your system every step of the way, offering tooling for each stage of the NLP project life cycle:

But that’s not all: metadata filtering, model distillation, or the prompt hub, whatever your NLP heart desires, you’re likely to find it in Haystack. And if not? We’ll build it together.

[Image: Rest API]

Building blocks

Haystack uses a few simple but effective concepts to help you build fully functional and customized end-to-end NLP systems.

Components

At the core of Haystack are its components—fundamental building blocks that can perform tasks like document retrieval, text generation, or summarization. A single component is already quite powerful. It can manage local language models or communicate with a hosted model through an API.

While Haystack offers a bunch of components you can use out of the box, it also lets you create your own custom components. Explore the collection of integrations that includes custom components developed by our community, which you can freely use.

You can chain components together to build pipelines, which are the foundation of the NLP app architecture in Haystack.

Pipelines

Pipelines are powerful structures made up of components, such as a Retriever and Reader, connected to infrastructure building blocks, such as a DocumentStore (for example, Elasticsearch or Weaviate) to form complex systems.

Haystack offers ready-made pipelines for most common tasks, such as question answering, document retrieval, or summarization. But it’s just as easy to design and create a custom pipeline for NLP scenarios that are way more complex than question answering.

Agents

The Haystack Agent makes use of a large language model to resolve complex tasks. When initializing the Agent, you give it a set of tools, which can be pipeline components or whole pipelines. The Agent can use those tools iteratively to arrive at an answer. When given a query, the Agent determines which tools are useful to answer this query and calls them in a loop until it gets the answer. This way, it can achieve much more than extractive or generative question answering pipelines.

[Image: Agent Tools]

Who’s it for?

Haystack is for everyone looking to build natural language apps—NLP enthusiasts and newbies alike. You don’t need to understand how the models work under the hood. With Haystack’s modular and flexible components, pipelines, and agents, all you need is some basic knowledge of Python to dive right in.

Our community

At the heart of Haystack is the vibrant open source community that thrives on the diverse backgrounds and skill sets of its members. We value collaboration greatly and encourage our users to shape Haystack actively through GitHub contributions. Our Discord channel is a space where community members can connect, seek help, and learn from each other.

We also organize live online and in-person events, webinars, and office hours, which are an opportunity to learn and grow.

Enter the Haystack universe
+ + + + + + + + + + + + + + + + + + + + + + From 578f2b4bbfb51c0e77e891b1f52ecd739713aae1 Mon Sep 17 00:00:00 2001 From: ZanSara Date: Thu, 28 Sep 2023 17:50:46 +0200 Subject: [PATCH 05/33] feat: update canals to 0.8.1 (#5900) * Update canals to 0.8.1 * scale up runner --- .github/workflows/license_compliance.yml | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/license_compliance.yml b/.github/workflows/license_compliance.yml index 0c5e504660..b3e0aba19a 100644 --- a/.github/workflows/license_compliance.yml +++ b/.github/workflows/license_compliance.yml @@ -171,7 +171,7 @@ jobs: name: All extras env: REQUIREMENTS_FILE: requirements_all.txt - runs-on: ubuntu-latest + runs-on: ubuntu-latest-4-cores steps: - name: Checkout the code uses: actions/checkout@v4 diff --git a/pyproject.toml b/pyproject.toml index 601e2f385c..a721ee9147 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -86,7 +86,7 @@ dependencies = [ [project.optional-dependencies] preview = [ - "canals==0.8.0", + "canals==0.8.1", "requests", "pandas", "rank_bm25", From 0947f59545a23fcd2ebdab07255b4d81b25a734c Mon Sep 17 00:00:00 2001 From: Massimiliano Pippi Date: Fri, 29 Sep 2023 08:40:01 +0200 Subject: [PATCH 06/33] feat: add async PromptNode run (#5890) * add async promptnode * Remove unecessary calls to dict.keys() --------- Co-authored-by: Silvano Cerza Co-authored-by: Silvano Cerza <3314350+silvanocerza@users.noreply.github.com> --- haystack/nodes/prompt/prompt_model.py | 10 ++ haystack/nodes/prompt/prompt_node.py | 120 +++++++++++++++--- .../add-promptnode-arun-bc4c2bcc9c653015.yaml | 4 + test/prompt/test_prompt_model.py | 24 +++- test/prompt/test_prompt_node.py | 85 ++++++++++++- 5 files changed, 222 insertions(+), 21 deletions(-) create mode 100644 releasenotes/notes/add-promptnode-arun-bc4c2bcc9c653015.yaml diff --git a/haystack/nodes/prompt/prompt_model.py b/haystack/nodes/prompt/prompt_model.py index f8a406004e..c4c9030897 100644 --- a/haystack/nodes/prompt/prompt_model.py +++ b/haystack/nodes/prompt/prompt_model.py @@ -111,6 +111,16 @@ def invoke(self, prompt: Union[str, List[str], List[Dict[str, str]]], **kwargs) output = self.model_invocation_layer.invoke(prompt=prompt, **kwargs) return output + async def ainvoke(self, prompt: Union[str, List[str], List[Dict[str, str]]], **kwargs) -> List[str]: + """ + Drop-in replacement asyncio version of the `invoke` method, see there for documentation. + """ + try: + return await self.model_invocation_layer.invoke(prompt=prompt, **kwargs) + except TypeError: + # The `invoke` method of the underlying invocation layer doesn't support asyncio + return self.model_invocation_layer.invoke(prompt=prompt, **kwargs) + @overload def _ensure_token_limit(self, prompt: str) -> str: ... diff --git a/haystack/nodes/prompt/prompt_node.py b/haystack/nodes/prompt/prompt_node.py index 060bec69c5..2360d5b44c 100644 --- a/haystack/nodes/prompt/prompt_node.py +++ b/haystack/nodes/prompt/prompt_node.py @@ -232,6 +232,37 @@ def prompt_template_params(self, prompt_template: str) -> List[str]: return list(template.prompt_params) return [] + def _prepare( # type: ignore + self, query, file_paths, labels, documents, meta, invocation_context, prompt_template, generation_kwargs + ) -> Dict: + """ + Prepare prompt invocation. 
+ """ + invocation_context = invocation_context or {} + + if query and "query" not in invocation_context: + invocation_context["query"] = query + + if file_paths and "file_paths" not in invocation_context: + invocation_context["file_paths"] = file_paths + + if labels and "labels" not in invocation_context: + invocation_context["labels"] = labels + + if documents and "documents" not in invocation_context: + invocation_context["documents"] = documents + + if meta and "meta" not in invocation_context: + invocation_context["meta"] = meta + + if "prompt_template" not in invocation_context: + invocation_context["prompt_template"] = self.get_prompt_template(prompt_template) + + if generation_kwargs: + invocation_context.update(generation_kwargs) + + return invocation_context + def run( self, query: Optional[str] = None, @@ -272,29 +303,86 @@ def run( # so that they can be returned by `run()` as part of the pipeline's debug output. prompt_collector: List[str] = [] - invocation_context = invocation_context or {} - if query and "query" not in invocation_context.keys(): - invocation_context["query"] = query + invocation_context = self._prepare( + query, file_paths, labels, documents, meta, invocation_context, prompt_template, generation_kwargs + ) - if file_paths and "file_paths" not in invocation_context.keys(): - invocation_context["file_paths"] = file_paths + results = self(**invocation_context, prompt_collector=prompt_collector) - if labels and "labels" not in invocation_context.keys(): - invocation_context["labels"] = labels + prompt_template_resolved: PromptTemplate = invocation_context.pop("prompt_template") - if documents and "documents" not in invocation_context.keys(): - invocation_context["documents"] = documents + try: + output_variable = self.output_variable or prompt_template_resolved.output_variable or "results" + except: + output_variable = "results" - if meta and "meta" not in invocation_context.keys(): - invocation_context["meta"] = meta + invocation_context[output_variable] = results + invocation_context["prompts"] = prompt_collector + final_result: Dict[str, Any] = {output_variable: results, "invocation_context": invocation_context} - if "prompt_template" not in invocation_context.keys(): - invocation_context["prompt_template"] = self.get_prompt_template(prompt_template) + if self.debug: + final_result["_debug"] = {"prompts_used": prompt_collector} - if generation_kwargs: - invocation_context.update(generation_kwargs) + return final_result, "output_1" + + async def _aprompt(self, prompt_template: Optional[Union[str, PromptTemplate]], *args, **kwargs): + """ + Async version of the actual prompt invocation. 
+ """ + results = [] + # we pop the prompt_collector kwarg to avoid passing it to the model + prompt_collector: List[Union[str, List[Dict[str, str]]]] = kwargs.pop("prompt_collector", []) + + # kwargs override model kwargs + kwargs = {**self._prepare_model_kwargs(), **kwargs} + template_to_fill = self.get_prompt_template(prompt_template) + if template_to_fill: + # prompt template used, yield prompts from inputs args + for prompt in template_to_fill.fill(*args, **kwargs): + kwargs_copy = template_to_fill.remove_template_params(copy.copy(kwargs)) + # and pass the prepared prompt and kwargs copy to the model + prompt = self.prompt_model._ensure_token_limit(prompt) + prompt_collector.append(prompt) + logger.debug("Prompt being sent to LLM with prompt %s and kwargs %s", prompt, kwargs_copy) + output = await self.prompt_model.ainvoke(prompt, **kwargs_copy) + results.extend(output) + + kwargs["prompts"] = prompt_collector + results = template_to_fill.post_process(results, **kwargs) + else: + # straightforward prompt, no templates used + for prompt in list(args): + kwargs_copy = copy.copy(kwargs) + prompt = self.prompt_model._ensure_token_limit(prompt) + prompt_collector.append(prompt) + logger.debug("Prompt being sent to LLM with prompt %s and kwargs %s ", prompt, kwargs_copy) + output = await self.prompt_model.ainvoke(prompt, **kwargs_copy) + results.extend(output) + return results + + async def arun( + self, + query: Optional[str] = None, + file_paths: Optional[List[str]] = None, + labels: Optional[MultiLabel] = None, + documents: Optional[List[Document]] = None, + meta: Optional[dict] = None, + invocation_context: Optional[Dict[str, Any]] = None, + prompt_template: Optional[Union[str, PromptTemplate]] = None, + generation_kwargs: Optional[Dict[str, Any]] = None, + ) -> Tuple[Dict, str]: + """ + Drop-in replacement asyncio version of the `run` method, see there for documentation. + """ + prompt_collector: List[str] = [] + + invocation_context = self._prepare( + query, file_paths, labels, documents, meta, invocation_context, prompt_template, generation_kwargs + ) - results = self(prompt_collector=prompt_collector, **invocation_context) + # Let's skip the call to __call__, because all it does is injecting a prompt template + # if there isn't any, while we know for sure it'll be in `invocation_context`. + results = await self._aprompt(prompt_collector=prompt_collector, **invocation_context) prompt_template_resolved: PromptTemplate = invocation_context.pop("prompt_template") diff --git a/releasenotes/notes/add-promptnode-arun-bc4c2bcc9c653015.yaml b/releasenotes/notes/add-promptnode-arun-bc4c2bcc9c653015.yaml new file mode 100644 index 0000000000..e9e3305eab --- /dev/null +++ b/releasenotes/notes/add-promptnode-arun-bc4c2bcc9c653015.yaml @@ -0,0 +1,4 @@ +--- +enhancements: + - | + PromptNode can now be run asynchronously by calling the `arun` method. 
diff --git a/test/prompt/test_prompt_model.py b/test/prompt/test_prompt_model.py index 1ac1702a1d..9e7c2b0f51 100644 --- a/test/prompt/test_prompt_model.py +++ b/test/prompt/test_prompt_model.py @@ -1,4 +1,5 @@ -from unittest.mock import patch, Mock +import asyncio +from unittest.mock import patch, MagicMock import pytest @@ -36,3 +37,24 @@ def test_construtor_with_custom_model(): def test_constructor_with_no_supported_model(): with pytest.raises(ValueError, match="Model some-random-model is not supported"): PromptModel("some-random-model") + + +@pytest.mark.asyncio +async def test_ainvoke(): + def async_return(result): + f = asyncio.Future() + f.set_result(result) + return f + + mock_layer = MagicMock() # no async-defined methods, await will fail and fall back to regular `invoke` + mock_layer.return_value.invoke.return_value = async_return("Async Bar!") + model = PromptModel(invocation_layer_class=mock_layer) + assert await model.ainvoke("Foo") == "Async Bar!" + + +@pytest.mark.asyncio +async def test_ainvoke_falls_back_to_sync(): + mock_layer = MagicMock() # no async-defined methods, await will fail and fall back to regular `invoke` + mock_layer.return_value.invoke.return_value = "Bar!" + model = PromptModel(invocation_layer_class=mock_layer) + assert await model.ainvoke("Foo") == "Bar!" diff --git a/test/prompt/test_prompt_node.py b/test/prompt/test_prompt_node.py index fe9d034f1d..c0ae8480e4 100644 --- a/test/prompt/test_prompt_node.py +++ b/test/prompt/test_prompt_node.py @@ -1,18 +1,15 @@ import os import logging from typing import Optional, Union, List, Dict, Any, Tuple -from unittest.mock import patch, Mock, MagicMock +from unittest.mock import patch, Mock, MagicMock, AsyncMock import pytest from prompthub import Prompt -from transformers import GenerationConfig, TextStreamer from haystack import Document, Pipeline, BaseComponent, MultiLabel from haystack.nodes.prompt import PromptTemplate, PromptNode, PromptModel from haystack.nodes.prompt.prompt_template import LEGACY_DEFAULT_TEMPLATES from haystack.nodes.prompt.invocation_layer import ( - HFLocalInvocationLayer, - DefaultTokenStreamingHandler, AzureChatGPTInvocationLayer, AzureOpenAIInvocationLayer, OpenAIInvocationLayer, @@ -1098,3 +1095,83 @@ def test_prompt_node_warns_about_missing_documents(mock_model, caplog): "Expected prompt parameter 'documents' to be provided but it is missing. " "Continuing with an empty list of documents." 
in caplog.text ) + + +@pytest.mark.unit +@patch("haystack.nodes.prompt.prompt_node.PromptModel") +def test__prepare_invocation_context_is_empty(mock_model): + node = PromptNode() + node.get_prompt_template = MagicMock(return_value="Test Template") + + kwargs = { + "query": "query", + "file_paths": ["foo", "bar"], + "labels": ["label", "another"], + "documents": ["A", "B"], + "meta": {"meta_key": "meta_value"}, + "prompt_template": "my-test-prompt", + "invocation_context": None, + "generation_kwargs": {"gen_key": "gen_value"}, + } + + invocation_context = node._prepare(**kwargs) + + node.get_prompt_template.assert_called_once_with("my-test-prompt") + assert invocation_context == { + "query": "query", + "file_paths": ["foo", "bar"], + "labels": ["label", "another"], + "documents": ["A", "B"], + "meta": {"meta_key": "meta_value"}, + "prompt_template": "Test Template", + "gen_key": "gen_value", + } + + +@pytest.mark.unit +@patch("haystack.nodes.prompt.prompt_node.PromptModel") +def test__prepare_invocation_context_was_passed(mock_model): + node = PromptNode() + + # Test invocation_context is left untouched + invocation_context = { + "query": "query", + "file_paths": ["foo", "bar"], + "labels": ["label", "another"], + "documents": ["A", "B"], + "meta": {"meta_key": "meta_value"}, + "prompt_template": "my-test-prompt", + "invocation_context": None, + } + kwargs = { + "query": None, + "file_paths": None, + "labels": None, + "documents": None, + "meta": None, + "prompt_template": None, + "invocation_context": invocation_context, + "generation_kwargs": None, + } + + assert node._prepare(**kwargs) == invocation_context + + +@pytest.mark.unit +@pytest.mark.asyncio +@patch("haystack.nodes.prompt.prompt_node.PromptModel") +async def test_arun(mock_model): + node = PromptNode() + node._aprompt = AsyncMock() + await node.arun("a query") + node._aprompt.assert_awaited_once_with(prompt_collector=[], query="a query", prompt_template=None) + + +@pytest.mark.unit +@pytest.mark.asyncio +@patch("haystack.nodes.prompt.prompt_node.PromptModel") +async def test_aprompt(mock_model): + node = PromptNode() + mock_model.return_value.ainvoke = AsyncMock() + await node._aprompt(PromptTemplate("test template")) + mock_model.return_value.ainvoke.assert_awaited_once() From d61df24b27602249d69bb129750b3aad119ee0a7 Mon Sep 17 00:00:00 2001 From: bogdankostic Date: Fri, 29 Sep 2023 10:38:33 +0200 Subject: [PATCH 07/33] chore: Remove classifiers directory from preview package (#5918) --- haystack/preview/components/classifiers/__init__.py | 3 --- 1 file changed, 3 deletions(-) delete mode 100644 haystack/preview/components/classifiers/__init__.py diff --git a/haystack/preview/components/classifiers/__init__.py b/haystack/preview/components/classifiers/__init__.py deleted file mode 100644 index 208ec92cbe..0000000000 --- a/haystack/preview/components/classifiers/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from haystack.preview.components.classifiers.file_classifier import FileExtensionClassifier - -__all__ = ["FileExtensionClassifier"] From 81b2e83d04dc9e695d2bd02c5031c20fb53bf841 Mon Sep 17 00:00:00 2001 From: ZanSara Date: Fri, 29 Sep 2023 13:16:08 +0200 Subject: [PATCH 08/33] feat: separate out `preview` tests (#5639) * add preview workflows * feedback * feedback * use preview extra * remove coverage and add separate e2e * rename workflow file for consistency * trigger ci * undo trigger * torch import in testing * add deps to unit tests * feedback * run container instead of service * comment * add if statement * fix tika 
version * separate out win integration tests * separate out all CIs * try installing docker on macos * exclude tika * remove tika docker --- .github/workflows/ci_metrics.yml | 2 + .github/workflows/e2e.yml | 1 + .github/workflows/e2e_preview.yml | 42 +++ .github/workflows/tests.yml | 2 + .github/workflows/tests_preview.yml | 324 ++++++++++++++++++++ .github/workflows/tests_preview_skipper.yml | 21 ++ .github/workflows/tests_skipper.yml | 2 + haystack/preview/testing/test_utils.py | 22 +- 8 files changed, 410 insertions(+), 6 deletions(-) create mode 100644 .github/workflows/e2e_preview.yml create mode 100644 .github/workflows/tests_preview.yml create mode 100644 .github/workflows/tests_preview_skipper.yml diff --git a/.github/workflows/ci_metrics.yml b/.github/workflows/ci_metrics.yml index 688f3b9e1b..31a329e334 100644 --- a/.github/workflows/ci_metrics.yml +++ b/.github/workflows/ci_metrics.yml @@ -4,8 +4,10 @@ on: workflow_run: workflows: - "end-to-end" + - "end-to-end (Preview)" - "Linting" - "Tests" + - "Tests (Preview)" - "REST API Tests" types: - completed diff --git a/.github/workflows/e2e.yml b/.github/workflows/e2e.yml index 188fd02c2a..b4fcfc1444 100644 --- a/.github/workflows/e2e.yml +++ b/.github/workflows/e2e.yml @@ -13,6 +13,7 @@ on: - ready_for_review paths: - "e2e/**/*.py" + - "!e2e/preview/**/*.py" # See e2e_preview.yml - ".github/workflows/e2e.yml" env: diff --git a/.github/workflows/e2e_preview.yml b/.github/workflows/e2e_preview.yml new file mode 100644 index 0000000000..aa54f8963e --- /dev/null +++ b/.github/workflows/e2e_preview.yml @@ -0,0 +1,42 @@ +# If you change this name also do it in ci_metrics.yml +name: end-to-end (Preview) + +on: + workflow_dispatch: # Activate this workflow manually + schedule: + - cron: "0 0 * * *" + pull_request: + types: + - opened + - reopened + - synchronize + - ready_for_review + paths: + - "e2e/preview/**/*.py" + - ".github/workflows/e2e_preview.yml" + +env: + PYTHON_VERSION: "3.8" + OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} + +jobs: + run: + timeout-minutes: 60 + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - uses: actions/setup-python@v4 + with: + python-version: ${{ env.PYTHON_VERSION }} + + - name: Install dependencies + run: | + sudo apt-get update + sudo apt install ffmpeg # for local Whisper tests + + - name: Install Haystack + run: pip install .[dev,preview] langdetect transformers[torch,sentencepiece]==4.32.1 'sentence-transformers>=2.2.0' pypdf openai-whisper tika 'azure-ai-formrecognizer>=3.2.0b2' + + - name: Run tests + run: pytest e2e/preview diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 35ce7995cc..4ecdd1fe85 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -17,6 +17,8 @@ on: paths: - "**.py" - "pyproject.toml" + - "!haystack/preview/**/*.py" # See tests_preview.yml + - "!test/preview/**/*.py" # See tests_preview.yml - "!.github/**/*.py" - "!rest_api/**/*.py" - "!docs/**/*.py" diff --git a/.github/workflows/tests_preview.yml b/.github/workflows/tests_preview.yml new file mode 100644 index 0000000000..26babf01ed --- /dev/null +++ b/.github/workflows/tests_preview.yml @@ -0,0 +1,324 @@ +# If you change this name also do it in tests_preview_skipper.yml +name: Tests (Preview) + +on: + workflow_dispatch: # Activate this workflow manually + push: + branches: + - main + # release branches have the form v1.9.x + - "v[0-9].*[0-9].x" + pull_request: + types: + - opened + - reopened + - synchronize + - ready_for_review + paths: + - 
"haystack/preview/**/*.py" + - "test/preview/**/*.py" + +env: + OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} + COHERE_API_KEY: ${{ secrets.COHERE_API_KEY }} + CORE_AZURE_CS_ENDPOINT: ${{ secrets.CORE_AZURE_CS_ENDPOINT }} + CORE_AZURE_CS_API_KEY: ${{ secrets.CORE_AZURE_CS_API_KEY }} + PYTHON_VERSION: "3.8" + +jobs: + black: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + + - uses: actions/setup-python@v4 + with: + python-version: ${{ env.PYTHON_VERSION }} + + - name: Install Black + run: | + pip install --upgrade pip + pip install .[formatting] + + - name: Check status + run: | + if ! black . --check; then + git status + echo "###################################################################################################" + echo "# " + echo "# CHECK FAILED! Black found issues with your code formatting." + echo "# " + echo "# Either:" + echo "# 1. Run Black locally before committing:" + echo "# " + echo "# pip install .[formatting]" + echo "# black ." + echo "# " + echo "# 2. Install the pre-commit hook:" + echo "# " + echo "# pre-commit install" + echo "# " + echo "# 3. See https://github.com/deepset-ai/haystack/blob/main/CONTRIBUTING.md for help." + echo "# " + echo "# If you have further problems, please open an issue: https://github.com/deepset-ai/haystack/issues" + echo "# " + echo "##################################################################################################" + exit 1 + fi + + - name: Calculate alert data + id: calculator + shell: bash + if: (success() || failure()) && github.ref_name == 'main' + run: | + if [ "${{ job.status }}" = "success" ]; then + echo "alert_type=success" >> "$GITHUB_OUTPUT"; + else + echo "alert_type=error" >> "$GITHUB_OUTPUT"; + fi + + - name: Send event to Datadog + if: (success() || failure()) && github.ref_name == 'main' + uses: masci/datadog@v1 + with: + api-key: ${{ secrets.CORE_DATADOG_API_KEY }} + api-url: https://api.datadoghq.eu + events: | + - title: "${{ github.workflow }} workflow" + text: "Job ${{ github.job }} in branch ${{ github.ref_name }}" + alert_type: "${{ steps.calculator.outputs.alert_type }}" + source_type_name: "Github" + host: ${{ github.repository_owner }} + tags: + - "project:${{ github.repository }}" + - "job:${{ github.job }}" + - "run_id:${{ github.run_id }}" + - "workflow:${{ github.workflow }}" + - "branch:${{ github.ref_name }}" + - "url:https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}" + + unit-tests: + name: Unit / ${{ matrix.os }} + needs: black + strategy: + fail-fast: false + matrix: + os: + - ubuntu-latest + - windows-latest + - macos-latest + runs-on: ${{ matrix.os }} + steps: + - uses: actions/checkout@v3 + + - uses: actions/setup-python@v4 + with: + python-version: ${{ env.PYTHON_VERSION }} + + - name: Install Haystack + run: pip install .[dev,preview] langdetect transformers[torch,sentencepiece]==4.32.1 'sentence-transformers>=2.2.0' pypdf openai-whisper tika 'azure-ai-formrecognizer>=3.2.0b2' + + - name: Run + run: pytest -m "unit" test/preview + + - name: Calculate alert data + id: calculator + shell: bash + if: (success() || failure()) && github.ref_name == 'main' + run: | + if [ "${{ job.status }}" = "success" ]; then + echo "alert_type=success" >> "$GITHUB_OUTPUT"; + else + echo "alert_type=error" >> "$GITHUB_OUTPUT"; + fi + + - name: Send event to Datadog + if: (success() || failure()) && github.ref_name == 'main' + uses: masci/datadog@v1 + with: + api-key: ${{ secrets.CORE_DATADOG_API_KEY }} + api-url: https://api.datadoghq.eu + events: | + 
- title: "${{ github.workflow }} workflow" + text: "Job ${{ github.job }} in branch ${{ github.ref_name }}" + alert_type: "${{ steps.calculator.outputs.alert_type }}" + source_type_name: "Github" + host: ${{ github.repository_owner }} + tags: + - "project:${{ github.repository }}" + - "job:${{ github.job }}" + - "run_id:${{ github.run_id }}" + - "workflow:${{ github.workflow }}" + - "branch:${{ github.ref_name }}" + - "url:https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}" + + + integration-tests-linux: + name: Integration / ubuntu-latest + needs: unit-tests + strategy: + fail-fast: false + matrix: + os: + - ubuntu-latest + runs-on: ${{ matrix.os }} + services: + tika: + image: apache/tika:2.9.0.0 + ports: + - 9998:9998 + steps: + - uses: actions/checkout@v3 + + - uses: actions/setup-python@v4 + with: + python-version: ${{ env.PYTHON_VERSION }} + + - name: Install dependencies + run: | + sudo apt update + sudo apt install ffmpeg # for local Whisper tests + + - name: Install Haystack + run: pip install .[dev,preview] langdetect transformers[torch,sentencepiece]==4.32.1 'sentence-transformers>=2.2.0' pypdf openai-whisper tika 'azure-ai-formrecognizer>=3.2.0b2' + + - name: Run + run: pytest --maxfail=5 -m "integration" test/preview + + - name: Calculate alert data + id: calculator + shell: bash + if: (success() || failure()) && github.ref_name == 'main' + run: | + if [ "${{ job.status }}" = "success" ]; then + echo "alert_type=success" >> "$GITHUB_OUTPUT"; + else + echo "alert_type=error" >> "$GITHUB_OUTPUT"; + fi + + - name: Send event to Datadog + if: (success() || failure()) && github.ref_name == 'main' + uses: masci/datadog@v1 + with: + api-key: ${{ secrets.CORE_DATADOG_API_KEY }} + api-url: https://api.datadoghq.eu + events: | + - title: "${{ github.workflow }} workflow" + text: "Job ${{ github.job }} in branch ${{ github.ref_name }}" + alert_type: "${{ steps.calculator.outputs.alert_type }}" + source_type_name: "Github" + host: ${{ github.repository_owner }} + tags: + - "project:${{ github.repository }}" + - "job:${{ github.job }}" + - "run_id:${{ github.run_id }}" + - "workflow:${{ github.workflow }}" + - "branch:${{ github.ref_name }}" + - "url:https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}" + + + integration-tests-macos: + name: Integration / macos-latest + needs: unit-tests + runs-on: macos-latest + steps: + - uses: actions/checkout@v3 + + - uses: actions/setup-python@v4 + with: + python-version: ${{ env.PYTHON_VERSION }} + + - name: Install dependencies + run: | + brew install ffmpeg # for local Whisper tests + brew install docker + colima start + + - name: Install Haystack + run: pip install .[dev,preview] langdetect transformers[torch,sentencepiece]==4.32.1 'sentence-transformers>=2.2.0' pypdf openai-whisper tika 'azure-ai-formrecognizer>=3.2.0b2' + + - name: Run Tika + run: docker run -d -p 9998:9998 apache/tika:2.9.0.0 + + - name: Run + run: pytest --maxfail=5 -m "integration" test/preview + + - name: Calculate alert data + id: calculator + shell: bash + if: (success() || failure()) && github.ref_name == 'main' + run: | + if [ "${{ job.status }}" = "success" ]; then + echo "alert_type=success" >> "$GITHUB_OUTPUT"; + else + echo "alert_type=error" >> "$GITHUB_OUTPUT"; + fi + + - name: Send event to Datadog + if: (success() || failure()) && github.ref_name == 'main' + uses: masci/datadog@v1 + with: + api-key: ${{ secrets.CORE_DATADOG_API_KEY }} + api-url: https://api.datadoghq.eu + events: | + - title: "${{ 
github.workflow }} workflow" + text: "Job ${{ github.job }} in branch ${{ github.ref_name }}" + alert_type: "${{ steps.calculator.outputs.alert_type }}" + source_type_name: "Github" + host: ${{ github.repository_owner }} + tags: + - "project:${{ github.repository }}" + - "job:${{ github.job }}" + - "run_id:${{ github.run_id }}" + - "workflow:${{ github.workflow }}" + - "branch:${{ github.ref_name }}" + - "url:https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}" + + + integration-tests-windows: + name: Integration / windows-latest + needs: unit-tests + runs-on: windows-latest + steps: + - uses: actions/checkout@v3 + + - uses: actions/setup-python@v4 + with: + python-version: ${{ env.PYTHON_VERSION }} + + - name: Install Haystack + run: pip install .[dev,preview] langdetect transformers[torch,sentencepiece]==4.32.1 'sentence-transformers>=2.2.0' pypdf openai-whisper tika 'azure-ai-formrecognizer>=3.2.0b2' + + - name: Run + run: pytest --maxfail=5 -m "integration" test/preview -k 'not tika' + + - name: Calculate alert data + id: calculator + shell: bash + if: (success() || failure()) && github.ref_name == 'main' + run: | + if [ "${{ job.status }}" = "success" ]; then + echo "alert_type=success" >> "$GITHUB_OUTPUT"; + else + echo "alert_type=error" >> "$GITHUB_OUTPUT"; + fi + + - name: Send event to Datadog + if: (success() || failure()) && github.ref_name == 'main' + uses: masci/datadog@v1 + with: + api-key: ${{ secrets.CORE_DATADOG_API_KEY }} + api-url: https://api.datadoghq.eu + events: | + - title: "${{ github.workflow }} workflow" + text: "Job ${{ github.job }} in branch ${{ github.ref_name }}" + alert_type: "${{ steps.calculator.outputs.alert_type }}" + source_type_name: "Github" + host: ${{ github.repository_owner }} + tags: + - "project:${{ github.repository }}" + - "job:${{ github.job }}" + - "run_id:${{ github.run_id }}" + - "workflow:${{ github.workflow }}" + - "branch:${{ github.ref_name }}" + - "url:https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}" diff --git a/.github/workflows/tests_preview_skipper.yml b/.github/workflows/tests_preview_skipper.yml new file mode 100644 index 0000000000..2f64eaae99 --- /dev/null +++ b/.github/workflows/tests_preview_skipper.yml @@ -0,0 +1,21 @@ +# If you change this name also do it in tests_preview.yml +name: Tests (Preview) + +on: + pull_request: + types: + - opened + - reopened + - synchronize + - ready_for_review + paths-ignore: + - "haystack/preview/**/*.py" + - "test/preview/**/*.py" + +jobs: + catch-all: + name: Catch-all check + runs-on: ubuntu-latest + steps: + - name: Skip preview tests + run: echo "Skipped!" 
diff --git a/.github/workflows/tests_skipper.yml b/.github/workflows/tests_skipper.yml index c4e4dbae09..5e0ad9c691 100644 --- a/.github/workflows/tests_skipper.yml +++ b/.github/workflows/tests_skipper.yml @@ -10,6 +10,8 @@ on: - ready_for_review paths-ignore: - "**.py" + - "!haystack/preview/**/*.py" # See tests_preview.yml + - "!test/preview/**/*.py" # See tests_preview.yml - "pyproject.toml" - "!.github/**/*.py" - "!rest_api/**/*.py" diff --git a/haystack/preview/testing/test_utils.py b/haystack/preview/testing/test_utils.py index 333685c8d3..596feb7001 100644 --- a/haystack/preview/testing/test_utils.py +++ b/haystack/preview/testing/test_utils.py @@ -1,7 +1,10 @@ import os import random +import logging import numpy as np -import torch + + +logger = logging.getLogger(__name__) def set_all_seeds(seed: int, deterministic_cudnn: bool = False) -> None: @@ -16,9 +19,16 @@ def set_all_seeds(seed: int, deterministic_cudnn: bool = False) -> None: """ random.seed(seed) np.random.seed(seed) - torch.manual_seed(seed) os.environ["PYTHONHASHSEED"] = str(seed) - torch.cuda.manual_seed_all(seed) - if deterministic_cudnn: - torch.backends.cudnn.deterministic = True - torch.backends.cudnn.benchmark = False + + try: + import torch + + torch.manual_seed(seed) + torch.cuda.manual_seed_all(seed) + if deterministic_cudnn: + torch.backends.cudnn.deterministic = True + torch.backends.cudnn.benchmark = False + + except (ImportError, ModuleNotFoundError) as exc: + logger.info("Could not set PyTorch seed because torch is not installed. Exception: %s", exc) From 69232612d0f3328e39fe4f4db18142e011e03cfe Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Sun, 1 Oct 2023 12:38:57 +0200 Subject: [PATCH 09/33] build(deps): bump actions/checkout from 3 to 4 (#5928) Bumps [actions/checkout](https://github.com/actions/checkout) from 3 to 4. - [Release notes](https://github.com/actions/checkout/releases) - [Changelog](https://github.com/actions/checkout/blob/main/CHANGELOG.md) - [Commits](https://github.com/actions/checkout/compare/v3...v4) --- updated-dependencies: - dependency-name: actions/checkout dependency-type: direct:production update-type: version-update:semver-major ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/tests_preview.yml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/tests_preview.yml b/.github/workflows/tests_preview.yml index 26babf01ed..1f3a865f44 100644 --- a/.github/workflows/tests_preview.yml +++ b/.github/workflows/tests_preview.yml @@ -29,7 +29,7 @@ jobs: black: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - uses: actions/setup-python@v4 with: @@ -109,7 +109,7 @@ jobs: - macos-latest runs-on: ${{ matrix.os }} steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - uses: actions/setup-python@v4 with: @@ -168,7 +168,7 @@ jobs: ports: - 9998:9998 steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - uses: actions/setup-python@v4 with: @@ -222,7 +222,7 @@ jobs: needs: unit-tests runs-on: macos-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - uses: actions/setup-python@v4 with: @@ -280,7 +280,7 @@ jobs: needs: unit-tests runs-on: windows-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - uses: actions/setup-python@v4 with: From b8c3b68141b93f9d1f5a790f5f5cc70bd590c1db Mon Sep 17 00:00:00 2001 From: Zubeen Date: Tue, 3 Oct 2023 01:47:55 +0530 Subject: [PATCH 10/33] Update release_notes.yml (#5949) Ignoring release notes check for PRs of type doc/ci/test --- .github/workflows/release_notes.yml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.github/workflows/release_notes.yml b/.github/workflows/release_notes.yml index 6e6e4cb9e6..fd54da5d9e 100644 --- a/.github/workflows/release_notes.yml +++ b/.github/workflows/release_notes.yml @@ -35,5 +35,10 @@ jobs: - name: Check release notes if: steps.changed-files.outputs.any_changed == 'false' && !contains( github.event.pull_request.labels.*.name, 'ignore-for-release-notes') run: | + # Check if any of the commit messages contain tags ci/docs/test + if git log --pretty=%s origin/main..HEAD | grep -E '^(ci:|docs:|test:)' > /dev/null; then + echo "Skipping release note check for commits with 'ci:', 'docs:', or 'test:' tags." + else echo "::error::The release notes file is missing, please add one or attach the label 'ignore-for-release-notes' to this PR." 
exit 1 + fi From a933a4274952ea47b7f8a6dc858b4f23e99aac3a Mon Sep 17 00:00:00 2001 From: Silvano Cerza Date: Mon, 2 Oct 2023 13:24:08 -0700 Subject: [PATCH 11/33] Fix release_notes.yml syntax --- .github/workflows/release_notes.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/release_notes.yml b/.github/workflows/release_notes.yml index fd54da5d9e..72fabb7ff8 100644 --- a/.github/workflows/release_notes.yml +++ b/.github/workflows/release_notes.yml @@ -22,9 +22,9 @@ jobs: - name: Checkout uses: actions/checkout@v4 with: - # With the default value of 1, there are corner cases where tj-actions/changed-files - # fails with a `no merge base` error - fetch-depth: 0 + # With the default value of 1, there are corner cases where tj-actions/changed-files + # fails with a `no merge base` error + fetch-depth: 0 - name: Get release note files id: changed-files @@ -35,7 +35,7 @@ jobs: - name: Check release notes if: steps.changed-files.outputs.any_changed == 'false' && !contains( github.event.pull_request.labels.*.name, 'ignore-for-release-notes') run: | - # Check if any of the commit messages contain tags ci/docs/test + # Check if any of the commit messages contain tags ci/docs/test if git log --pretty=%s origin/main..HEAD | grep -E '^(ci:|docs:|test:)' > /dev/null; then echo "Skipping release note check for commits with 'ci:', 'docs:', or 'test:' tags." else From dfd9870bcdb45f1c905c8dd4ee72bb1774f5cc82 Mon Sep 17 00:00:00 2001 From: Timo Moeller Date: Tue, 3 Oct 2023 09:37:07 +0200 Subject: [PATCH 12/33] Remove language validation (#5948) --- haystack/utils/getting_started.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/haystack/utils/getting_started.py b/haystack/utils/getting_started.py index 5a227b7213..cd54e7169d 100644 --- a/haystack/utils/getting_started.py +++ b/haystack/utils/getting_started.py @@ -65,7 +65,7 @@ def add_example_data(document_store, dir): output_dir=dir, ) files_to_index = [dir + "/" + f for f in os.listdir(dir)] - converter = TextConverter(remove_numeric_tables=True, valid_languages=["en"]) + converter = TextConverter(remove_numeric_tables=True) docs = [converter.convert(file_path=file, meta=None)[0] for file in files_to_index] else: # Here you can add a local folder with your files(.txt, .pdf, .docx). 
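
With the language filter gone, the getting-started helper converts whatever local text files it finds. A minimal standalone sketch of the equivalent call, assuming a local `data` folder of `.txt` files (the folder name is illustrative, not part of the patch):

```python
import os

from haystack.nodes import TextConverter

# "data" is an illustrative folder of .txt files to convert.
files_to_index = [os.path.join("data", f) for f in os.listdir("data")]

# The converter no longer receives valid_languages, so no language filtering happens.
converter = TextConverter(remove_numeric_tables=True)
docs = [converter.convert(file_path=path, meta=None)[0] for path in files_to_index]
```
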
From 1ccf674d734d83e88b1550370125c9b48da30708 Mon Sep 17 00:00:00 2001 From: Lavesh Akhadkar <117550850+Lavesh-Akhadkar@users.noreply.github.com> Date: Tue, 3 Oct 2023 13:32:33 +0530 Subject: [PATCH 13/33] feat: `DocumentWriter` returns number of documents written (#5939) * Make DocumentWriter return the number of documents it wrote * Fixed return type --- haystack/preview/components/writers/document_writer.py | 5 +++-- ...-writer-number-of-documents-written-2c57f3a5d6ae2131.yaml | 4 ++++ 2 files changed, 7 insertions(+), 2 deletions(-) create mode 100644 releasenotes/notes/add-document-writer-number-of-documents-written-2c57f3a5d6ae2131.yaml diff --git a/haystack/preview/components/writers/document_writer.py b/haystack/preview/components/writers/document_writer.py index aef0d823b3..5ce8c9d4c2 100644 --- a/haystack/preview/components/writers/document_writer.py +++ b/haystack/preview/components/writers/document_writer.py @@ -44,13 +44,14 @@ def from_dict(cls, data: Dict[str, Any]) -> "DocumentWriter": data["init_parameters"]["policy"] = DuplicatePolicy[data["init_parameters"]["policy"]] return default_from_dict(cls, data) + @component.output_types(documents_written=int) def run(self, documents: List[Document], policy: Optional[DuplicatePolicy] = None): """ Run DocumentWriter on the given input data. :param documents: A list of documents to write to the store. :param policy: The policy to use when encountering duplicate documents. - :return: None + :return: Number of documents written :raises ValueError: If the specified document store is not found. """ @@ -58,4 +59,4 @@ def run(self, documents: List[Document], policy: Optional[DuplicatePolicy] = Non policy = self.policy self.document_store.write_documents(documents=documents, policy=policy) - return {} + return {"documents_written": len(documents)} diff --git a/releasenotes/notes/add-document-writer-number-of-documents-written-2c57f3a5d6ae2131.yaml b/releasenotes/notes/add-document-writer-number-of-documents-written-2c57f3a5d6ae2131.yaml new file mode 100644 index 0000000000..f742601594 --- /dev/null +++ b/releasenotes/notes/add-document-writer-number-of-documents-written-2c57f3a5d6ae2131.yaml @@ -0,0 +1,4 @@ +--- +enhancements: + - | + Document writer returns the number of documents written. 
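
With this change, downstream callers can read the write count directly from the component's output instead of an empty dict. A minimal usage sketch under stated assumptions: the preview `InMemoryDocumentStore` import path is assumed for illustration and may differ by version.

```python
from haystack.preview import Document
from haystack.preview.components.writers.document_writer import DocumentWriter
from haystack.preview.document_stores import InMemoryDocumentStore  # assumed import path

store = InMemoryDocumentStore()
writer = DocumentWriter(document_store=store)

# The writer now reports how many documents it stored.
result = writer.run(documents=[Document(text="first"), Document(text="second")])
assert result["documents_written"] == 2
```
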
From ac408134f4476e23a0ec4d7b6e46dd0a1f853fc2 Mon Sep 17 00:00:00 2001 From: Massimiliano Pippi Date: Tue, 3 Oct 2023 10:42:21 +0200 Subject: [PATCH 14/33] feat: add support for async openai calls (#5946) * add support for async openai calls * add actual async call * split the async api * ask permission * Update haystack/utils/openai_utils.py Co-authored-by: Silvano Cerza <3314350+silvanocerza@users.noreply.github.com> * Fix OpenAI content moderation tests * Fix ChatGPT invocation layer tests --------- Co-authored-by: Silvano Cerza <3314350+silvanocerza@users.noreply.github.com> Co-authored-by: Silvano Cerza --- .../nodes/prompt/invocation_layer/open_ai.py | 87 ++++++++++++++----- haystack/nodes/prompt/prompt_model.py | 10 +-- haystack/utils/openai_utils.py | 71 ++++++++++++++- pyproject.toml | 1 + .../add-openai-async-1f65701142f77181.yaml | 4 + test/prompt/invocation_layer/test_chatgpt.py | 4 +- test/prompt/test_prompt_node.py | 55 ++++++++---- 7 files changed, 184 insertions(+), 48 deletions(-) create mode 100644 releasenotes/notes/add-openai-async-1f65701142f77181.yaml diff --git a/haystack/nodes/prompt/invocation_layer/open_ai.py b/haystack/nodes/prompt/invocation_layer/open_ai.py index 759fd7a925..85bae198b0 100644 --- a/haystack/nodes/prompt/invocation_layer/open_ai.py +++ b/haystack/nodes/prompt/invocation_layer/open_ai.py @@ -7,11 +7,13 @@ from haystack.errors import OpenAIError from haystack.nodes.prompt.invocation_layer.utils import has_azure_parameters from haystack.utils.openai_utils import ( - openai_request, _openai_text_completion_tokenization_details, load_openai_tokenizer, _check_openai_finish_reason, + check_openai_async_policy_violation, check_openai_policy_violation, + openai_async_request, + openai_request, ) from haystack.nodes.prompt.invocation_layer.base import PromptModelInvocationLayer from haystack.nodes.prompt.invocation_layer.handlers import TokenStreamingHandler, DefaultTokenStreamingHandler @@ -112,17 +114,13 @@ def headers(self) -> Dict[str, str]: headers["OpenAI-Organization"] = self.openai_organization return headers - def invoke(self, *args, **kwargs): - """ - Invokes a prompt on the model. Based on the model, it takes in a prompt (or either a prompt or a list of messages) - and returns a list of responses using a REST invocation. - - :return: The responses are being returned. - - Note: Only kwargs relevant to OpenAI are passed to OpenAI rest API. Others kwargs are ignored. - For more details, see OpenAI [documentation](https://platform.openai.com/docs/api-reference/completions/create). - """ + def _prepare_invoke(self, *args, **kwargs): prompt = kwargs.get("prompt") + if not prompt: + raise ValueError( + f"No prompt provided. Model {self.model_name_or_path} requires prompt." + f"Make sure to provide prompt in kwargs." + ) # either stream is True (will use default handler) or stream_handler is provided kwargs_with_defaults = self.model_input_kwargs if kwargs: @@ -150,23 +148,59 @@ def invoke(self, *args, **kwargs): "frequency_penalty": kwargs_with_defaults.get("frequency_penalty", 0), "logit_bias": kwargs_with_defaults.get("logit_bias", {}), } + + return (prompt, base_payload, kwargs_with_defaults, stream, moderation) + + def invoke(self, *args, **kwargs): + """ + Invokes a prompt on the model. Based on the model, it takes in a prompt (or either a prompt or a list of messages) + and returns a list of responses using a REST invocation. + + :return: The responses are being returned. + + Note: Only kwargs relevant to OpenAI are passed to OpenAI rest API. 
Others kwargs are ignored. + For more details, see OpenAI [documentation](https://platform.openai.com/docs/api-reference/completions/create). + """ + prompt, base_payload, kwargs_with_defaults, stream, moderation = self._prepare_invoke(*args, **kwargs) + if moderation and check_openai_policy_violation(input=prompt, headers=self.headers): logger.info("Prompt '%s' will not be sent to OpenAI due to potential policy violation.", prompt) return [] - responses = self._execute_openai_request( - prompt=prompt, base_payload=base_payload, kwargs_with_defaults=kwargs_with_defaults, stream=stream - ) + + extra_payload = { + "prompt": prompt, + "suffix": kwargs_with_defaults.get("suffix", None), + "logprobs": kwargs_with_defaults.get("logprobs", None), + "echo": kwargs_with_defaults.get("echo", False), + "best_of": kwargs_with_defaults.get("best_of", 1), + } + payload = {**base_payload, **extra_payload} + if not stream: + res = openai_request(url=self.url, headers=self.headers, payload=payload) + _check_openai_finish_reason(result=res, payload=payload) + responses = [ans["text"].strip() for ans in res["choices"]] + else: + response = openai_request( + url=self.url, headers=self.headers, payload=payload, read_response=False, stream=True + ) + handler: TokenStreamingHandler = kwargs_with_defaults.pop("stream_handler", DefaultTokenStreamingHandler()) + responses = self._process_streaming_response(response=response, stream_handler=handler) + if moderation and check_openai_policy_violation(input=responses, headers=self.headers): logger.info("Response '%s' will not be returned due to potential policy violation.", responses) return [] + return responses - def _execute_openai_request(self, prompt: str, base_payload: Dict, kwargs_with_defaults: Dict, stream: bool): - if not prompt: - raise ValueError( - f"No prompt provided. Model {self.model_name_or_path} requires prompt." - f"Make sure to provide prompt in kwargs." - ) + async def ainvoke(self, *args, **kwargs): + """ + asyncio version of the `invoke` method. 
+ """ + prompt, base_payload, kwargs_with_defaults, stream, moderation = self._prepare_invoke(*args, **kwargs) + if moderation and await check_openai_async_policy_violation(input=prompt, headers=self.headers): + logger.info("Prompt '%s' will not be sent to OpenAI due to potential policy violation.", prompt) + return [] + extra_payload = { "prompt": prompt, "suffix": kwargs_with_defaults.get("suffix", None), @@ -176,16 +210,21 @@ def _execute_openai_request(self, prompt: str, base_payload: Dict, kwargs_with_d } payload = {**base_payload, **extra_payload} if not stream: - res = openai_request(url=self.url, headers=self.headers, payload=payload) + res = await openai_async_request(url=self.url, headers=self.headers, payload=payload) _check_openai_finish_reason(result=res, payload=payload) responses = [ans["text"].strip() for ans in res["choices"]] - return responses else: - response = openai_request( + response = await openai_async_request( url=self.url, headers=self.headers, payload=payload, read_response=False, stream=True ) handler: TokenStreamingHandler = kwargs_with_defaults.pop("stream_handler", DefaultTokenStreamingHandler()) - return self._process_streaming_response(response=response, stream_handler=handler) + responses = self._process_streaming_response(response=response, stream_handler=handler) + + if moderation and await check_openai_async_policy_violation(input=responses, headers=self.headers): + logger.info("Response '%s' will not be returned due to potential policy violation.", responses) + return [] + + return responses def _process_streaming_response(self, response, stream_handler: TokenStreamingHandler): client = sseclient.SSEClient(response) diff --git a/haystack/nodes/prompt/prompt_model.py b/haystack/nodes/prompt/prompt_model.py index c4c9030897..100a2b46ea 100644 --- a/haystack/nodes/prompt/prompt_model.py +++ b/haystack/nodes/prompt/prompt_model.py @@ -115,11 +115,11 @@ async def ainvoke(self, prompt: Union[str, List[str], List[Dict[str, str]]], **k """ Drop-in replacement asyncio version of the `invoke` method, see there for documentation. 
""" - try: - return await self.model_invocation_layer.invoke(prompt=prompt, **kwargs) - except TypeError: - # The `invoke` method of the underlying invocation layer doesn't support asyncio - return self.model_invocation_layer.invoke(prompt=prompt, **kwargs) + if hasattr(self.model_invocation_layer, "ainvoke"): + return await self.model_invocation_layer.ainvoke(prompt=prompt, **kwargs) + + # The underlying invocation layer doesn't support asyncio + return self.model_invocation_layer.invoke(prompt=prompt, **kwargs) @overload def _ensure_token_limit(self, prompt: str) -> str: diff --git a/haystack/utils/openai_utils.py b/haystack/utils/openai_utils.py index fdde7306ce..439d045989 100644 --- a/haystack/utils/openai_utils.py +++ b/haystack/utils/openai_utils.py @@ -3,7 +3,9 @@ import logging import platform import json -from typing import Dict, Union, Tuple, Optional, List +from typing import Dict, Union, Tuple, Optional, List, cast + +import httpx import requests import tenacity import tiktoken @@ -143,6 +145,53 @@ def openai_request( return response +@tenacity.retry( + reraise=True, + retry=tenacity.retry_if_exception_type(OpenAIError) + and tenacity.retry_if_not_exception_type(OpenAIUnauthorizedError), + wait=tenacity.wait_exponential(multiplier=OPENAI_BACKOFF), + stop=tenacity.stop_after_attempt(OPENAI_MAX_RETRIES), +) +async def openai_async_request( + url: str, + headers: Dict, + payload: Dict, + timeout: Union[float, Tuple[float, float]] = OPENAI_TIMEOUT, + read_response: bool = True, + **kwargs, +): + """Make a request to the OpenAI API given a `url`, `headers`, `payload`, and `timeout`. + + See `openai_request`. + """ + async with httpx.AsyncClient() as client: + response = await client.request( + "POST", url, headers=headers, json=payload, timeout=cast(float, timeout), **kwargs + ) + + if read_response: + json_response = json.loads(response.text) + + if response.status_code != 200: + openai_error: OpenAIError + if response.status_code == 429: + openai_error = OpenAIRateLimitError(f"API rate limit exceeded: {response.text}") + elif response.status_code == 401: + openai_error = OpenAIUnauthorizedError(f"API key is invalid: {response.text}") + else: + openai_error = OpenAIError( + f"OpenAI returned an error.\n" + f"Status code: {response.status_code}\n" + f"Response body: {response.text}", + status_code=response.status_code, + ) + raise openai_error + if read_response: + return json_response + else: + return response + + def check_openai_policy_violation(input: Union[List[str], str], headers: Dict) -> bool: """ Calls the moderation endpoint to check if the text(s) violate the policy. @@ -163,6 +212,26 @@ def check_openai_policy_violation(input: Union[List[str], str], headers: Dict) - return flagged +async def check_openai_async_policy_violation(input: Union[List[str], str], headers: Dict) -> bool: + """ + Calls the moderation endpoint to check if the text(s) violate the policy. + See [OpenAI Moderation API](https://platform.openai.com/docs/guides/moderation) for more details. + Returns true if any of the input is flagged as any of ['sexual', 'hate', 'violence', 'self-harm', 'sexual/minors', 'hate/threatening', 'violence/graphic']. 
+ """ + response = await openai_async_request(url=OPENAI_MODERATION_URL, headers=headers, payload={"input": input}) + results = response["results"] + flagged = any(res["flagged"] for res in results) + if flagged: + for result in results: + if result["flagged"]: + logger.debug( + "OpenAI Moderation API flagged the text '%s' as a potential policy violation of the following categories: %s", + input, + result["categories"], + ) + return flagged + + def _check_openai_finish_reason(result: Dict, payload: Dict) -> None: """Check the `finish_reason` the answers returned by OpenAI completions endpoint. If the `finish_reason` is `length` or `content_filter`, log a warning to the user. diff --git a/pyproject.toml b/pyproject.toml index a721ee9147..3c11d398c1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -47,6 +47,7 @@ classifiers = [ ] dependencies = [ "requests", + "httpx", "pydantic<2", "transformers==4.32.1", "pandas", diff --git a/releasenotes/notes/add-openai-async-1f65701142f77181.yaml b/releasenotes/notes/add-openai-async-1f65701142f77181.yaml new file mode 100644 index 0000000000..7d97d8064e --- /dev/null +++ b/releasenotes/notes/add-openai-async-1f65701142f77181.yaml @@ -0,0 +1,4 @@ +--- +enhancements: + - | + Add asyncio support to the OpenAI invocation layer. diff --git a/test/prompt/invocation_layer/test_chatgpt.py b/test/prompt/invocation_layer/test_chatgpt.py index 9036353901..8c6fb1b409 100644 --- a/test/prompt/invocation_layer/test_chatgpt.py +++ b/test/prompt/invocation_layer/test_chatgpt.py @@ -7,7 +7,7 @@ @pytest.mark.unit -@patch("haystack.nodes.prompt.invocation_layer.chatgpt.openai_request") +@patch("haystack.nodes.prompt.invocation_layer.open_ai.openai_request") def test_default_api_base(mock_request): with patch("haystack.nodes.prompt.invocation_layer.open_ai.load_openai_tokenizer"): invocation_layer = ChatGPTInvocationLayer(api_key="fake_api_key") @@ -19,7 +19,7 @@ def test_default_api_base(mock_request): @pytest.mark.unit -@patch("haystack.nodes.prompt.invocation_layer.chatgpt.openai_request") +@patch("haystack.nodes.prompt.invocation_layer.open_ai.openai_request") def test_custom_api_base(mock_request): with patch("haystack.nodes.prompt.invocation_layer.open_ai.load_openai_tokenizer"): invocation_layer = ChatGPTInvocationLayer(api_key="fake_api_key", api_base="https://fake_api_base.com") diff --git a/test/prompt/test_prompt_node.py b/test/prompt/test_prompt_node.py index c0ae8480e4..6cd4bd1a21 100644 --- a/test/prompt/test_prompt_node.py +++ b/test/prompt/test_prompt_node.py @@ -1046,36 +1046,59 @@ def test_chatgpt_direct_prompting_w_messages(chatgpt_prompt_model): @pytest.mark.unit @patch("haystack.nodes.prompt.invocation_layer.open_ai.load_openai_tokenizer", lambda tokenizer_name: None) @patch("haystack.nodes.prompt.prompt_model.PromptModel._ensure_token_limit", lambda self, prompt: prompt) -def test_content_moderation_gpt_3_and_gpt_3_5(): +def test_content_moderation_gpt_3(): """ - Check all possible cases of the moderation checks passing / failing in a PromptNode call - for both ChatGPTInvocationLayer and OpenAIInvocationLayer. + Check all possible cases of the moderation checks passing / failing in a PromptNode uses + OpenAIInvocationLayer. 
""" - prompt_node_gpt_3_5 = PromptNode( - model_name_or_path="gpt-3.5-turbo", api_key="key", model_kwargs={"moderate_content": True} - ) - prompt_node_gpt_3 = PromptNode( + prompt_node = PromptNode( model_name_or_path="text-davinci-003", api_key="key", model_kwargs={"moderate_content": True} ) with patch("haystack.nodes.prompt.invocation_layer.open_ai.check_openai_policy_violation") as mock_check, patch( - "haystack.nodes.prompt.invocation_layer.chatgpt.ChatGPTInvocationLayer._execute_openai_request" - ) as mock_execute_gpt_3_5, patch( - "haystack.nodes.prompt.invocation_layer.open_ai.OpenAIInvocationLayer._execute_openai_request" - ) as mock_execute_gpt_3: + "haystack.nodes.prompt.invocation_layer.open_ai.openai_request" + ) as mock_request: + VIOLENT_TEXT = "some violent text" + mock_check.side_effect = lambda input, headers: input == VIOLENT_TEXT or input == [VIOLENT_TEXT] + # case 1: prompt fails the moderation check + # prompt should not be sent to OpenAi & function should return an empty list + mock_check.return_value = True + assert prompt_node(VIOLENT_TEXT) == [] + # case 2: prompt passes the moderation check but the generated output fails the check + # function should also return an empty list + mock_request.return_value = {"choices": [{"text": VIOLENT_TEXT, "finish_reason": ""}]} + assert prompt_node("normal prompt") == [] + # case 3: both prompt and output pass the moderation check + # function should return the output + mock_request.return_value = {"choices": [{"text": "normal output", "finish_reason": ""}]} + assert prompt_node("normal prompt") == ["normal output"] + + +@pytest.mark.unit +@patch("haystack.nodes.prompt.invocation_layer.open_ai.load_openai_tokenizer", lambda tokenizer_name: None) +@patch("haystack.nodes.prompt.prompt_model.PromptModel._ensure_token_limit", lambda self, prompt: prompt) +def test_content_moderation_gpt_35(): + """ + Check all possible cases of the moderation checks passing / failing in a PromptNode uses + ChatGPTInvocationLayer. 
+ """ + prompt_node = PromptNode(model_name_or_path="gpt-3.5-turbo", api_key="key", model_kwargs={"moderate_content": True}) + with patch("haystack.nodes.prompt.invocation_layer.open_ai.check_openai_policy_violation") as mock_check, patch( + "haystack.nodes.prompt.invocation_layer.open_ai.openai_request" + ) as mock_request: VIOLENT_TEXT = "some violent text" mock_check.side_effect = lambda input, headers: input == VIOLENT_TEXT or input == [VIOLENT_TEXT] # case 1: prompt fails the moderation check # prompt should not be sent to OpenAi & function should return an empty list mock_check.return_value = True - assert prompt_node_gpt_3_5(VIOLENT_TEXT) == prompt_node_gpt_3(VIOLENT_TEXT) == [] + assert prompt_node(VIOLENT_TEXT) == [] # case 2: prompt passes the moderation check but the generated output fails the check # function should also return an empty list - mock_execute_gpt_3_5.return_value = mock_execute_gpt_3.return_value = [VIOLENT_TEXT] - assert prompt_node_gpt_3_5("normal prompt") == prompt_node_gpt_3("normal prompt") == [] + mock_request.return_value = {"choices": [{"text": VIOLENT_TEXT, "finish_reason": ""}]} + assert prompt_node("normal prompt") == [] # case 3: both prompt and output pass the moderation check # function should return the output - mock_execute_gpt_3_5.return_value = mock_execute_gpt_3.return_value = ["normal output"] - assert prompt_node_gpt_3_5("normal prompt") == prompt_node_gpt_3("normal prompt") == ["normal output"] + mock_request.return_value = {"choices": [{"text": "normal output", "finish_reason": ""}]} + assert prompt_node("normal prompt") == ["normal output"] @patch("haystack.nodes.prompt.prompt_node.PromptModel") From cc70b4b6137257f2f2f3cea2ac2ab1378cd7e719 Mon Sep 17 00:00:00 2001 From: Stefano Fiorucci <44616784+anakin87@users.noreply.github.com> Date: Tue, 3 Oct 2023 12:48:06 +0200 Subject: [PATCH 15/33] deprecation (#5954) --- haystack/nodes/answer_generator/openai.py | 11 +++++++++++ ...ecate-openai-answergenerator-537266612ba1ffff.yaml | 5 +++++ test/nodes/test_generator.py | 10 ++++++++++ 3 files changed, 26 insertions(+) create mode 100644 releasenotes/notes/deprecate-openai-answergenerator-537266612ba1ffff.yaml diff --git a/haystack/nodes/answer_generator/openai.py b/haystack/nodes/answer_generator/openai.py index ae1ac362ed..6b8893c9e6 100644 --- a/haystack/nodes/answer_generator/openai.py +++ b/haystack/nodes/answer_generator/openai.py @@ -1,6 +1,7 @@ import logging import os from typing import List, Optional, Tuple, Union +import warnings from haystack import Document from haystack.environment import HAYSTACK_REMOTE_API_TIMEOUT_SEC @@ -21,6 +22,10 @@ class OpenAIAnswerGenerator(BaseGenerator): """ + This component is now deprecated and will be removed in future versions. + Use `PromptNode` instead of `OpenAIAnswerGenerator`, + as explained in https://haystack.deepset.ai/tutorials/22_pipeline_with_promptnode. + Uses the GPT-3 models from the OpenAI API to generate Answers based on the Documents it receives. The Documents can come from a Retriever or you can supply them manually. @@ -109,6 +114,12 @@ def __init__( :param openai_organization: The OpenAI-Organization ID, defaults to `None`. For more details, see see OpenAI [documentation](https://platform.openai.com/docs/api-reference/requesting-organization). """ + warnings.warn( + "`OpenAIAnswerGenerator component is deprecated and will be removed in future versions. 
Use `PromptNode` " + "instead of `OpenAIAnswerGenerator`.", + category=DeprecationWarning, + ) + super().__init__(progress_bar=progress_bar) if (examples is None and examples_context is not None) or (examples is not None and examples_context is None): logger.warning( diff --git a/releasenotes/notes/deprecate-openai-answergenerator-537266612ba1ffff.yaml b/releasenotes/notes/deprecate-openai-answergenerator-537266612ba1ffff.yaml new file mode 100644 index 0000000000..3e42dfacab --- /dev/null +++ b/releasenotes/notes/deprecate-openai-answergenerator-537266612ba1ffff.yaml @@ -0,0 +1,5 @@ +--- +deprecations: + - | + Deprecate `OpenAIAnswerGenerator` in favor of `PromptNode`. + `OpenAIAnswerGenerator` will be removed in Haystack 1.23. diff --git a/test/nodes/test_generator.py b/test/nodes/test_generator.py index 1fea0fc4d2..f69663aeba 100644 --- a/test/nodes/test_generator.py +++ b/test/nodes/test_generator.py @@ -6,9 +6,19 @@ from haystack.nodes.answer_generator import OpenAIAnswerGenerator from haystack.nodes import PromptTemplate +from ..conftest import fail_at_version + import logging +@pytest.mark.unit +@fail_at_version(1, 23) +@patch("haystack.nodes.answer_generator.openai.load_openai_tokenizer") +def test_openaianswergenerator_deprecation(mock_load_tokenizer): + with pytest.warns(DeprecationWarning): + OpenAIAnswerGenerator(api_key="fake_api_key") + + @pytest.mark.unit @patch("haystack.nodes.answer_generator.openai.openai_request") def test_no_openai_organization(mock_request): From b844ab8e221e71ca0ed4e369ac61b1bc0732e5dc Mon Sep 17 00:00:00 2001 From: ZanSara Date: Tue, 3 Oct 2023 17:39:04 +0200 Subject: [PATCH 16/33] chore: remove matrix from Linux CI (#5955) * remove matrix * workflow names --- .github/workflows/tests_preview.yml | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/.github/workflows/tests_preview.yml b/.github/workflows/tests_preview.yml index 1f3a865f44..9f570491b7 100644 --- a/.github/workflows/tests_preview.yml +++ b/.github/workflows/tests_preview.yml @@ -156,12 +156,7 @@ jobs: integration-tests-linux: name: Integration / ubuntu-latest needs: unit-tests - strategy: - fail-fast: false - matrix: - os: - - ubuntu-latest - runs-on: ${{ matrix.os }} + runs-on: ubuntu-latest services: tika: image: apache/tika:2.9.0.0 From 58192d35f132bc92b0849c9c4079d6ac0afa3398 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 3 Oct 2023 17:39:22 +0200 Subject: [PATCH 17/33] build(deps): bump iterative/setup-cml from 1 to 2 (#5911) Bumps [iterative/setup-cml](https://github.com/iterative/setup-cml) from 1 to 2. - [Release notes](https://github.com/iterative/setup-cml/releases) - [Commits](https://github.com/iterative/setup-cml/compare/v1...v2) --- updated-dependencies: - dependency-name: iterative/setup-cml dependency-type: direct:production update-type: version-update:semver-major ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/benchmarks.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/benchmarks.yml b/.github/workflows/benchmarks.yml index 4632a51cdd..0458102a99 100644 --- a/.github/workflows/benchmarks.yml +++ b/.github/workflows/benchmarks.yml @@ -21,7 +21,7 @@ jobs: steps: - uses: actions/checkout@v4 - - uses: iterative/setup-cml@v1 + - uses: iterative/setup-cml@v2 - name: AWS authentication uses: aws-actions/configure-aws-credentials@8c3f20df09ac63af7b3ae3d7c91f105f857d8497 From 64243540fb1f2cb6d4dfbb5b12db3aaf59a21b4a Mon Sep 17 00:00:00 2001 From: Matt Speck <20689127+mjspeck@users.noreply.github.com> Date: Tue, 3 Oct 2023 19:01:26 -0400 Subject: [PATCH 18/33] ci: added isort to pyproject.toml and pre-commit (#5933) --- .pre-commit-config.yaml | 6 ++++++ pyproject.toml | 3 +++ 2 files changed, 9 insertions(+) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 5375c94d75..0ce4be95c2 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -39,4 +39,10 @@ repos: - id: actionlint-docker args: ["-ignore", "SC2102"] +- repo: https://github.com/pycqa/isort + rev: 5.12.0 + hooks: + - id: isort + name: isort (python) + # TODO we can make pylint run at this stage too, once their execution gets normalized diff --git a/pyproject.toml b/pyproject.toml index 3c11d398c1..cc4cef492e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -246,6 +246,7 @@ formatting = [ # Version specified following Black stability policy: # https://black.readthedocs.io/en/stable/the_black_code_style/index.html#stability-policy "black[jupyter]~=23.0", + "isort~=5.12" ] all = [ @@ -450,3 +451,5 @@ max-statements = 105 # Default is 50 omit = [ "haystack/testing/*", ] +[tool.isort] +profile = "black" From de4592f6f157166fe24463d813ac63f93bf2e905 Mon Sep 17 00:00:00 2001 From: Mike Lay Date: Wed, 4 Oct 2023 03:01:51 -0700 Subject: [PATCH 19/33] Fix contributing link (#5960) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Fix broken contributor link. As this ultimately links to the actual contributing page, simply redirect to *CONTRIBUTING.md* instead of `#-contributing` The 💙 emoji in the anchor does not actually resolve. Should be `#contributing`. --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 9aab541214..61a9706896 100644 --- a/README.md +++ b/README.md @@ -106,7 +106,7 @@ Then move into the cloned folder and install the project with `pip`, including t cd haystack && pip install -e '.[dev]' ``` -If you want to contribute to the Haystack repo, check our [Contributor Guidelines](#💙-contributing) first. +If you want to contribute to the Haystack repo, check our [Contributor Guidelines](https://github.com/deepset-ai/haystack/blob/main/CONTRIBUTING.md) first. See the list of [dependencies](https://github.com/deepset-ai/haystack/blob/main/pyproject.toml) to check which ones you want to install (for example, `[all]`, `[dev]`, or other). From a4beec301304238dc49f3837b6c9f486776e90d2 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 4 Oct 2023 17:20:17 +0200 Subject: [PATCH 20/33] build(deps): bump aws-actions/configure-aws-credentials (#5968) Bumps [aws-actions/configure-aws-credentials](https://github.com/aws-actions/configure-aws-credentials) from 4.0.0 to 4.0.1. 
- [Release notes](https://github.com/aws-actions/configure-aws-credentials/releases) - [Changelog](https://github.com/aws-actions/configure-aws-credentials/blob/main/CHANGELOG.md) - [Commits](https://github.com/aws-actions/configure-aws-credentials/compare/8c3f20df09ac63af7b3ae3d7c91f105f857d8497...010d0da01d0b5a38af31e9c3470dbfdabdecca3a) --- updated-dependencies: - dependency-name: aws-actions/configure-aws-credentials dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/benchmarks.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/benchmarks.yml b/.github/workflows/benchmarks.yml index 0458102a99..4c310118d3 100644 --- a/.github/workflows/benchmarks.yml +++ b/.github/workflows/benchmarks.yml @@ -24,7 +24,7 @@ jobs: - uses: iterative/setup-cml@v2 - name: AWS authentication - uses: aws-actions/configure-aws-credentials@8c3f20df09ac63af7b3ae3d7c91f105f857d8497 + uses: aws-actions/configure-aws-credentials@010d0da01d0b5a38af31e9c3470dbfdabdecca3a with: aws-region: ${{ env.AWS_REGION }} role-to-assume: ${{ secrets.AWS_CI_ROLE_ARN }} @@ -237,7 +237,7 @@ jobs: runs-on: ubuntu-latest steps: - name: AWS authentication - uses: aws-actions/configure-aws-credentials@8c3f20df09ac63af7b3ae3d7c91f105f857d8497 + uses: aws-actions/configure-aws-credentials@010d0da01d0b5a38af31e9c3470dbfdabdecca3a with: aws-region: ${{ env.AWS_REGION }} role-to-assume: ${{ secrets.AWS_CI_ROLE_ARN }} From c2ec3f5fde2138466182ea7f5cfa7612e3fb2d6e Mon Sep 17 00:00:00 2001 From: Massimiliano Pippi Date: Wed, 4 Oct 2023 17:23:12 +0200 Subject: [PATCH 21/33] feat: add File type to preview package (#5873) * add Blob type * review feedback * fix tests and naming * Update add-blob-type-2a9476a39841f54d.yaml * removed unused import --------- Co-authored-by: Stefano Fiorucci <44616784+anakin87@users.noreply.github.com> --- haystack/preview/dataclasses/__init__.py | 3 +- haystack/preview/dataclasses/byte_stream.py | 37 +++++++++++++++++++ .../notes/add-blob-type-2a9476a39841f54d.yaml | 5 +++ test/preview/dataclasses/test_byte_stream.py | 33 +++++++++++++++++ 4 files changed, 77 insertions(+), 1 deletion(-) create mode 100644 haystack/preview/dataclasses/byte_stream.py create mode 100644 releasenotes/notes/add-blob-type-2a9476a39841f54d.yaml create mode 100644 test/preview/dataclasses/test_byte_stream.py diff --git a/haystack/preview/dataclasses/__init__.py b/haystack/preview/dataclasses/__init__.py index 5a0d8489f8..6873ac0ccb 100644 --- a/haystack/preview/dataclasses/__init__.py +++ b/haystack/preview/dataclasses/__init__.py @@ -1,4 +1,5 @@ from haystack.preview.dataclasses.document import Document from haystack.preview.dataclasses.answer import ExtractedAnswer, GeneratedAnswer, Answer +from haystack.preview.dataclasses.byte_stream import ByteStream -__all__ = ["Document", "ExtractedAnswer", "GeneratedAnswer", "Answer"] +__all__ = ["Document", "ExtractedAnswer", "GeneratedAnswer", "Answer", "ByteStream"] diff --git a/haystack/preview/dataclasses/byte_stream.py b/haystack/preview/dataclasses/byte_stream.py new file mode 100644 index 0000000000..fe006fcf85 --- /dev/null +++ b/haystack/preview/dataclasses/byte_stream.py @@ -0,0 +1,37 @@ +from dataclasses import dataclass, field +from pathlib import Path +from typing import Dict, Any + + +@dataclass(frozen=True) +class ByteStream: + """ + Base data class representing a binary object 
in the Haystack API. + """ + + data: bytes + metadata: Dict[str, Any] = field(default_factory=dict, hash=False) + + def to_file(self, destination_path: Path): + with open(destination_path, "wb") as fd: + fd.write(self.data) + + @classmethod + def from_file_path(cls, filepath: Path) -> "ByteStream": + """ + Create a ByteStream from the contents read from a file. + + :param filepath: A valid path to a file. + """ + with open(filepath, "rb") as fd: + return cls(data=fd.read()) + + @classmethod + def from_string(cls, text: str, encoding: str = "utf-8") -> "ByteStream": + """ + Create a ByteStream encoding a string. + + :param text: The string to encode + :param encoding: The encoding used to convert the string into bytes + """ + return cls(data=text.encode(encoding)) diff --git a/releasenotes/notes/add-blob-type-2a9476a39841f54d.yaml b/releasenotes/notes/add-blob-type-2a9476a39841f54d.yaml new file mode 100644 index 0000000000..163d9631ef --- /dev/null +++ b/releasenotes/notes/add-blob-type-2a9476a39841f54d.yaml @@ -0,0 +1,5 @@ +--- +preview: + - | + Add ByteStream type to send binary raw data across components + in a pipeline. diff --git a/test/preview/dataclasses/test_byte_stream.py b/test/preview/dataclasses/test_byte_stream.py new file mode 100644 index 0000000000..05d40eb79e --- /dev/null +++ b/test/preview/dataclasses/test_byte_stream.py @@ -0,0 +1,33 @@ +import io + +from haystack.preview.dataclasses import ByteStream + +import pytest + + +@pytest.mark.unit +def test_from_file_path(tmp_path, request): + test_bytes = "Hello, world!\n".encode() + test_path = tmp_path / request.node.name + with open(test_path, "wb") as fd: + assert fd.write(test_bytes) + + b = ByteStream.from_file_path(test_path) + assert b.data == test_bytes + + +@pytest.mark.unit +def test_from_string(): + test_string = "Hello, world!" 
+ b = ByteStream.from_string(test_string) + assert b.data.decode() == test_string + + +@pytest.mark.unit +def test_to_file(tmp_path, request): + test_str = "Hello, world!\n" + test_path = tmp_path / request.node.name + + ByteStream(test_str.encode()).to_file(test_path) + with open(test_path, "rb") as fd: + assert fd.read().decode() == test_str From d5d3a9eef4d4e66445e37b65bec3578a101d3075 Mon Sep 17 00:00:00 2001 From: Tobias Wochinger Date: Thu, 5 Oct 2023 08:56:28 +0200 Subject: [PATCH 22/33] chore: adapt deepset cloud sdk endpoint format for saving pipelines (#5969) * chore: adapt to new endpoints formats * docs: add release notes --- haystack/utils/deepsetcloud.py | 13 ++-- ...save-pipeline-config-ff820838846f5f38.yaml | 4 ++ test/samples/dc/pipeline_config.json | 1 - test/utils/test_deepset_cloud.py | 70 +++++++++++++++++++ 4 files changed, 80 insertions(+), 8 deletions(-) create mode 100644 releasenotes/notes/update-deepset-cloud-sdk-save-pipeline-config-ff820838846f5f38.yaml create mode 100644 test/utils/test_deepset_cloud.py diff --git a/haystack/utils/deepsetcloud.py b/haystack/utils/deepsetcloud.py index 24ab50b3c4..dfe1bc09f5 100644 --- a/haystack/utils/deepsetcloud.py +++ b/haystack/utils/deepsetcloud.py @@ -1,12 +1,11 @@ import json -from mimetypes import guess_type -from pathlib import Path -from typing import Any, Dict, Generator, List, Optional, Tuple, Union, Literal - import logging import os import time from enum import Enum +from mimetypes import guess_type +from pathlib import Path +from typing import Any, Dict, Generator, List, Literal, Optional, Tuple, Union import pandas as pd import requests @@ -560,10 +559,11 @@ def save_pipeline_config( :param workspace: Specifies the name of the workspace on deepset Cloud. :param headers: Headers to pass to the API call. """ - config["name"] = pipeline_config_name workspace_url = self._build_workspace_url(workspace=workspace) pipelines_url = f"{workspace_url}/pipelines" - response = self.client.post(url=pipelines_url, data=yaml.dump(config), headers=headers).json() + response = self.client.post( + url=pipelines_url, json={"name": pipeline_config_name, "config": yaml.dump(config)}, headers=headers + ).json() if "name" not in response or response["name"] != pipeline_config_name: logger.warning("Unexpected response from saving pipeline config: %s", response) @@ -582,7 +582,6 @@ def update_pipeline_config( :param workspace: Specifies the name of the workspace on deepset Cloud. :param headers: Headers to pass to the API call. """ - config["name"] = pipeline_config_name pipeline_url = self._build_pipeline_url(workspace=workspace, pipeline_config_name=pipeline_config_name) yaml_url = f"{pipeline_url}/yaml" response = self.client.put(url=yaml_url, data=yaml.dump(config), headers=headers).json() diff --git a/releasenotes/notes/update-deepset-cloud-sdk-save-pipeline-config-ff820838846f5f38.yaml b/releasenotes/notes/update-deepset-cloud-sdk-save-pipeline-config-ff820838846f5f38.yaml new file mode 100644 index 0000000000..a5ca3ac5e4 --- /dev/null +++ b/releasenotes/notes/update-deepset-cloud-sdk-save-pipeline-config-ff820838846f5f38.yaml @@ -0,0 +1,4 @@ +--- +enhancements: + - | + Update the deepset Cloud SDK to the new endpoint format for new saving pipeline configs. 
diff --git a/test/samples/dc/pipeline_config.json b/test/samples/dc/pipeline_config.json index 25d4b29e5a..f10fd32190 100644 --- a/test/samples/dc/pipeline_config.json +++ b/test/samples/dc/pipeline_config.json @@ -1,6 +1,5 @@ { "version": "ignore", - "name": "document_retrieval_1", "components": [ { "name": "DocumentStore", diff --git a/test/utils/test_deepset_cloud.py b/test/utils/test_deepset_cloud.py new file mode 100644 index 0000000000..e4951a1fcf --- /dev/null +++ b/test/utils/test_deepset_cloud.py @@ -0,0 +1,70 @@ +import json +from pathlib import Path +from typing import Any, Dict +from unittest.mock import Mock + +import pytest +import yaml + +from haystack.utils.deepsetcloud import DeepsetCloudClient, PipelineClient + + +@pytest.fixture +def pipeline_config(samples_path: Path) -> Dict[str, Any]: + with (samples_path / "dc" / "pipeline_config.json").open() as f: + return json.load(f) + + +@pytest.fixture() +def mocked_client() -> Mock: + api_client = Mock(spec=DeepsetCloudClient) + + api_client.build_workspace_url.return_value = "https://dc" + + return api_client + + +@pytest.fixture() +def mock_success_response() -> Mock: + mock_response = Mock() + mock_response.json.return_value = {"name": "test_pipeline"} + + return mock_response + + +class TestSaveConfig: + def test_save_config( + self, pipeline_config: Dict[str, Any], mocked_client: Mock, mock_success_response: Mock + ) -> None: + mocked_client.post.return_value = mock_success_response + + pipeline_name = "test_pipeline" + workspace_name = "test_workspace" + + pipeline_client = PipelineClient(client=mocked_client) + + pipeline_client.save_pipeline_config( + config=pipeline_config, pipeline_config_name=pipeline_name, workspace=workspace_name + ) + + expected_payload = {"name": pipeline_name, "config": yaml.dump(pipeline_config)} + mocked_client.post.assert_called_once_with(url="https://dc/pipelines", json=expected_payload, headers=None) + + +class TestUpdateConfig: + def test_update_config( + self, pipeline_config: Dict[str, Any], mocked_client: Mock, mock_success_response: Mock + ) -> None: + mocked_client.put.return_value = mock_success_response + pipeline_name = "test_pipeline" + workspace_name = "test_workspace" + + pipeline_client = PipelineClient(client=mocked_client) + + pipeline_client.update_pipeline_config( + config=pipeline_config, pipeline_config_name=pipeline_name, workspace=workspace_name + ) + + mocked_client.put.assert_called_once_with( + url=f"https://dc/pipelines/{pipeline_name}/yaml", data=yaml.dump(pipeline_config), headers=None + ) From f983e605c7baefed3cd854ba3da5e27e52cfd679 Mon Sep 17 00:00:00 2001 From: Vladimir Blagojevic Date: Thu, 5 Oct 2023 17:45:28 +0200 Subject: [PATCH 23/33] Revert "ci: added isort to pyproject.toml and pre-commit (#5933)" (#5980) This reverts commit 64243540fb1f2cb6d4dfbb5b12db3aaf59a21b4a. 
--- .pre-commit-config.yaml | 6 ------ pyproject.toml | 3 --- 2 files changed, 9 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 0ce4be95c2..5375c94d75 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -39,10 +39,4 @@ repos: - id: actionlint-docker args: ["-ignore", "SC2102"] -- repo: https://github.com/pycqa/isort - rev: 5.12.0 - hooks: - - id: isort - name: isort (python) - # TODO we can make pylint run at this stage too, once their execution gets normalized diff --git a/pyproject.toml b/pyproject.toml index cc4cef492e..3c11d398c1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -246,7 +246,6 @@ formatting = [ # Version specified following Black stability policy: # https://black.readthedocs.io/en/stable/the_black_code_style/index.html#stability-policy "black[jupyter]~=23.0", - "isort~=5.12" ] all = [ @@ -451,5 +450,3 @@ max-statements = 105 # Default is 50 omit = [ "haystack/testing/*", ] -[tool.isort] -profile = "black" From 282419d82ba0d20fa19c46a697cfba81dad33f5f Mon Sep 17 00:00:00 2001 From: Vladimir Blagojevic Date: Thu, 5 Oct 2023 17:55:07 +0200 Subject: [PATCH 24/33] feat: Unfreeze Document in Haystack 2.0 (#5974) * Unfreeze document * Remove immutability test --- haystack/preview/dataclasses/document.py | 10 ++++------ test/preview/dataclasses/test_document.py | 17 +++++------------ 2 files changed, 9 insertions(+), 18 deletions(-) diff --git a/haystack/preview/dataclasses/document.py b/haystack/preview/dataclasses/document.py index eb0704b867..76c3b78421 100644 --- a/haystack/preview/dataclasses/document.py +++ b/haystack/preview/dataclasses/document.py @@ -1,15 +1,13 @@ -from typing import List, Any, Dict, Optional, Type - -import json import hashlib +import json import logging +from dataclasses import asdict, dataclass, field, fields from pathlib import Path -from dataclasses import dataclass, field, fields, asdict +from typing import Any, Dict, List, Optional, Type import numpy import pandas - logger = logging.getLogger(__name__) @@ -50,7 +48,7 @@ def document_decoder(self, dictionary): return dictionary -@dataclass(frozen=True) +@dataclass class Document: """ Base data class containing some data to be queried. 
diff --git a/test/preview/dataclasses/test_document.py b/test/preview/dataclasses/test_document.py index f53b146fea..1f2610c849 100644 --- a/test/preview/dataclasses/test_document.py +++ b/test/preview/dataclasses/test_document.py @@ -1,21 +1,14 @@ -from pathlib import Path import dataclasses -import textwrap import json +import textwrap +from pathlib import Path -import pytest -import pandas as pd import numpy as np +import pandas as pd +import pytest from haystack.preview import Document -from haystack.preview.dataclasses.document import DocumentEncoder, DocumentDecoder - - -@pytest.mark.unit -def test_document_is_immutable(): - doc = Document(text="test text") - with pytest.raises(dataclasses.FrozenInstanceError): - doc.text = "won't work" +from haystack.preview.dataclasses.document import DocumentDecoder, DocumentEncoder @pytest.mark.unit From ccc9f010bbdc75e1c5f1d269a43a5f06f36acba1 Mon Sep 17 00:00:00 2001 From: Stefano Fiorucci <44616784+anakin87@users.noreply.github.com> Date: Thu, 5 Oct 2023 18:43:26 +0200 Subject: [PATCH 25/33] fix: fix ChatGPT invocation layer (and add async support) (#5979) * ChatGPT async * release note * fix tests --- .../nodes/prompt/invocation_layer/chatgpt.py | 156 +++++++++++++----- ...gpt-invocation-layer-bc25d0ea5f77f05c.yaml | 6 + test/prompt/invocation_layer/test_chatgpt.py | 6 +- test/prompt/test_prompt_node.py | 26 +-- 4 files changed, 139 insertions(+), 55 deletions(-) create mode 100644 releasenotes/notes/fix-chatgpt-invocation-layer-bc25d0ea5f77f05c.yaml diff --git a/haystack/nodes/prompt/invocation_layer/chatgpt.py b/haystack/nodes/prompt/invocation_layer/chatgpt.py index 371b86d6a8..f3e1a3ef6b 100644 --- a/haystack/nodes/prompt/invocation_layer/chatgpt.py +++ b/haystack/nodes/prompt/invocation_layer/chatgpt.py @@ -1,10 +1,17 @@ import logging -from typing import Optional, List, Dict, Union, Any +from typing import Any, Dict, List, Optional, Union from haystack.nodes.prompt.invocation_layer.handlers import DefaultTokenStreamingHandler, TokenStreamingHandler from haystack.nodes.prompt.invocation_layer.open_ai import OpenAIInvocationLayer from haystack.nodes.prompt.invocation_layer.utils import has_azure_parameters -from haystack.utils.openai_utils import openai_request, _check_openai_finish_reason, count_openai_tokens_messages +from haystack.utils.openai_utils import ( + _check_openai_finish_reason, + check_openai_async_policy_violation, + check_openai_policy_violation, + count_openai_tokens_messages, + openai_async_request, + openai_request, +) logger = logging.getLogger(__name__) @@ -43,45 +50,6 @@ def __init__( """ super().__init__(api_key, model_name_or_path, max_length, api_base=api_base, **kwargs) - def _execute_openai_request( - self, prompt: Union[str, List[Dict]], base_payload: Dict, kwargs_with_defaults: Dict, stream: bool - ): - """ - For more details, see [OpenAI ChatGPT API reference](https://platform.openai.com/docs/api-reference/chat). - """ - if isinstance(prompt, str): - messages = [{"role": "user", "content": prompt}] - elif isinstance(prompt, list) and len(prompt) > 0 and isinstance(prompt[0], dict): - messages = prompt - else: - raise ValueError( - f"The prompt format is different than what the model expects. " - f"The model {self.model_name_or_path} requires either a string or messages in the ChatML format. " - f"For more details, see this [GitHub discussion](https://github.com/openai/openai-python/blob/main/chatml.md)." 
- ) - extra_payload = {"messages": messages} - payload = {**base_payload, **extra_payload} - if not stream: - response = openai_request(url=self.url, headers=self.headers, payload=payload) - _check_openai_finish_reason(result=response, payload=payload) - assistant_response = [choice["message"]["content"].strip() for choice in response["choices"]] - else: - response = openai_request( - url=self.url, headers=self.headers, payload=payload, read_response=False, stream=True - ) - handler: TokenStreamingHandler = kwargs_with_defaults.pop("stream_handler", DefaultTokenStreamingHandler()) - assistant_response = self._process_streaming_response(response=response, stream_handler=handler) - - # Although ChatGPT generates text until stop words are encountered, unfortunately it includes the stop word - # We want to exclude it to be consistent with other invocation layers - if "stop" in kwargs_with_defaults and kwargs_with_defaults["stop"] is not None: - stop_words = kwargs_with_defaults["stop"] - for idx, _ in enumerate(assistant_response): - for stop_word in stop_words: - assistant_response[idx] = assistant_response[idx].replace(stop_word, "").strip() - - return assistant_response - def _extract_token(self, event_data: Dict[str, Any]): delta = event_data["choices"][0]["delta"] if "content" in delta: @@ -141,3 +109,109 @@ def supports(cls, model_name_or_path: str, **kwargs) -> bool: and not "gpt-3.5-turbo-instruct" in model_name_or_path ) return valid_model and not has_azure_parameters(**kwargs) + + async def ainvoke(self, *args, **kwargs): + """ + Invokes a prompt on the model. Based on the model, it takes in a prompt (or either a prompt or a list of messages) + and returns a list of responses using a REST invocation. + + :return: The responses are being returned. + + Note: Only kwargs relevant to OpenAI are passed to OpenAI rest API. Others kwargs are ignored. + For more details, see OpenAI [documentation](https://platform.openai.com/docs/api-reference/completions/create). + """ + prompt, base_payload, kwargs_with_defaults, stream, moderation = self._prepare_invoke(*args, **kwargs) + + if moderation and await check_openai_async_policy_violation(input=prompt, headers=self.headers): + logger.info("Prompt '%s' will not be sent to OpenAI due to potential policy violation.", prompt) + return [] + + if isinstance(prompt, str): + messages = [{"role": "user", "content": prompt}] + elif isinstance(prompt, list) and len(prompt) > 0 and isinstance(prompt[0], dict): + messages = prompt + else: + raise ValueError( + f"The prompt format is different than what the model expects. " + f"The model {self.model_name_or_path} requires either a string or messages in the ChatML format. " + f"For more details, see this [GitHub discussion](https://github.com/openai/openai-python/blob/main/chatml.md)." 
+ ) + extra_payload = {"messages": messages} + payload = {**base_payload, **extra_payload} + if not stream: + response = await openai_async_request(url=self.url, headers=self.headers, payload=payload) + _check_openai_finish_reason(result=response, payload=payload) + assistant_response = [choice["message"]["content"].strip() for choice in response["choices"]] + else: + response = await openai_async_request( + url=self.url, headers=self.headers, payload=payload, read_response=False, stream=True + ) + handler: TokenStreamingHandler = kwargs_with_defaults.pop("stream_handler", DefaultTokenStreamingHandler()) + assistant_response = self._process_streaming_response(response=response, stream_handler=handler) + + # Although ChatGPT generates text until stop words are encountered, unfortunately it includes the stop word + # We want to exclude it to be consistent with other invocation layers + if "stop" in kwargs_with_defaults and kwargs_with_defaults["stop"] is not None: + stop_words = kwargs_with_defaults["stop"] + for idx, _ in enumerate(assistant_response): + for stop_word in stop_words: + assistant_response[idx] = assistant_response[idx].replace(stop_word, "").strip() + + if moderation and await check_openai_async_policy_violation(input=assistant_response, headers=self.headers): + logger.info("Response '%s' will not be returned due to potential policy violation.", assistant_response) + return [] + + return assistant_response + + def invoke(self, *args, **kwargs): + """ + Invokes a prompt on the model. Based on the model, it takes in a prompt (or either a prompt or a list of messages) + and returns a list of responses using a REST invocation. + + :return: The responses are being returned. + + Note: Only kwargs relevant to OpenAI are passed to OpenAI rest API. Others kwargs are ignored. + For more details, see OpenAI [documentation](https://platform.openai.com/docs/api-reference/completions/create). + """ + prompt, base_payload, kwargs_with_defaults, stream, moderation = self._prepare_invoke(*args, **kwargs) + + if moderation and check_openai_policy_violation(input=prompt, headers=self.headers): + logger.info("Prompt '%s' will not be sent to OpenAI due to potential policy violation.", prompt) + return [] + + if isinstance(prompt, str): + messages = [{"role": "user", "content": prompt}] + elif isinstance(prompt, list) and len(prompt) > 0 and isinstance(prompt[0], dict): + messages = prompt + else: + raise ValueError( + f"The prompt format is different than what the model expects. " + f"The model {self.model_name_or_path} requires either a string or messages in the ChatML format. " + f"For more details, see this [GitHub discussion](https://github.com/openai/openai-python/blob/main/chatml.md)." 
+ ) + extra_payload = {"messages": messages} + payload = {**base_payload, **extra_payload} + if not stream: + response = openai_request(url=self.url, headers=self.headers, payload=payload) + _check_openai_finish_reason(result=response, payload=payload) + assistant_response = [choice["message"]["content"].strip() for choice in response["choices"]] + else: + response = openai_request( + url=self.url, headers=self.headers, payload=payload, read_response=False, stream=True + ) + handler: TokenStreamingHandler = kwargs_with_defaults.pop("stream_handler", DefaultTokenStreamingHandler()) + assistant_response = self._process_streaming_response(response=response, stream_handler=handler) + + # Although ChatGPT generates text until stop words are encountered, unfortunately it includes the stop word + # We want to exclude it to be consistent with other invocation layers + if "stop" in kwargs_with_defaults and kwargs_with_defaults["stop"] is not None: + stop_words = kwargs_with_defaults["stop"] + for idx, _ in enumerate(assistant_response): + for stop_word in stop_words: + assistant_response[idx] = assistant_response[idx].replace(stop_word, "").strip() + + if moderation and check_openai_policy_violation(input=assistant_response, headers=self.headers): + logger.info("Response '%s' will not be returned due to potential policy violation.", assistant_response) + return [] + + return assistant_response diff --git a/releasenotes/notes/fix-chatgpt-invocation-layer-bc25d0ea5f77f05c.yaml b/releasenotes/notes/fix-chatgpt-invocation-layer-bc25d0ea5f77f05c.yaml new file mode 100644 index 0000000000..2be3084dd7 --- /dev/null +++ b/releasenotes/notes/fix-chatgpt-invocation-layer-bc25d0ea5f77f05c.yaml @@ -0,0 +1,6 @@ +--- +fixes: + - | + Fixed the bug that prevented the correct usage of ChatGPT invocation layer + in 1.21.1. + Added async support for ChatGPT invocation layer. 
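For illustration only (this snippet is not part of the patch): a minimal sketch of how the asynchronous entry point added above could be exercised. It assumes `ainvoke` accepts the same `prompt` keyword that the synchronous `invoke` path reads via `_prepare_invoke`, and that a valid OpenAI API key is available; the key, model name, and prompt text below are placeholders.

```python
import asyncio

from haystack.nodes.prompt.invocation_layer import ChatGPTInvocationLayer


async def main():
    # Build the invocation layer as usual; the API key and model name are placeholders.
    layer = ChatGPTInvocationLayer(api_key="YOUR_OPENAI_API_KEY", model_name_or_path="gpt-3.5-turbo")
    # Await the new coroutine; passing `prompt=` mirrors the synchronous invoke() call and is an
    # assumption about the keyword that _prepare_invoke() expects.
    replies = await layer.ainvoke(prompt="Summarize Haystack in one sentence.")
    print(replies)  # a list of generated strings


asyncio.run(main())
```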
diff --git a/test/prompt/invocation_layer/test_chatgpt.py b/test/prompt/invocation_layer/test_chatgpt.py index 8c6fb1b409..c1b816d493 100644 --- a/test/prompt/invocation_layer/test_chatgpt.py +++ b/test/prompt/invocation_layer/test_chatgpt.py @@ -1,13 +1,13 @@ +import logging from unittest.mock import patch -import logging import pytest from haystack.nodes.prompt.invocation_layer import ChatGPTInvocationLayer @pytest.mark.unit -@patch("haystack.nodes.prompt.invocation_layer.open_ai.openai_request") +@patch("haystack.nodes.prompt.invocation_layer.chatgpt.openai_request") def test_default_api_base(mock_request): with patch("haystack.nodes.prompt.invocation_layer.open_ai.load_openai_tokenizer"): invocation_layer = ChatGPTInvocationLayer(api_key="fake_api_key") @@ -19,7 +19,7 @@ def test_default_api_base(mock_request): @pytest.mark.unit -@patch("haystack.nodes.prompt.invocation_layer.open_ai.openai_request") +@patch("haystack.nodes.prompt.invocation_layer.chatgpt.openai_request") def test_custom_api_base(mock_request): with patch("haystack.nodes.prompt.invocation_layer.open_ai.load_openai_tokenizer"): invocation_layer = ChatGPTInvocationLayer(api_key="fake_api_key", api_base="https://fake_api_base.com") diff --git a/test/prompt/test_prompt_node.py b/test/prompt/test_prompt_node.py index 6cd4bd1a21..972a04be18 100644 --- a/test/prompt/test_prompt_node.py +++ b/test/prompt/test_prompt_node.py @@ -1,20 +1,20 @@ -import os import logging -from typing import Optional, Union, List, Dict, Any, Tuple -from unittest.mock import patch, Mock, MagicMock, AsyncMock +import os +from typing import Any, Dict, List, Optional, Tuple, Union +from unittest.mock import AsyncMock, MagicMock, Mock, patch import pytest from prompthub import Prompt -from haystack import Document, Pipeline, BaseComponent, MultiLabel -from haystack.nodes.prompt import PromptTemplate, PromptNode, PromptModel -from haystack.nodes.prompt.prompt_template import LEGACY_DEFAULT_TEMPLATES +from haystack import BaseComponent, Document, MultiLabel, Pipeline +from haystack.nodes.prompt import PromptModel, PromptNode, PromptTemplate from haystack.nodes.prompt.invocation_layer import ( AzureChatGPTInvocationLayer, AzureOpenAIInvocationLayer, - OpenAIInvocationLayer, ChatGPTInvocationLayer, + OpenAIInvocationLayer, ) +from haystack.nodes.prompt.prompt_template import LEGACY_DEFAULT_TEMPLATES @pytest.fixture @@ -1082,8 +1082,8 @@ def test_content_moderation_gpt_35(): ChatGPTInvocationLayer. 
""" prompt_node = PromptNode(model_name_or_path="gpt-3.5-turbo", api_key="key", model_kwargs={"moderate_content": True}) - with patch("haystack.nodes.prompt.invocation_layer.open_ai.check_openai_policy_violation") as mock_check, patch( - "haystack.nodes.prompt.invocation_layer.open_ai.openai_request" + with patch("haystack.nodes.prompt.invocation_layer.chatgpt.check_openai_policy_violation") as mock_check, patch( + "haystack.nodes.prompt.invocation_layer.chatgpt.openai_request" ) as mock_request: VIOLENT_TEXT = "some violent text" mock_check.side_effect = lambda input, headers: input == VIOLENT_TEXT or input == [VIOLENT_TEXT] @@ -1093,11 +1093,15 @@ def test_content_moderation_gpt_35(): assert prompt_node(VIOLENT_TEXT) == [] # case 2: prompt passes the moderation check but the generated output fails the check # function should also return an empty list - mock_request.return_value = {"choices": [{"text": VIOLENT_TEXT, "finish_reason": ""}]} + mock_request.return_value = { + "choices": [{"message": {"content": VIOLENT_TEXT, "role": "assistant"}, "finish_reason": ""}] + } assert prompt_node("normal prompt") == [] # case 3: both prompt and output pass the moderation check # function should return the output - mock_request.return_value = {"choices": [{"text": "normal output", "finish_reason": ""}]} + mock_request.return_value = { + "choices": [{"message": {"content": "normal output", "role": "assistant"}, "finish_reason": ""}] + } assert prompt_node("normal prompt") == ["normal output"] From 1cdff6427e7def472575d45104083068da697b5e Mon Sep 17 00:00:00 2001 From: Vladimir Blagojevic Date: Fri, 6 Oct 2023 16:01:34 +0200 Subject: [PATCH 26/33] feat: Add SimilarityRanker to Haystack 2.0 (#5923) * Initial SimilarityRanker --- .../preview/components/rankers/__init__.py | 3 + .../preview/components/rankers/similarity.py | 108 ++++++++++++++++++ ...dd-similarity-ranker-401bf595cea7318a.yaml | 4 + .../components/rankers/test_similarity.py | 74 ++++++++++++ 4 files changed, 189 insertions(+) create mode 100644 haystack/preview/components/rankers/__init__.py create mode 100644 haystack/preview/components/rankers/similarity.py create mode 100644 releasenotes/notes/add-similarity-ranker-401bf595cea7318a.yaml create mode 100644 test/preview/components/rankers/test_similarity.py diff --git a/haystack/preview/components/rankers/__init__.py b/haystack/preview/components/rankers/__init__.py new file mode 100644 index 0000000000..27337481eb --- /dev/null +++ b/haystack/preview/components/rankers/__init__.py @@ -0,0 +1,3 @@ +from haystack.preview.components.rankers.similarity import SimilarityRanker + +__all__ = ["SimilarityRanker"] diff --git a/haystack/preview/components/rankers/similarity.py b/haystack/preview/components/rankers/similarity.py new file mode 100644 index 0000000000..7abb370954 --- /dev/null +++ b/haystack/preview/components/rankers/similarity.py @@ -0,0 +1,108 @@ +import logging +from pathlib import Path +from typing import List, Union, Dict, Any + +from haystack.preview import ComponentError, Document, component, default_from_dict, default_to_dict +from haystack.preview.lazy_imports import LazyImport + +logger = logging.getLogger(__name__) + + +with LazyImport(message="Run 'pip install transformers[torch,sentencepiece]==4.32.1'") as torch_and_transformers_import: + import torch + from transformers import AutoModelForSequenceClassification, AutoTokenizer + + +@component +class SimilarityRanker: + """ + Ranks documents based on query similarity. 
+ + Usage example: + ``` + from haystack.preview import Document + from haystack.preview.components.rankers import SimilarityRanker + + sampler = SimilarityRanker() + docs = [Document(text="Paris"), Document(text="Berlin")] + query = "City in Germany" + output = sampler.run(query=query, documents=docs) + docs = output["documents"] + assert len(docs) == 2 + assert docs[0].text == "Berlin" + ``` + """ + + def __init__( + self, model_name_or_path: Union[str, Path] = "cross-encoder/ms-marco-MiniLM-L-6-v2", device: str = "cpu" + ): + """ + Creates an instance of SimilarityRanker. + + :param model_name_or_path: Path to a pre-trained sentence-transformers model. + :param device: torch device (for example, cuda:0, cpu, mps) to limit model inference to a specific device. + """ + torch_and_transformers_import.check() + + self.model_name_or_path = model_name_or_path + self.device = device + self.model = None + self.tokenizer = None + + def warm_up(self): + """ + Warm up the model and tokenizer used in scoring the documents. + """ + if self.model_name_or_path and not self.model: + self.model = AutoModelForSequenceClassification.from_pretrained(self.model_name_or_path) + self.model = self.model.to(self.device) + self.model.eval() + self.tokenizer = AutoTokenizer.from_pretrained(self.model_name_or_path) + + def to_dict(self) -> Dict[str, Any]: + """ + Serialize this component to a dictionary. + """ + return default_to_dict(self, device=self.device, model_name_or_path=self.model_name_or_path) + + @classmethod + def from_dict(cls, data: Dict[str, Any]) -> "SimilarityRanker": + """ + Deserialize this component from a dictionary. + """ + return default_from_dict(cls, data) + + @component.output_types(documents=List[Document]) + def run(self, query: str, documents: List[Document]): + """ + Returns a list of documents ranked by their similarity to the given query + + :param query: Query string. + :param documents: List of Documents. + :return: List of Documents sorted by (desc.) similarity with the query. + """ + if not documents: + return {"documents": []} + + # If a model path is provided but the model isn't loaded + if self.model_name_or_path and not self.model: + raise ComponentError( + f"The component {self.__class__.__name__} not warmed up. Run 'warm_up()' before calling 'run()'." + ) + + query_doc_pairs = [[query, doc.text] for doc in documents] + features = self.tokenizer( + query_doc_pairs, padding=True, truncation=True, return_tensors="pt" + ).to( # type: ignore + self.device + ) + with torch.inference_mode(): + similarity_scores = self.model(**features).logits.squeeze() # type: ignore + + _, sorted_indices = torch.sort(similarity_scores, descending=True) + ranked_docs = [] + for sorted_index_tensor in sorted_indices: + i = sorted_index_tensor.item() + documents[i].score = similarity_scores[i].item() + ranked_docs.append(documents[i]) + return {"documents": ranked_docs} diff --git a/releasenotes/notes/add-similarity-ranker-401bf595cea7318a.yaml b/releasenotes/notes/add-similarity-ranker-401bf595cea7318a.yaml new file mode 100644 index 0000000000..a0d3217a67 --- /dev/null +++ b/releasenotes/notes/add-similarity-ranker-401bf595cea7318a.yaml @@ -0,0 +1,4 @@ +--- +preview: + - | + Adds SimilarityRanker, a component that ranks a list of Documents based on their similarity to the query. 
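As a complement to the docstring example above, here is a hedged sketch (not part of the patch) of how `SimilarityRanker` could re-order the output of a retriever in a Haystack 2.0 preview pipeline. The in-memory store and BM25 retriever come from other preview modules used elsewhere in this series; calling `warm_up()` explicitly before `run()` is a conservative assumption, since it is not shown here whether the pipeline does that automatically.

```python
from haystack.preview import Document, Pipeline
from haystack.preview.components.rankers import SimilarityRanker
from haystack.preview.components.retrievers import MemoryBM25Retriever
from haystack.preview.document_stores import MemoryDocumentStore

# Index a couple of toy documents into the in-memory store.
document_store = MemoryDocumentStore()
document_store.write_documents(
    [Document(text="Berlin is the capital of Germany."), Document(text="Paris is the capital of France.")]
)

ranker = SimilarityRanker()
ranker.warm_up()  # load the cross-encoder model and tokenizer before ranking

pipeline = Pipeline()
pipeline.add_component(instance=MemoryBM25Retriever(document_store=document_store), name="retriever")
pipeline.add_component(instance=ranker, name="ranker")
pipeline.connect("retriever", "ranker")  # the retriever's documents feed the ranker

query = "What is the capital of Germany?"
result = pipeline.run({"retriever": {"query": query}, "ranker": {"query": query}})
print(result["ranker"]["documents"])  # documents sorted by cross-encoder similarity to the query
```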
diff --git a/test/preview/components/rankers/test_similarity.py b/test/preview/components/rankers/test_similarity.py new file mode 100644 index 0000000000..5ddb3b18dc --- /dev/null +++ b/test/preview/components/rankers/test_similarity.py @@ -0,0 +1,74 @@ +import pytest + +from haystack.preview import Document, ComponentError +from haystack.preview.components.rankers.similarity import SimilarityRanker + + +class TestSimilarityRanker: + @pytest.mark.unit + def test_to_dict(self): + component = SimilarityRanker(model_name_or_path="cross-encoder/ms-marco-MiniLM-L-6-v2") + data = component.to_dict() + assert data == { + "type": "SimilarityRanker", + "init_parameters": {"device": "cpu", "model_name_or_path": "cross-encoder/ms-marco-MiniLM-L-6-v2"}, + } + + @pytest.mark.unit + def test_to_dict_with_custom_init_parameters(self): + component = SimilarityRanker() + data = component.to_dict() + assert data == { + "type": "SimilarityRanker", + "init_parameters": {"device": "cpu", "model_name_or_path": "cross-encoder/ms-marco-MiniLM-L-6-v2"}, + } + + @pytest.mark.integration + def test_from_dict(self): + data = { + "type": "SimilarityRanker", + "init_parameters": {"device": "cpu", "model_name_or_path": "cross-encoder/ms-marco-MiniLM-L-6-v2"}, + } + component = SimilarityRanker.from_dict(data) + assert component.model_name_or_path == "cross-encoder/ms-marco-MiniLM-L-6-v2" + + @pytest.mark.integration + @pytest.mark.parametrize( + "query,docs_before_texts,expected_first_text", + [ + ("City in Bosnia and Herzegovina", ["Berlin", "Belgrade", "Sarajevo"], "Sarajevo"), + ("Machine learning", ["Python", "Bakery in Paris", "Tesla Giga Berlin"], "Python"), + ("Cubist movement", ["Nirvana", "Pablo Picasso", "Coffee"], "Pablo Picasso"), + ], + ) + def test_run(self, query, docs_before_texts, expected_first_text): + """ + Test if the component ranks documents correctly. 
+ """ + ranker = SimilarityRanker(model_name_or_path="cross-encoder/ms-marco-MiniLM-L-6-v2") + ranker.warm_up() + docs_before = [Document(text=text) for text in docs_before_texts] + output = ranker.run(query=query, documents=docs_before) + docs_after = output["documents"] + + assert len(docs_after) == 3 + assert docs_after[0].text == expected_first_text + + sorted_scores = sorted([doc.score for doc in docs_after], reverse=True) + assert [doc.score for doc in docs_after] == sorted_scores + + # Returns an empty list if no documents are provided + @pytest.mark.integration + def test_returns_empty_list_if_no_documents_are_provided(self): + sampler = SimilarityRanker() + sampler.warm_up() + output = sampler.run(query="City in Germany", documents=[]) + assert output["documents"] == [] + + # Raises ComponentError if model is not warmed up + @pytest.mark.integration + def test_raises_component_error_if_model_not_warmed_up(self): + sampler = SimilarityRanker() + + with pytest.raises(ComponentError): + sampler.run(query="query", documents=[Document(text="document")]) From 4e921c650e0ea82a16dd033f63ce48a2f9c63b63 Mon Sep 17 00:00:00 2001 From: Stefano Fiorucci <44616784+anakin87@users.noreply.github.com> Date: Fri, 6 Oct 2023 18:26:08 +0200 Subject: [PATCH 27/33] rm useless pin (#5995) --- pyproject.toml | 3 --- 1 file changed, 3 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 3c11d398c1..bd52018a29 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -198,9 +198,6 @@ onnx-gpu = [ metrics = [ # for metrics "scipy>=1.3.2", "rapidfuzz>=2.0.15,<2.8.0", # FIXME https://github.com/deepset-ai/haystack/pull/3199 - # Pin setuptools_scm to prevent errors in Windows tests while installing seqeval - # https://github.com/deepset-ai/haystack/issues/5889 - "setuptools_scm<8.0; platform_system == 'Windows'", "seqeval", "mlflow", ] From 0e9a51cfb128ea2e7519dec83444daacc3ec355e Mon Sep 17 00:00:00 2001 From: Greg <146503804+komputer-says-no@users.noreply.github.com> Date: Sun, 8 Oct 2023 19:48:30 +0200 Subject: [PATCH 28/33] fix: annotation-tool is missing `DOMAIN_WHITELIST` envvar (#5997) The docker-compose.yml file for annotation tool ('annotation_tool/domain-compose.yml') is giving an error with latest image. The annotation-tool demands the `DOMAIN_WHITELIST` envvar to be defined that is not a part of the given template referenced by the documentation. 
Signed-off-by: Greg Nagy --- annotation_tool/docker-compose.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/annotation_tool/docker-compose.yml b/annotation_tool/docker-compose.yml index 044004ed19..c5522cf081 100644 --- a/annotation_tool/docker-compose.yml +++ b/annotation_tool/docker-compose.yml @@ -13,6 +13,7 @@ services: # DEFAULT_ADMIN_PASSWORD: "DEMO_PASSWORD" # COOKIE_KEYS: "somesafecookiekeys" # JWT_SECRET: "somesafesecret" + # DOMAIN_WHITELIST: "*" ports: - "7001:7001" links: From 0cb9abb1c2f39c83180aacc8ef526f820b72c70d Mon Sep 17 00:00:00 2001 From: Silvano Cerza <3314350+silvanocerza@users.noreply.github.com> Date: Mon, 9 Oct 2023 11:24:19 +0200 Subject: [PATCH 29/33] Rename proposal to respect specifications (#6002) --- ...0-evaluation-haystack-2.md => 5794-evaluation-haystack-2.md} | 2 +- ...0-evaluation-haystack-2.py => 5794-evaluation-haystack-2.py} | 2 +- pyproject.toml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) rename proposals/text/{0000-evaluation-haystack-2.md => 5794-evaluation-haystack-2.md} (99%) rename proposals/text/{0000-evaluation-haystack-2.py => 5794-evaluation-haystack-2.py} (99%) diff --git a/proposals/text/0000-evaluation-haystack-2.md b/proposals/text/5794-evaluation-haystack-2.md similarity index 99% rename from proposals/text/0000-evaluation-haystack-2.md rename to proposals/text/5794-evaluation-haystack-2.md index 7f7507e78e..9093a19e0a 100644 --- a/proposals/text/0000-evaluation-haystack-2.md +++ b/proposals/text/5794-evaluation-haystack-2.md @@ -1,7 +1,7 @@ - Title: Evaluation in Haystack 2.0 - Decision driver: (Silvano Cerza, Julian Risch) - Start Date: 2023-08-23 -- Proposal PR: #5794 +- Proposal PR: [#5794](https://github.com/deepset-ai/haystack/pull/5794/) - Github Issue or Discussion: https://github.com/deepset-ai/haystack/issues/5628 # Summary diff --git a/proposals/text/0000-evaluation-haystack-2.py b/proposals/text/5794-evaluation-haystack-2.py similarity index 99% rename from proposals/text/0000-evaluation-haystack-2.py rename to proposals/text/5794-evaluation-haystack-2.py index 46dd8af4b3..faaaec377d 100644 --- a/proposals/text/0000-evaluation-haystack-2.py +++ b/proposals/text/5794-evaluation-haystack-2.py @@ -135,5 +135,5 @@ ] eval_result = eval(pipe, inputs=inputs, expected_output=expected_output) -metrics = result.calculate_metrics(Metric.SAS) +metrics = result.calculate_metrics(Metric.SAS) # noqa metrics.save("path/to/file.csv") diff --git a/pyproject.toml b/pyproject.toml index bd52018a29..8ecde0c81b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -286,7 +286,7 @@ line-length = 120 skip_magic_trailing_comma = true # For compatibility with pydoc>=4.6, check if still needed. 
[tool.codespell] -ignore-words-list = "ans,astroid,nd,ned,nin,ue" +ignore-words-list = "ans,astroid,nd,ned,nin,ue,rouge" quiet-level = 3 skip = "test/nodes/*,test/others/*,test/samples/*" From 40b83d8a479091eb602d3bc39d5bf602060a4d1f Mon Sep 17 00:00:00 2001 From: Vladimir Blagojevic Date: Mon, 9 Oct 2023 13:44:01 +0200 Subject: [PATCH 30/33] feat: Add TopPSampler Haystack 2.0 component (#5924) --- .../preview/components/samplers/__init__.py | 3 + haystack/preview/components/samplers/top_p.py | 140 ++++++++++++++++++ .../add-top-p-sampler-ad6e0f5d623a6bb5.yaml | 4 + test/preview/components/samplers/__init__.py | 0 .../preview/components/samplers/test_top_p.py | 107 +++++++++++++ 5 files changed, 254 insertions(+) create mode 100644 haystack/preview/components/samplers/__init__.py create mode 100644 haystack/preview/components/samplers/top_p.py create mode 100644 releasenotes/notes/add-top-p-sampler-ad6e0f5d623a6bb5.yaml create mode 100644 test/preview/components/samplers/__init__.py create mode 100644 test/preview/components/samplers/test_top_p.py diff --git a/haystack/preview/components/samplers/__init__.py b/haystack/preview/components/samplers/__init__.py new file mode 100644 index 0000000000..cab0e878e8 --- /dev/null +++ b/haystack/preview/components/samplers/__init__.py @@ -0,0 +1,3 @@ +from haystack.preview.components.samplers.top_p import TopPSampler + +__all__ = ["TopPSampler"] diff --git a/haystack/preview/components/samplers/top_p.py b/haystack/preview/components/samplers/top_p.py new file mode 100644 index 0000000000..f0c9d6d992 --- /dev/null +++ b/haystack/preview/components/samplers/top_p.py @@ -0,0 +1,140 @@ +import logging +from typing import List, Optional, Dict, Any + +from haystack.preview import ComponentError, Document, component, default_from_dict, default_to_dict +from haystack.preview.lazy_imports import LazyImport + +logger = logging.getLogger(__name__) + + +with LazyImport(message="Run 'pip install torch>=1.13'") as torch_import: + import torch + + +@component +class TopPSampler: + """ + Filters documents using top-p (nucleus) sampling based on their similarity scores' cumulative probability. + + Usage example: + + ```python + from haystack.preview import Document + from haystack.preview.components.samplers import TopPSampler + + sampler = TopPSampler(top_p=0.95) + docs = [ + Document(text="Berlin", metadata={"similarity_score": -10.6}), + Document(text="Belgrade", metadata={"similarity_score": -8.9}), + Document(text="Sarajevo", metadata={"similarity_score": -4.6}), + ] + output = sampler.run(documents=docs) + docs = output["documents"] + assert len(docs) == 1 + assert docs[0].text == "Sarajevo" + ``` + """ + + def __init__(self, top_p: float = 1.0, score_field: Optional[str] = None): + """ + Creates an instance of TopPSampler. + + :param top_p: Cumulative probability threshold (usually between 0.9 and 0.99). + :param score_field: Field name in a document's metadata containing the scores. Defaults to the Document score + if not provided. + """ + torch_import.check() + + self.top_p = top_p + self.score_field = score_field + + def to_dict(self) -> Dict[str, Any]: + """ + Serialize this component to a dictionary. + """ + return default_to_dict(self, top_p=self.top_p, score_field=self.score_field) + + @classmethod + def from_dict(cls, data: Dict[str, Any]) -> "TopPSampler": + """ + Deserialize this component from a dictionary. 
+ """ + return default_from_dict(cls, data) + + @component.output_types(documents=List[Document]) + def run(self, documents: List[Document], top_p: Optional[float] = None): + """ + Filter documents based on their similarity scores using top-p sampling. + + :param documents: List of Documents to filter. + :param top_p: Cumulative probability threshold. Defaults to the value set during initialization or 1.0 + if not set. + :return: List of filtered Documents. + """ + if not documents: + return {"documents": []} + + top_p = top_p or self.top_p or 1.0 # default to 1.0 if both are None + + if not 0 <= top_p <= 1: + raise ComponentError(f"top_p must be between 0 and 1. Got {top_p}.") + + similarity_scores = torch.tensor(self._collect_scores(documents), dtype=torch.float32) + + # Apply softmax normalization to the similarity scores + probs = torch.nn.functional.softmax(similarity_scores, dim=-1) + + # Sort the probabilities and calculate their cumulative sum + sorted_probs, sorted_indices = torch.sort(probs, descending=True) + cumulative_probs = torch.cumsum(sorted_probs, dim=-1) + + # Check if the cumulative probabilities are close to top_p with a 1e-6 tolerance + close_to_top_p = torch.isclose(cumulative_probs, torch.tensor(top_p, device=cumulative_probs.device), atol=1e-6) + + # Combine the close_to_top_p with original condition using logical OR + condition = (cumulative_probs <= top_p) | close_to_top_p + + # Find the indices with cumulative probabilities that exceed top_p + top_p_indices = torch.where(torch.BoolTensor(condition))[0] + + # Map the selected indices back to their original indices + original_indices = sorted_indices[top_p_indices] + selected_docs = [documents[i.item()] for i in original_indices] + + # If low p resulted in no documents being selected, then + # return at least one document + if not selected_docs: + logger.warning( + "Top-p sampling with p=%s resulted in no documents being selected. " + "Returning the document with the highest similarity score.", + top_p, + ) + highest_prob_indices = torch.argsort(probs, descending=True) + selected_docs = [documents[int(highest_prob_indices[0].item())]] + + return {"documents": selected_docs} + + def _collect_scores(self, documents: List[Document]) -> List[float]: + """ + Collect the scores from the documents' metadata. + :param documents: List of Documents. + :return: List of scores. + """ + if self.score_field: + missing_scores_docs = [d for d in documents if self.score_field not in d.metadata] + if missing_scores_docs: + missing_scores_docs_ids = [d.id for d in missing_scores_docs if d.id] + raise ComponentError( + f"Score field '{self.score_field}' not found in metadata of documents " + f"with IDs: {missing_scores_docs_ids}." + f"Make sure that all documents have a score field '{self.score_field}' in their metadata." + ) + return [d.metadata[self.score_field] for d in documents] + else: + missing_scores_docs = [d for d in documents if d.score is None] + if missing_scores_docs: + missing_scores_docs_ids = [d.id for d in missing_scores_docs if d.id] + raise ComponentError( + f"Ensure all documents have a valid score value. These docs {missing_scores_docs_ids} don't." 
+                )
+            return [d.score for d in documents]  # type: ignore ## because Document score is Optional
diff --git a/releasenotes/notes/add-top-p-sampler-ad6e0f5d623a6bb5.yaml b/releasenotes/notes/add-top-p-sampler-ad6e0f5d623a6bb5.yaml
new file mode 100644
index 0000000000..729c8b6245
--- /dev/null
+++ b/releasenotes/notes/add-top-p-sampler-ad6e0f5d623a6bb5.yaml
@@ -0,0 +1,4 @@
+---
+preview:
+  - |
+    Adds TopPSampler, a component that selects documents based on the cumulative probability of the Document scores using top-p (nucleus) sampling.
diff --git a/test/preview/components/samplers/__init__.py b/test/preview/components/samplers/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/test/preview/components/samplers/test_top_p.py b/test/preview/components/samplers/test_top_p.py
new file mode 100644
index 0000000000..111dffa25c
--- /dev/null
+++ b/test/preview/components/samplers/test_top_p.py
@@ -0,0 +1,107 @@
+import random
+
+import pytest
+
+from haystack.preview import Document, ComponentError
+from haystack.preview.components.samplers.top_p import TopPSampler
+
+
+class TestTopPSampler:
+    @pytest.mark.unit
+    def test_to_dict(self):
+        component = TopPSampler()
+        data = component.to_dict()
+        assert data == {"type": "TopPSampler", "init_parameters": {"top_p": 1.0, "score_field": None}}
+
+    @pytest.mark.unit
+    def test_to_dict_with_custom_init_parameters(self):
+        component = TopPSampler(top_p=0.92)
+        data = component.to_dict()
+        assert data == {"type": "TopPSampler", "init_parameters": {"top_p": 0.92, "score_field": None}}
+
+    @pytest.mark.unit
+    def test_from_dict(self):
+        data = {"type": "TopPSampler", "init_parameters": {"top_p": 0.9, "score_field": None}}
+        component = TopPSampler.from_dict(data)
+        assert component.top_p == 0.9
+
+    @pytest.mark.unit
+    def test_run_scores_from_metadata(self):
+        """
+        Test if the component runs correctly with scores already in the metadata.
+        """
+        sampler = TopPSampler(top_p=0.95, score_field="similarity_score")
+        docs = [
+            Document(text="Berlin", metadata={"similarity_score": -10.6}),
+            Document(text="Belgrade", metadata={"similarity_score": -8.9}),
+            Document(text="Sarajevo", metadata={"similarity_score": -4.6}),
+        ]
+        output = sampler.run(documents=docs)
+        docs = output["documents"]
+        assert len(docs) == 1
+        assert docs[0].text == "Sarajevo"
+
+    @pytest.mark.unit
+    def test_run_scores(self):
+        """
+        Test if the component runs correctly with scores in the Document score field.
+        """
+        sampler = TopPSampler(top_p=0.99)
+        docs = [
+            Document(text="Berlin", score=-10.6),
+            Document(text="Belgrade", score=-8.9),
+            Document(text="Sarajevo", score=-4.6),
+        ]
+
+        random.shuffle(docs)
+        sorted_scores = sorted([doc.score for doc in docs], reverse=True)
+
+        # top_p = 0.99 will get the top 1 document
+        output = sampler.run(documents=docs)
+        docs_filtered = output["documents"]
+        assert len(docs_filtered) == 1
+        assert docs_filtered[0].text == "Sarajevo"
+
+        assert [doc.score for doc in docs_filtered] == sorted_scores[:1]
+
+    @pytest.mark.unit
+    def test_run_scores_top_p_1(self):
+        """
+        Test if the component runs correctly with top_p=1.
+ """ + sampler = TopPSampler(top_p=1.0) + docs = [ + Document(text="Berlin", score=-10.6), + Document(text="Belgrade", score=-8.9), + Document(text="Sarajevo", score=-4.6), + ] + + random.shuffle(docs) + output = sampler.run(documents=docs) + docs_filtered = output["documents"] + assert len(docs_filtered) == len(docs) + assert docs_filtered[0].text == "Sarajevo" + + assert [doc.score for doc in docs_filtered] == sorted([doc.score for doc in docs], reverse=True) + + # Returns an empty list if no documents are provided + @pytest.mark.unit + def test_returns_empty_list_if_no_documents_are_provided(self): + sampler = TopPSampler() + output = sampler.run(documents=[]) + assert output["documents"] == [] + + @pytest.mark.unit + def test_run_scores_no_metadata_present(self): + """ + Test if the component runs correctly with scores missing from the metadata yet being specified in the + score_field. + """ + sampler = TopPSampler(top_p=0.95, score_field="similarity_score") + docs = [ + Document(text="Berlin", score=-10.6), + Document(text="Belgrade", score=-8.9), + Document(text="Sarajevo", score=-4.6), + ] + with pytest.raises(ComponentError, match="Score field 'similarity_score' not found"): + sampler.run(documents=docs) From 71f2430fd185fdfb6bdc4da42cf286dc4e936f29 Mon Sep 17 00:00:00 2001 From: ZanSara Date: Mon, 9 Oct 2023 12:54:17 +0100 Subject: [PATCH 31/33] test: enhance e2e tests to also draw and serialize/deserialize the test pipelines (#5910) * add draw and serialization/deserialization to e2e pipeline examples * add comment about json serialization * fix a small gptgenerator bug and move indexing in tests * to json * review feedback --- .../pipelines/test_extractive_qa_pipeline.py | 32 +++++-- e2e/preview/pipelines/test_rag_pipelines.py | 93 ++++++++++++------- .../components/generators/openai/gpt.py | 2 +- 3 files changed, 82 insertions(+), 45 deletions(-) diff --git a/e2e/preview/pipelines/test_extractive_qa_pipeline.py b/e2e/preview/pipelines/test_extractive_qa_pipeline.py index c25b73a057..b7ee11a0e5 100644 --- a/e2e/preview/pipelines/test_extractive_qa_pipeline.py +++ b/e2e/preview/pipelines/test_extractive_qa_pipeline.py @@ -1,25 +1,39 @@ +import json + from haystack.preview import Pipeline, Document from haystack.preview.document_stores import MemoryDocumentStore from haystack.preview.components.retrievers import MemoryBM25Retriever from haystack.preview.components.readers import ExtractiveReader -def test_extractive_qa_pipeline(): - document_store = MemoryDocumentStore() +def test_extractive_qa_pipeline(tmp_path): + # Create the pipeline + qa_pipeline = Pipeline() + qa_pipeline.add_component(instance=MemoryBM25Retriever(document_store=MemoryDocumentStore()), name="retriever") + qa_pipeline.add_component(instance=ExtractiveReader(model_name_or_path="deepset/tinyroberta-squad2"), name="reader") + qa_pipeline.connect("retriever", "reader") + + # Draw the pipeline + qa_pipeline.draw(tmp_path / "test_extractive_qa_pipeline.png") + + # Serialize the pipeline to JSON + with open(tmp_path / "test_bm25_rag_pipeline.json", "w") as f: + print(json.dumps(qa_pipeline.to_dict(), indent=4)) + json.dump(qa_pipeline.to_dict(), f) + # Load the pipeline back + with open(tmp_path / "test_bm25_rag_pipeline.json", "r") as f: + qa_pipeline = Pipeline.from_dict(json.load(f)) + + # Populate the document store documents = [ Document(text="My name is Jean and I live in Paris."), Document(text="My name is Mark and I live in Berlin."), Document(text="My name is Giorgio and I live in Rome."), ] + 
qa_pipeline.get_component("retriever").document_store.write_documents(documents) - document_store.write_documents(documents) - - qa_pipeline = Pipeline() - qa_pipeline.add_component(instance=MemoryBM25Retriever(document_store=document_store), name="retriever") - qa_pipeline.add_component(instance=ExtractiveReader(model_name_or_path="deepset/tinyroberta-squad2"), name="reader") - qa_pipeline.connect("retriever", "reader") - + # Query and assert questions = ["Who lives in Paris?", "Who lives in Berlin?", "Who lives in Rome?"] answers_spywords = ["Jean", "Mark", "Giorgio"] diff --git a/e2e/preview/pipelines/test_rag_pipelines.py b/e2e/preview/pipelines/test_rag_pipelines.py index 335cc9a2c2..719fbd8ed0 100644 --- a/e2e/preview/pipelines/test_rag_pipelines.py +++ b/e2e/preview/pipelines/test_rag_pipelines.py @@ -1,4 +1,5 @@ import os +import json import pytest from haystack.preview import Pipeline, Document @@ -15,15 +16,8 @@ not os.environ.get("OPENAI_API_KEY", None), reason="Export an env var called OPENAI_API_KEY containing the OpenAI API key to run this test.", ) -def test_bm25_rag_pipeline(): - document_store = MemoryDocumentStore() - - documents = [ - Document(text="My name is Jean and I live in Paris."), - Document(text="My name is Mark and I live in Berlin."), - Document(text="My name is Giorgio and I live in Rome."), - ] - +def test_bm25_rag_pipeline(tmp_path): + # Create the RAG pipeline prompt_template = """ Given these documents, answer the question.\nDocuments: {% for doc in documents %} @@ -33,11 +27,8 @@ def test_bm25_rag_pipeline(): \nQuestion: {{question}} \nAnswer: """ - - document_store.write_documents(documents) - rag_pipeline = Pipeline() - rag_pipeline.add_component(instance=MemoryBM25Retriever(document_store=document_store), name="retriever") + rag_pipeline.add_component(instance=MemoryBM25Retriever(document_store=MemoryDocumentStore()), name="retriever") rag_pipeline.add_component(instance=PromptBuilder(template=prompt_template), name="prompt_builder") rag_pipeline.add_component(instance=GPTGenerator(api_key=os.environ.get("OPENAI_API_KEY")), name="llm") rag_pipeline.add_component(instance=AnswerBuilder(), name="answer_builder") @@ -47,6 +38,26 @@ def test_bm25_rag_pipeline(): rag_pipeline.connect("llm.metadata", "answer_builder.metadata") rag_pipeline.connect("retriever", "answer_builder.documents") + # Draw the pipeline + rag_pipeline.draw(tmp_path / "test_bm25_rag_pipeline.png") + + # Serialize the pipeline to JSON + with open(tmp_path / "test_bm25_rag_pipeline.json", "w") as f: + json.dump(rag_pipeline.to_dict(), f) + + # Load the pipeline back + with open(tmp_path / "test_bm25_rag_pipeline.json", "r") as f: + rag_pipeline = Pipeline.from_dict(json.load(f)) + + # Populate the document store + documents = [ + Document(text="My name is Jean and I live in Paris."), + Document(text="My name is Mark and I live in Berlin."), + Document(text="My name is Giorgio and I live in Rome."), + ] + rag_pipeline.get_component("retriever").document_store.write_documents(documents) + + # Query and assert questions = ["Who lives in Paris?", "Who lives in Berlin?", "Who lives in Rome?"] answers_spywords = ["Jean", "Mark", "Giorgio"] @@ -71,15 +82,8 @@ def test_bm25_rag_pipeline(): not os.environ.get("OPENAI_API_KEY", None), reason="Export an env var called OPENAI_API_KEY containing the OpenAI API key to run this test.", ) -def test_embedding_retrieval_rag_pipeline(): - document_store = MemoryDocumentStore() - - documents = [ - Document(text="My name is Jean and I live in Paris."), - 
Document(text="My name is Mark and I live in Berlin."), - Document(text="My name is Giorgio and I live in Rome."), - ] - +def test_embedding_retrieval_rag_pipeline(tmp_path): + # Create the RAG pipeline prompt_template = """ Given these documents, answer the question.\nDocuments: {% for doc in documents %} @@ -89,22 +93,14 @@ def test_embedding_retrieval_rag_pipeline(): \nQuestion: {{question}} \nAnswer: """ - - indexing_pipeline = Pipeline() - indexing_pipeline.add_component( - instance=SentenceTransformersDocumentEmbedder(model_name_or_path="sentence-transformers/all-mpnet-base-v2"), - name="document_embedder", - ) - indexing_pipeline.add_component(instance=DocumentWriter(document_store=document_store), name="document_writer") - indexing_pipeline.connect("document_embedder", "document_writer") - indexing_pipeline.run({"document_embedder": {"documents": documents}}) - rag_pipeline = Pipeline() rag_pipeline.add_component( - instance=SentenceTransformersTextEmbedder(model_name_or_path="sentence-transformers/all-mpnet-base-v2"), + instance=SentenceTransformersTextEmbedder(model_name_or_path="sentence-transformers/all-MiniLM-L6-v2"), name="text_embedder", ) - rag_pipeline.add_component(instance=MemoryEmbeddingRetriever(document_store=document_store), name="retriever") + rag_pipeline.add_component( + instance=MemoryEmbeddingRetriever(document_store=MemoryDocumentStore()), name="retriever" + ) rag_pipeline.add_component(instance=PromptBuilder(template=prompt_template), name="prompt_builder") rag_pipeline.add_component(instance=GPTGenerator(api_key=os.environ.get("OPENAI_API_KEY")), name="llm") rag_pipeline.add_component(instance=AnswerBuilder(), name="answer_builder") @@ -115,6 +111,34 @@ def test_embedding_retrieval_rag_pipeline(): rag_pipeline.connect("llm.metadata", "answer_builder.metadata") rag_pipeline.connect("retriever", "answer_builder.documents") + # Draw the pipeline + rag_pipeline.draw(tmp_path / "test_embedding_rag_pipeline.png") + + # Serialize the pipeline to JSON + with open(tmp_path / "test_bm25_rag_pipeline.json", "w") as f: + json.dump(rag_pipeline.to_dict(), f) + + # Load the pipeline back + with open(tmp_path / "test_bm25_rag_pipeline.json", "r") as f: + rag_pipeline = Pipeline.from_dict(json.load(f)) + + # Populate the document store + documents = [ + Document(text="My name is Jean and I live in Paris."), + Document(text="My name is Mark and I live in Berlin."), + Document(text="My name is Giorgio and I live in Rome."), + ] + document_store = rag_pipeline.get_component("retriever").document_store + indexing_pipeline = Pipeline() + indexing_pipeline.add_component( + instance=SentenceTransformersDocumentEmbedder(model_name_or_path="sentence-transformers/all-MiniLM-L6-v2"), + name="document_embedder", + ) + indexing_pipeline.add_component(instance=DocumentWriter(document_store=document_store), name="document_writer") + indexing_pipeline.connect("document_embedder", "document_writer") + indexing_pipeline.run({"document_embedder": {"documents": documents}}) + + # Query and assert questions = ["Who lives in Paris?", "Who lives in Berlin?", "Who lives in Rome?"] answers_spywords = ["Jean", "Mark", "Giorgio"] @@ -129,7 +153,6 @@ def test_embedding_retrieval_rag_pipeline(): assert len(result["answer_builder"]["answers"]) == 1 generated_answer = result["answer_builder"]["answers"][0] - print(generated_answer) assert spyword in generated_answer.data assert generated_answer.query == question assert hasattr(generated_answer, "documents") diff --git 
a/haystack/preview/components/generators/openai/gpt.py b/haystack/preview/components/generators/openai/gpt.py index 2aeb29f4e3..e5618f738e 100644 --- a/haystack/preview/components/generators/openai/gpt.py +++ b/haystack/preview/components/generators/openai/gpt.py @@ -121,7 +121,7 @@ def from_dict(cls, data: Dict[str, Any]) -> "GPTGenerator": """ init_params = data.get("init_parameters", {}) streaming_callback = None - if "streaming_callback" in init_params: + if "streaming_callback" in init_params and init_params["streaming_callback"]: parts = init_params["streaming_callback"].split(".") module_name = ".".join(parts[:-1]) function_name = parts[-1] From aea63336376fcf73ea6e555967371d6d3c8d3ab3 Mon Sep 17 00:00:00 2001 From: Timo Moeller Date: Mon, 9 Oct 2023 15:03:24 +0200 Subject: [PATCH 32/33] Add end2end tests as getting started to HS2.0 readme (#5981) * Add end2end tests as getting started to HS2.0 readme * capital heading --------- Co-authored-by: Daria Fokina --- haystack/preview/README.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/haystack/preview/README.md b/haystack/preview/README.md index 99fe7eb3e3..1dcd4c351b 100644 --- a/haystack/preview/README.md +++ b/haystack/preview/README.md @@ -23,5 +23,12 @@ pip install farm-haystack ``` The `farm-haystack` package includes all new features of Haystack 2.0. Note that updates to this package occur less frequently compared to `haystack-ai`. So, you might not get the all latest Haystack 2.0 features immediately when using `farm-haystack`. +## 🚗 Getting Started + +In our **end 2 end tests** you can find example code for the following pipelines: +- [RAG pipeline](https://github.com/deepset-ai/haystack/blob/main/e2e/preview/pipelines/test_rag_pipelines.py) +- [Extractive QA pipeline](https://github.com/deepset-ai/haystack/blob/main/e2e/preview/pipelines/test_extractive_qa_pipeline.py) +- more to come, check out the [folder](https://github.com/deepset-ai/haystack/blob/main/e2e/preview/) + ## 💙 Stay Updated To learn how and when components will be migrated to the new major version, have a look at the [Migrate Components to Pipeline v2](https://github.com/deepset-ai/haystack/issues/5265) roadmap item, where we keep track of issues and PRs about Haystack 2.0. When you have questions, you can always contact us using the [Shaping Haystack 2.0](https://github.com/deepset-ai/haystack/discussions/5568) discussion or [Haystack Discord server](https://discord.com/channels/993534733298450452/1141683185458094211). From d0ff3fa7c271780791319dd881a05ad49ac2c500 Mon Sep 17 00:00:00 2001 From: Daria Fokina Date: Mon, 9 Oct 2023 15:24:47 +0200 Subject: [PATCH 33/33] docs: readme-get-started (#5993) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * readme-get-started * lg update * lg update Co-authored-by: Bilge Yücel --------- Co-authored-by: Bilge Yücel --- README.md | 52 +++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 51 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 61a9706896..b39c3cc70a 100644 --- a/README.md +++ b/README.md @@ -9,7 +9,57 @@ | Meta | ![Discord](https://img.shields.io/discord/993534733298450452?logo=discord) ![Twitter Follow](https://img.shields.io/twitter/follow/deepset_ai) | -[Haystack](https://haystack.deepset.ai/) is an end-to-end NLP framework that enables you to build applications powered by LLMs, Transformer models, vector search and more. 
Whether you want to perform question answering, answer generation, semantic document search, or build tools that are capable of complex decision making and query resolution, you can use the state-of-the-art NLP models with Haystack to build end-to-end NLP applications solving your use case. +[Haystack](https://haystack.deepset.ai/) is an end-to-end NLP framework that enables you to build applications powered by LLMs, Transformer models, vector search and more. Whether you want to perform question answering, answer generation, semantic document search, or build tools that are capable of complex decision-making and query resolution, you can use the state-of-the-art NLP models with Haystack to build end-to-end NLP applications solving your use case. + +## Quickstart + +Haystack is built around the concept of pipelines. A pipeline is a powerful structure that performs an NLP task. It's made up of components connected together. For example, you can connect a `Retriever` and a `PromptNode` to build a Generative Question Answering pipeline that uses your own data. + +Try out how Haystack answers questions about Game of Thrones using the Retrieval Augmented Generation (RAG) approach 👇 + +First, run the minimal Haystack installation: + +```sh +pip install farm-haystack +``` + +Then, index your data to the DocumentStore, build a RAG pipeline, and ask a question on your data: + +```python +from haystack.document_stores import InMemoryDocumentStore +from haystack.utils import build_pipeline, add_example_data, print_answers + +# We are model agnostic :) Here, you can choose from: "anthropic", "cohere", "huggingface", and "openai". +provider = "openai" +API_KEY = "sk-..." # ADD YOUR KEY HERE + +# We support many different databases. Here, we load a simple and lightweight in-memory database. +document_store = InMemoryDocumentStore(use_bm25=True) + +# Download and add Game of Thrones TXT articles to Haystack DocumentStore. +# You can also provide a folder with your local documents. +add_example_data(document_store, "data/GoT_getting_started") + +# Build a pipeline with a Retriever to get relevant documents to the query and a PromptNode interacting with LLMs using a custom prompt. +pipeline = build_pipeline(provider, API_KEY, document_store) + +# Ask a question on the data you just added. +result = pipeline.run(query="Who is the father of Arya Stark?") + +# For details, like which documents were used to generate the answer, look into the object +print_answers(result, details="medium") +``` + +The output of the pipeline will reference the documents used to generate the answer: + +``` +'Query: Who is the father of Arya Stark?' +'Answers:' +[{'answer': 'The father of Arya Stark is Lord Eddard Stark of ' + 'Winterfell. [Document 1, Document 4, Document 5]'}] +``` + +Congratulations, you have just built your first Haystack app! ## Core Concepts