diff --git a/.github/workflows/benchmarks.yml b/.github/workflows/benchmarks.yml index 4632a51cdd..4c310118d3 100644 --- a/.github/workflows/benchmarks.yml +++ b/.github/workflows/benchmarks.yml @@ -21,10 +21,10 @@ jobs: steps: - uses: actions/checkout@v4 - - uses: iterative/setup-cml@v1 + - uses: iterative/setup-cml@v2 - name: AWS authentication - uses: aws-actions/configure-aws-credentials@8c3f20df09ac63af7b3ae3d7c91f105f857d8497 + uses: aws-actions/configure-aws-credentials@010d0da01d0b5a38af31e9c3470dbfdabdecca3a with: aws-region: ${{ env.AWS_REGION }} role-to-assume: ${{ secrets.AWS_CI_ROLE_ARN }} @@ -237,7 +237,7 @@ jobs: runs-on: ubuntu-latest steps: - name: AWS authentication - uses: aws-actions/configure-aws-credentials@8c3f20df09ac63af7b3ae3d7c91f105f857d8497 + uses: aws-actions/configure-aws-credentials@010d0da01d0b5a38af31e9c3470dbfdabdecca3a with: aws-region: ${{ env.AWS_REGION }} role-to-assume: ${{ secrets.AWS_CI_ROLE_ARN }} diff --git a/.github/workflows/ci_metrics.yml b/.github/workflows/ci_metrics.yml index 688f3b9e1b..31a329e334 100644 --- a/.github/workflows/ci_metrics.yml +++ b/.github/workflows/ci_metrics.yml @@ -4,8 +4,10 @@ on: workflow_run: workflows: - "end-to-end" + - "end-to-end (Preview)" - "Linting" - "Tests" + - "Tests (Preview)" - "REST API Tests" types: - completed diff --git a/.github/workflows/e2e.yml b/.github/workflows/e2e.yml index 188fd02c2a..b4fcfc1444 100644 --- a/.github/workflows/e2e.yml +++ b/.github/workflows/e2e.yml @@ -13,6 +13,7 @@ on: - ready_for_review paths: - "e2e/**/*.py" + - "!e2e/preview/**/*.py" # See e2e_preview.yml - ".github/workflows/e2e.yml" env: diff --git a/.github/workflows/e2e_preview.yml b/.github/workflows/e2e_preview.yml new file mode 100644 index 0000000000..aa54f8963e --- /dev/null +++ b/.github/workflows/e2e_preview.yml @@ -0,0 +1,42 @@ +# If you change this name also do it in ci_metrics.yml +name: end-to-end (Preview) + +on: + workflow_dispatch: # Activate this workflow manually + schedule: + - cron: "0 0 * * *" + pull_request: + types: + - opened + - reopened + - synchronize + - ready_for_review + paths: + - "e2e/preview/**/*.py" + - ".github/workflows/e2e_preview.yml" + +env: + PYTHON_VERSION: "3.8" + OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} + +jobs: + run: + timeout-minutes: 60 + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - uses: actions/setup-python@v4 + with: + python-version: ${{ env.PYTHON_VERSION }} + + - name: Install dependencies + run: | + sudo apt-get update + sudo apt install ffmpeg # for local Whisper tests + + - name: Install Haystack + run: pip install .[dev,preview] langdetect transformers[torch,sentencepiece]==4.32.1 'sentence-transformers>=2.2.0' pypdf openai-whisper tika 'azure-ai-formrecognizer>=3.2.0b2' + + - name: Run tests + run: pytest e2e/preview diff --git a/.github/workflows/examples_tests.yml b/.github/workflows/examples_tests.yml index 241306571d..ac1acb0951 100644 --- a/.github/workflows/examples_tests.yml +++ b/.github/workflows/examples_tests.yml @@ -15,8 +15,6 @@ on: - ready_for_review env: - SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL }} - SLACK_WEBHOOK_TYPE: INCOMING_WEBHOOK OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} COHERE_API_KEY: ${{ secrets.COHERE_API_KEY }} ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} diff --git a/.github/workflows/license_compliance.yml b/.github/workflows/license_compliance.yml index 0c5e504660..b3e0aba19a 100644 --- a/.github/workflows/license_compliance.yml +++ b/.github/workflows/license_compliance.yml @@ -171,7 
+171,7 @@ jobs: name: All extras env: REQUIREMENTS_FILE: requirements_all.txt - runs-on: ubuntu-latest + runs-on: ubuntu-latest-4-cores steps: - name: Checkout the code uses: actions/checkout@v4 diff --git a/.github/workflows/pypi_release.yml b/.github/workflows/pypi_release.yml index ee52541410..3118198e65 100644 --- a/.github/workflows/pypi_release.yml +++ b/.github/workflows/pypi_release.yml @@ -33,5 +33,5 @@ jobs: uses: act10ns/slack@v2 with: status: ${{ job.status }} - channel: "#haystack" + channel: "#haystack-notifications" config: .github/config/pypi-release-slack-notification.yml diff --git a/.github/workflows/release_notes.yml b/.github/workflows/release_notes.yml index 6e6e4cb9e6..72fabb7ff8 100644 --- a/.github/workflows/release_notes.yml +++ b/.github/workflows/release_notes.yml @@ -22,9 +22,9 @@ jobs: - name: Checkout uses: actions/checkout@v4 with: - # With the default value of 1, there are corner cases where tj-actions/changed-files - # fails with a `no merge base` error - fetch-depth: 0 + # With the default value of 1, there are corner cases where tj-actions/changed-files + # fails with a `no merge base` error + fetch-depth: 0 - name: Get release note files id: changed-files @@ -35,5 +35,10 @@ jobs: - name: Check release notes if: steps.changed-files.outputs.any_changed == 'false' && !contains( github.event.pull_request.labels.*.name, 'ignore-for-release-notes') run: | + # Check if any of the commit messages contain tags ci/docs/test + if git log --pretty=%s origin/main..HEAD | grep -E '^(ci:|docs:|test:)' > /dev/null; then + echo "Skipping release note check for commits with 'ci:', 'docs:', or 'test:' tags." + else echo "::error::The release notes file is missing, please add one or attach the label 'ignore-for-release-notes' to this PR." 
exit 1 + fi diff --git a/.github/workflows/rest_api_tests.yml b/.github/workflows/rest_api_tests.yml index 33e8a5ff7a..e3fc5a5971 100644 --- a/.github/workflows/rest_api_tests.yml +++ b/.github/workflows/rest_api_tests.yml @@ -18,8 +18,6 @@ on: - "rest_api/pyproject.toml" env: - SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL }} - SLACK_WEBHOOK_TYPE: INCOMING_WEBHOOK PYTHON_VERSION: "3.8" jobs: diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 98845da2d4..4ecdd1fe85 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -17,13 +17,13 @@ on: paths: - "**.py" - "pyproject.toml" + - "!haystack/preview/**/*.py" # See tests_preview.yml + - "!test/preview/**/*.py" # See tests_preview.yml - "!.github/**/*.py" - "!rest_api/**/*.py" - "!docs/**/*.py" env: - SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL }} - SLACK_WEBHOOK_TYPE: INCOMING_WEBHOOK OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} COHERE_API_KEY: ${{ secrets.COHERE_API_KEY }} PYTHON_VERSION: "3.8" diff --git a/.github/workflows/tests_preview.yml b/.github/workflows/tests_preview.yml new file mode 100644 index 0000000000..9f570491b7 --- /dev/null +++ b/.github/workflows/tests_preview.yml @@ -0,0 +1,319 @@ +# If you change this name also do it in tests_preview_skipper.yml +name: Tests (Preview) + +on: + workflow_dispatch: # Activate this workflow manually + push: + branches: + - main + # release branches have the form v1.9.x + - "v[0-9].*[0-9].x" + pull_request: + types: + - opened + - reopened + - synchronize + - ready_for_review + paths: + - "haystack/preview/**/*.py" + - "test/preview/**/*.py" + +env: + OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} + COHERE_API_KEY: ${{ secrets.COHERE_API_KEY }} + CORE_AZURE_CS_ENDPOINT: ${{ secrets.CORE_AZURE_CS_ENDPOINT }} + CORE_AZURE_CS_API_KEY: ${{ secrets.CORE_AZURE_CS_API_KEY }} + PYTHON_VERSION: "3.8" + +jobs: + black: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - uses: actions/setup-python@v4 + with: + python-version: ${{ env.PYTHON_VERSION }} + + - name: Install Black + run: | + pip install --upgrade pip + pip install .[formatting] + + - name: Check status + run: | + if ! black . --check; then + git status + echo "###################################################################################################" + echo "# " + echo "# CHECK FAILED! Black found issues with your code formatting." + echo "# " + echo "# Either:" + echo "# 1. Run Black locally before committing:" + echo "# " + echo "# pip install .[formatting]" + echo "# black ." + echo "# " + echo "# 2. Install the pre-commit hook:" + echo "# " + echo "# pre-commit install" + echo "# " + echo "# 3. See https://github.com/deepset-ai/haystack/blob/main/CONTRIBUTING.md for help." 
+ echo "# " + echo "# If you have further problems, please open an issue: https://github.com/deepset-ai/haystack/issues" + echo "# " + echo "##################################################################################################" + exit 1 + fi + + - name: Calculate alert data + id: calculator + shell: bash + if: (success() || failure()) && github.ref_name == 'main' + run: | + if [ "${{ job.status }}" = "success" ]; then + echo "alert_type=success" >> "$GITHUB_OUTPUT"; + else + echo "alert_type=error" >> "$GITHUB_OUTPUT"; + fi + + - name: Send event to Datadog + if: (success() || failure()) && github.ref_name == 'main' + uses: masci/datadog@v1 + with: + api-key: ${{ secrets.CORE_DATADOG_API_KEY }} + api-url: https://api.datadoghq.eu + events: | + - title: "${{ github.workflow }} workflow" + text: "Job ${{ github.job }} in branch ${{ github.ref_name }}" + alert_type: "${{ steps.calculator.outputs.alert_type }}" + source_type_name: "Github" + host: ${{ github.repository_owner }} + tags: + - "project:${{ github.repository }}" + - "job:${{ github.job }}" + - "run_id:${{ github.run_id }}" + - "workflow:${{ github.workflow }}" + - "branch:${{ github.ref_name }}" + - "url:https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}" + + unit-tests: + name: Unit / ${{ matrix.os }} + needs: black + strategy: + fail-fast: false + matrix: + os: + - ubuntu-latest + - windows-latest + - macos-latest + runs-on: ${{ matrix.os }} + steps: + - uses: actions/checkout@v4 + + - uses: actions/setup-python@v4 + with: + python-version: ${{ env.PYTHON_VERSION }} + + - name: Install Haystack + run: pip install .[dev,preview] langdetect transformers[torch,sentencepiece]==4.32.1 'sentence-transformers>=2.2.0' pypdf openai-whisper tika 'azure-ai-formrecognizer>=3.2.0b2' + + - name: Run + run: pytest -m "unit" test/preview + + - name: Calculate alert data + id: calculator + shell: bash + if: (success() || failure()) && github.ref_name == 'main' + run: | + if [ "${{ job.status }}" = "success" ]; then + echo "alert_type=success" >> "$GITHUB_OUTPUT"; + else + echo "alert_type=error" >> "$GITHUB_OUTPUT"; + fi + + - name: Send event to Datadog + if: (success() || failure()) && github.ref_name == 'main' + uses: masci/datadog@v1 + with: + api-key: ${{ secrets.CORE_DATADOG_API_KEY }} + api-url: https://api.datadoghq.eu + events: | + - title: "${{ github.workflow }} workflow" + text: "Job ${{ github.job }} in branch ${{ github.ref_name }}" + alert_type: "${{ steps.calculator.outputs.alert_type }}" + source_type_name: "Github" + host: ${{ github.repository_owner }} + tags: + - "project:${{ github.repository }}" + - "job:${{ github.job }}" + - "run_id:${{ github.run_id }}" + - "workflow:${{ github.workflow }}" + - "branch:${{ github.ref_name }}" + - "url:https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}" + + + integration-tests-linux: + name: Integration / ubuntu-latest + needs: unit-tests + runs-on: ubuntu-latest + services: + tika: + image: apache/tika:2.9.0.0 + ports: + - 9998:9998 + steps: + - uses: actions/checkout@v4 + + - uses: actions/setup-python@v4 + with: + python-version: ${{ env.PYTHON_VERSION }} + + - name: Install dependencies + run: | + sudo apt update + sudo apt install ffmpeg # for local Whisper tests + + - name: Install Haystack + run: pip install .[dev,preview] langdetect transformers[torch,sentencepiece]==4.32.1 'sentence-transformers>=2.2.0' pypdf openai-whisper tika 'azure-ai-formrecognizer>=3.2.0b2' + + - name: Run + run: pytest --maxfail=5 -m 
"integration" test/preview + + - name: Calculate alert data + id: calculator + shell: bash + if: (success() || failure()) && github.ref_name == 'main' + run: | + if [ "${{ job.status }}" = "success" ]; then + echo "alert_type=success" >> "$GITHUB_OUTPUT"; + else + echo "alert_type=error" >> "$GITHUB_OUTPUT"; + fi + + - name: Send event to Datadog + if: (success() || failure()) && github.ref_name == 'main' + uses: masci/datadog@v1 + with: + api-key: ${{ secrets.CORE_DATADOG_API_KEY }} + api-url: https://api.datadoghq.eu + events: | + - title: "${{ github.workflow }} workflow" + text: "Job ${{ github.job }} in branch ${{ github.ref_name }}" + alert_type: "${{ steps.calculator.outputs.alert_type }}" + source_type_name: "Github" + host: ${{ github.repository_owner }} + tags: + - "project:${{ github.repository }}" + - "job:${{ github.job }}" + - "run_id:${{ github.run_id }}" + - "workflow:${{ github.workflow }}" + - "branch:${{ github.ref_name }}" + - "url:https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}" + + + integration-tests-macos: + name: Integration / macos-latest + needs: unit-tests + runs-on: macos-latest + steps: + - uses: actions/checkout@v4 + + - uses: actions/setup-python@v4 + with: + python-version: ${{ env.PYTHON_VERSION }} + + - name: Install dependencies + run: | + brew install ffmpeg # for local Whisper tests + brew install docker + colima start + + - name: Install Haystack + run: pip install .[dev,preview] langdetect transformers[torch,sentencepiece]==4.32.1 'sentence-transformers>=2.2.0' pypdf openai-whisper tika 'azure-ai-formrecognizer>=3.2.0b2' + + - name: Run Tika + run: docker run -d -p 9998:9998 apache/tika:2.9.0.0 + + - name: Run + run: pytest --maxfail=5 -m "integration" test/preview + + - name: Calculate alert data + id: calculator + shell: bash + if: (success() || failure()) && github.ref_name == 'main' + run: | + if [ "${{ job.status }}" = "success" ]; then + echo "alert_type=success" >> "$GITHUB_OUTPUT"; + else + echo "alert_type=error" >> "$GITHUB_OUTPUT"; + fi + + - name: Send event to Datadog + if: (success() || failure()) && github.ref_name == 'main' + uses: masci/datadog@v1 + with: + api-key: ${{ secrets.CORE_DATADOG_API_KEY }} + api-url: https://api.datadoghq.eu + events: | + - title: "${{ github.workflow }} workflow" + text: "Job ${{ github.job }} in branch ${{ github.ref_name }}" + alert_type: "${{ steps.calculator.outputs.alert_type }}" + source_type_name: "Github" + host: ${{ github.repository_owner }} + tags: + - "project:${{ github.repository }}" + - "job:${{ github.job }}" + - "run_id:${{ github.run_id }}" + - "workflow:${{ github.workflow }}" + - "branch:${{ github.ref_name }}" + - "url:https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}" + + + integration-tests-windows: + name: Integration / windows-latest + needs: unit-tests + runs-on: windows-latest + steps: + - uses: actions/checkout@v4 + + - uses: actions/setup-python@v4 + with: + python-version: ${{ env.PYTHON_VERSION }} + + - name: Install Haystack + run: pip install .[dev,preview] langdetect transformers[torch,sentencepiece]==4.32.1 'sentence-transformers>=2.2.0' pypdf openai-whisper tika 'azure-ai-formrecognizer>=3.2.0b2' + + - name: Run + run: pytest --maxfail=5 -m "integration" test/preview -k 'not tika' + + - name: Calculate alert data + id: calculator + shell: bash + if: (success() || failure()) && github.ref_name == 'main' + run: | + if [ "${{ job.status }}" = "success" ]; then + echo "alert_type=success" >> "$GITHUB_OUTPUT"; + else 
+ echo "alert_type=error" >> "$GITHUB_OUTPUT"; + fi + + - name: Send event to Datadog + if: (success() || failure()) && github.ref_name == 'main' + uses: masci/datadog@v1 + with: + api-key: ${{ secrets.CORE_DATADOG_API_KEY }} + api-url: https://api.datadoghq.eu + events: | + - title: "${{ github.workflow }} workflow" + text: "Job ${{ github.job }} in branch ${{ github.ref_name }}" + alert_type: "${{ steps.calculator.outputs.alert_type }}" + source_type_name: "Github" + host: ${{ github.repository_owner }} + tags: + - "project:${{ github.repository }}" + - "job:${{ github.job }}" + - "run_id:${{ github.run_id }}" + - "workflow:${{ github.workflow }}" + - "branch:${{ github.ref_name }}" + - "url:https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}" diff --git a/.github/workflows/tests_preview_skipper.yml b/.github/workflows/tests_preview_skipper.yml new file mode 100644 index 0000000000..2f64eaae99 --- /dev/null +++ b/.github/workflows/tests_preview_skipper.yml @@ -0,0 +1,21 @@ +# If you change this name also do it in tests_preview.yml +name: Tests (Preview) + +on: + pull_request: + types: + - opened + - reopened + - synchronize + - ready_for_review + paths-ignore: + - "haystack/preview/**/*.py" + - "test/preview/**/*.py" + +jobs: + catch-all: + name: Catch-all check + runs-on: ubuntu-latest + steps: + - name: Skip preview tests + run: echo "Skipped!" diff --git a/.github/workflows/tests_skipper.yml b/.github/workflows/tests_skipper.yml index c4e4dbae09..5e0ad9c691 100644 --- a/.github/workflows/tests_skipper.yml +++ b/.github/workflows/tests_skipper.yml @@ -10,6 +10,8 @@ on: - ready_for_review paths-ignore: - "**.py" + - "!haystack/preview/**/*.py" # See tests_preview.yml + - "!test/preview/**/*.py" # See tests_preview.yml - "pyproject.toml" - "!.github/**/*.py" - "!rest_api/**/*.py" diff --git a/README.md b/README.md index 9aab541214..b39c3cc70a 100644 --- a/README.md +++ b/README.md @@ -9,7 +9,57 @@ | Meta | ![Discord](https://img.shields.io/discord/993534733298450452?logo=discord) ![Twitter Follow](https://img.shields.io/twitter/follow/deepset_ai) | -[Haystack](https://haystack.deepset.ai/) is an end-to-end NLP framework that enables you to build applications powered by LLMs, Transformer models, vector search and more. Whether you want to perform question answering, answer generation, semantic document search, or build tools that are capable of complex decision making and query resolution, you can use the state-of-the-art NLP models with Haystack to build end-to-end NLP applications solving your use case. +[Haystack](https://haystack.deepset.ai/) is an end-to-end NLP framework that enables you to build applications powered by LLMs, Transformer models, vector search and more. Whether you want to perform question answering, answer generation, semantic document search, or build tools that are capable of complex decision-making and query resolution, you can use the state-of-the-art NLP models with Haystack to build end-to-end NLP applications solving your use case. + +## Quickstart + +Haystack is built around the concept of pipelines. A pipeline is a powerful structure that performs an NLP task. It's made up of components connected together. For example, you can connect a `Retriever` and a `PromptNode` to build a Generative Question Answering pipeline that uses your own data. 
+ +Try out how Haystack answers questions about Game of Thrones using the Retrieval Augmented Generation (RAG) approach 👇 + +First, run the minimal Haystack installation: + +```sh +pip install farm-haystack +``` + +Then, index your data to the DocumentStore, build a RAG pipeline, and ask a question on your data: + +```python +from haystack.document_stores import InMemoryDocumentStore +from haystack.utils import build_pipeline, add_example_data, print_answers + +# We are model agnostic :) Here, you can choose from: "anthropic", "cohere", "huggingface", and "openai". +provider = "openai" +API_KEY = "sk-..." # ADD YOUR KEY HERE + +# We support many different databases. Here, we load a simple and lightweight in-memory database. +document_store = InMemoryDocumentStore(use_bm25=True) + +# Download and add Game of Thrones TXT articles to Haystack DocumentStore. +# You can also provide a folder with your local documents. +add_example_data(document_store, "data/GoT_getting_started") + +# Build a pipeline with a Retriever to get relevant documents to the query and a PromptNode interacting with LLMs using a custom prompt. +pipeline = build_pipeline(provider, API_KEY, document_store) + +# Ask a question on the data you just added. +result = pipeline.run(query="Who is the father of Arya Stark?") + +# For details, like which documents were used to generate the answer, look into the object +print_answers(result, details="medium") +``` + +The output of the pipeline will reference the documents used to generate the answer: + +``` +'Query: Who is the father of Arya Stark?' +'Answers:' +[{'answer': 'The father of Arya Stark is Lord Eddard Stark of ' + 'Winterfell. [Document 1, Document 4, Document 5]'}] +``` + +Congratulations, you have just built your first Haystack app! ## Core Concepts @@ -106,7 +156,7 @@ Then move into the cloned folder and install the project with `pip`, including t cd haystack && pip install -e '.[dev]' ``` -If you want to contribute to the Haystack repo, check our [Contributor Guidelines](#💙-contributing) first. +If you want to contribute to the Haystack repo, check our [Contributor Guidelines](https://github.com/deepset-ai/haystack/blob/main/CONTRIBUTING.md) first. See the list of [dependencies](https://github.com/deepset-ai/haystack/blob/main/pyproject.toml) to check which ones you want to install (for example, `[all]`, `[dev]`, or other). 
diff --git a/annotation_tool/docker-compose.yml b/annotation_tool/docker-compose.yml index 044004ed19..c5522cf081 100644 --- a/annotation_tool/docker-compose.yml +++ b/annotation_tool/docker-compose.yml @@ -13,6 +13,7 @@ services: # DEFAULT_ADMIN_PASSWORD: "DEMO_PASSWORD" # COOKIE_KEYS: "somesafecookiekeys" # JWT_SECRET: "somesafesecret" + # DOMAIN_WHITELIST: "*" ports: - "7001:7001" links: diff --git a/e2e/preview/pipelines/test_extractive_qa_pipeline.py b/e2e/preview/pipelines/test_extractive_qa_pipeline.py index c25b73a057..b7ee11a0e5 100644 --- a/e2e/preview/pipelines/test_extractive_qa_pipeline.py +++ b/e2e/preview/pipelines/test_extractive_qa_pipeline.py @@ -1,25 +1,39 @@ +import json + from haystack.preview import Pipeline, Document from haystack.preview.document_stores import MemoryDocumentStore from haystack.preview.components.retrievers import MemoryBM25Retriever from haystack.preview.components.readers import ExtractiveReader -def test_extractive_qa_pipeline(): - document_store = MemoryDocumentStore() +def test_extractive_qa_pipeline(tmp_path): + # Create the pipeline + qa_pipeline = Pipeline() + qa_pipeline.add_component(instance=MemoryBM25Retriever(document_store=MemoryDocumentStore()), name="retriever") + qa_pipeline.add_component(instance=ExtractiveReader(model_name_or_path="deepset/tinyroberta-squad2"), name="reader") + qa_pipeline.connect("retriever", "reader") + + # Draw the pipeline + qa_pipeline.draw(tmp_path / "test_extractive_qa_pipeline.png") + + # Serialize the pipeline to JSON + with open(tmp_path / "test_bm25_rag_pipeline.json", "w") as f: + print(json.dumps(qa_pipeline.to_dict(), indent=4)) + json.dump(qa_pipeline.to_dict(), f) + # Load the pipeline back + with open(tmp_path / "test_bm25_rag_pipeline.json", "r") as f: + qa_pipeline = Pipeline.from_dict(json.load(f)) + + # Populate the document store documents = [ Document(text="My name is Jean and I live in Paris."), Document(text="My name is Mark and I live in Berlin."), Document(text="My name is Giorgio and I live in Rome."), ] + qa_pipeline.get_component("retriever").document_store.write_documents(documents) - document_store.write_documents(documents) - - qa_pipeline = Pipeline() - qa_pipeline.add_component(instance=MemoryBM25Retriever(document_store=document_store), name="retriever") - qa_pipeline.add_component(instance=ExtractiveReader(model_name_or_path="deepset/tinyroberta-squad2"), name="reader") - qa_pipeline.connect("retriever", "reader") - + # Query and assert questions = ["Who lives in Paris?", "Who lives in Berlin?", "Who lives in Rome?"] answers_spywords = ["Jean", "Mark", "Giorgio"] diff --git a/e2e/preview/pipelines/test_rag_pipelines.py b/e2e/preview/pipelines/test_rag_pipelines.py index 335cc9a2c2..719fbd8ed0 100644 --- a/e2e/preview/pipelines/test_rag_pipelines.py +++ b/e2e/preview/pipelines/test_rag_pipelines.py @@ -1,4 +1,5 @@ import os +import json import pytest from haystack.preview import Pipeline, Document @@ -15,15 +16,8 @@ not os.environ.get("OPENAI_API_KEY", None), reason="Export an env var called OPENAI_API_KEY containing the OpenAI API key to run this test.", ) -def test_bm25_rag_pipeline(): - document_store = MemoryDocumentStore() - - documents = [ - Document(text="My name is Jean and I live in Paris."), - Document(text="My name is Mark and I live in Berlin."), - Document(text="My name is Giorgio and I live in Rome."), - ] - +def test_bm25_rag_pipeline(tmp_path): + # Create the RAG pipeline prompt_template = """ Given these documents, answer the question.\nDocuments: {% for doc in 
documents %} @@ -33,11 +27,8 @@ def test_bm25_rag_pipeline(): \nQuestion: {{question}} \nAnswer: """ - - document_store.write_documents(documents) - rag_pipeline = Pipeline() - rag_pipeline.add_component(instance=MemoryBM25Retriever(document_store=document_store), name="retriever") + rag_pipeline.add_component(instance=MemoryBM25Retriever(document_store=MemoryDocumentStore()), name="retriever") rag_pipeline.add_component(instance=PromptBuilder(template=prompt_template), name="prompt_builder") rag_pipeline.add_component(instance=GPTGenerator(api_key=os.environ.get("OPENAI_API_KEY")), name="llm") rag_pipeline.add_component(instance=AnswerBuilder(), name="answer_builder") @@ -47,6 +38,26 @@ def test_bm25_rag_pipeline(): rag_pipeline.connect("llm.metadata", "answer_builder.metadata") rag_pipeline.connect("retriever", "answer_builder.documents") + # Draw the pipeline + rag_pipeline.draw(tmp_path / "test_bm25_rag_pipeline.png") + + # Serialize the pipeline to JSON + with open(tmp_path / "test_bm25_rag_pipeline.json", "w") as f: + json.dump(rag_pipeline.to_dict(), f) + + # Load the pipeline back + with open(tmp_path / "test_bm25_rag_pipeline.json", "r") as f: + rag_pipeline = Pipeline.from_dict(json.load(f)) + + # Populate the document store + documents = [ + Document(text="My name is Jean and I live in Paris."), + Document(text="My name is Mark and I live in Berlin."), + Document(text="My name is Giorgio and I live in Rome."), + ] + rag_pipeline.get_component("retriever").document_store.write_documents(documents) + + # Query and assert questions = ["Who lives in Paris?", "Who lives in Berlin?", "Who lives in Rome?"] answers_spywords = ["Jean", "Mark", "Giorgio"] @@ -71,15 +82,8 @@ def test_bm25_rag_pipeline(): not os.environ.get("OPENAI_API_KEY", None), reason="Export an env var called OPENAI_API_KEY containing the OpenAI API key to run this test.", ) -def test_embedding_retrieval_rag_pipeline(): - document_store = MemoryDocumentStore() - - documents = [ - Document(text="My name is Jean and I live in Paris."), - Document(text="My name is Mark and I live in Berlin."), - Document(text="My name is Giorgio and I live in Rome."), - ] - +def test_embedding_retrieval_rag_pipeline(tmp_path): + # Create the RAG pipeline prompt_template = """ Given these documents, answer the question.\nDocuments: {% for doc in documents %} @@ -89,22 +93,14 @@ def test_embedding_retrieval_rag_pipeline(): \nQuestion: {{question}} \nAnswer: """ - - indexing_pipeline = Pipeline() - indexing_pipeline.add_component( - instance=SentenceTransformersDocumentEmbedder(model_name_or_path="sentence-transformers/all-mpnet-base-v2"), - name="document_embedder", - ) - indexing_pipeline.add_component(instance=DocumentWriter(document_store=document_store), name="document_writer") - indexing_pipeline.connect("document_embedder", "document_writer") - indexing_pipeline.run({"document_embedder": {"documents": documents}}) - rag_pipeline = Pipeline() rag_pipeline.add_component( - instance=SentenceTransformersTextEmbedder(model_name_or_path="sentence-transformers/all-mpnet-base-v2"), + instance=SentenceTransformersTextEmbedder(model_name_or_path="sentence-transformers/all-MiniLM-L6-v2"), name="text_embedder", ) - rag_pipeline.add_component(instance=MemoryEmbeddingRetriever(document_store=document_store), name="retriever") + rag_pipeline.add_component( + instance=MemoryEmbeddingRetriever(document_store=MemoryDocumentStore()), name="retriever" + ) rag_pipeline.add_component(instance=PromptBuilder(template=prompt_template), name="prompt_builder") 
rag_pipeline.add_component(instance=GPTGenerator(api_key=os.environ.get("OPENAI_API_KEY")), name="llm") rag_pipeline.add_component(instance=AnswerBuilder(), name="answer_builder") @@ -115,6 +111,34 @@ def test_embedding_retrieval_rag_pipeline(): rag_pipeline.connect("llm.metadata", "answer_builder.metadata") rag_pipeline.connect("retriever", "answer_builder.documents") + # Draw the pipeline + rag_pipeline.draw(tmp_path / "test_embedding_rag_pipeline.png") + + # Serialize the pipeline to JSON + with open(tmp_path / "test_bm25_rag_pipeline.json", "w") as f: + json.dump(rag_pipeline.to_dict(), f) + + # Load the pipeline back + with open(tmp_path / "test_bm25_rag_pipeline.json", "r") as f: + rag_pipeline = Pipeline.from_dict(json.load(f)) + + # Populate the document store + documents = [ + Document(text="My name is Jean and I live in Paris."), + Document(text="My name is Mark and I live in Berlin."), + Document(text="My name is Giorgio and I live in Rome."), + ] + document_store = rag_pipeline.get_component("retriever").document_store + indexing_pipeline = Pipeline() + indexing_pipeline.add_component( + instance=SentenceTransformersDocumentEmbedder(model_name_or_path="sentence-transformers/all-MiniLM-L6-v2"), + name="document_embedder", + ) + indexing_pipeline.add_component(instance=DocumentWriter(document_store=document_store), name="document_writer") + indexing_pipeline.connect("document_embedder", "document_writer") + indexing_pipeline.run({"document_embedder": {"documents": documents}}) + + # Query and assert questions = ["Who lives in Paris?", "Who lives in Berlin?", "Who lives in Rome?"] answers_spywords = ["Jean", "Mark", "Giorgio"] @@ -129,7 +153,6 @@ def test_embedding_retrieval_rag_pipeline(): assert len(result["answer_builder"]["answers"]) == 1 generated_answer = result["answer_builder"]["answers"][0] - print(generated_answer) assert spyword in generated_answer.data assert generated_answer.query == question assert hasattr(generated_answer, "documents") diff --git a/haystack/nodes/answer_generator/openai.py b/haystack/nodes/answer_generator/openai.py index ae1ac362ed..6b8893c9e6 100644 --- a/haystack/nodes/answer_generator/openai.py +++ b/haystack/nodes/answer_generator/openai.py @@ -1,6 +1,7 @@ import logging import os from typing import List, Optional, Tuple, Union +import warnings from haystack import Document from haystack.environment import HAYSTACK_REMOTE_API_TIMEOUT_SEC @@ -21,6 +22,10 @@ class OpenAIAnswerGenerator(BaseGenerator): """ + This component is now deprecated and will be removed in future versions. + Use `PromptNode` instead of `OpenAIAnswerGenerator`, + as explained in https://haystack.deepset.ai/tutorials/22_pipeline_with_promptnode. + Uses the GPT-3 models from the OpenAI API to generate Answers based on the Documents it receives. The Documents can come from a Retriever or you can supply them manually. @@ -109,6 +114,12 @@ def __init__( :param openai_organization: The OpenAI-Organization ID, defaults to `None`. For more details, see see OpenAI [documentation](https://platform.openai.com/docs/api-reference/requesting-organization). """ + warnings.warn( + "`OpenAIAnswerGenerator component is deprecated and will be removed in future versions. 
Use `PromptNode` " + "instead of `OpenAIAnswerGenerator`.", + category=DeprecationWarning, + ) + super().__init__(progress_bar=progress_bar) if (examples is None and examples_context is not None) or (examples is not None and examples_context is None): logger.warning( diff --git a/haystack/nodes/prompt/invocation_layer/chatgpt.py b/haystack/nodes/prompt/invocation_layer/chatgpt.py index 371b86d6a8..f3e1a3ef6b 100644 --- a/haystack/nodes/prompt/invocation_layer/chatgpt.py +++ b/haystack/nodes/prompt/invocation_layer/chatgpt.py @@ -1,10 +1,17 @@ import logging -from typing import Optional, List, Dict, Union, Any +from typing import Any, Dict, List, Optional, Union from haystack.nodes.prompt.invocation_layer.handlers import DefaultTokenStreamingHandler, TokenStreamingHandler from haystack.nodes.prompt.invocation_layer.open_ai import OpenAIInvocationLayer from haystack.nodes.prompt.invocation_layer.utils import has_azure_parameters -from haystack.utils.openai_utils import openai_request, _check_openai_finish_reason, count_openai_tokens_messages +from haystack.utils.openai_utils import ( + _check_openai_finish_reason, + check_openai_async_policy_violation, + check_openai_policy_violation, + count_openai_tokens_messages, + openai_async_request, + openai_request, +) logger = logging.getLogger(__name__) @@ -43,45 +50,6 @@ def __init__( """ super().__init__(api_key, model_name_or_path, max_length, api_base=api_base, **kwargs) - def _execute_openai_request( - self, prompt: Union[str, List[Dict]], base_payload: Dict, kwargs_with_defaults: Dict, stream: bool - ): - """ - For more details, see [OpenAI ChatGPT API reference](https://platform.openai.com/docs/api-reference/chat). - """ - if isinstance(prompt, str): - messages = [{"role": "user", "content": prompt}] - elif isinstance(prompt, list) and len(prompt) > 0 and isinstance(prompt[0], dict): - messages = prompt - else: - raise ValueError( - f"The prompt format is different than what the model expects. " - f"The model {self.model_name_or_path} requires either a string or messages in the ChatML format. " - f"For more details, see this [GitHub discussion](https://github.com/openai/openai-python/blob/main/chatml.md)." 
- ) - extra_payload = {"messages": messages} - payload = {**base_payload, **extra_payload} - if not stream: - response = openai_request(url=self.url, headers=self.headers, payload=payload) - _check_openai_finish_reason(result=response, payload=payload) - assistant_response = [choice["message"]["content"].strip() for choice in response["choices"]] - else: - response = openai_request( - url=self.url, headers=self.headers, payload=payload, read_response=False, stream=True - ) - handler: TokenStreamingHandler = kwargs_with_defaults.pop("stream_handler", DefaultTokenStreamingHandler()) - assistant_response = self._process_streaming_response(response=response, stream_handler=handler) - - # Although ChatGPT generates text until stop words are encountered, unfortunately it includes the stop word - # We want to exclude it to be consistent with other invocation layers - if "stop" in kwargs_with_defaults and kwargs_with_defaults["stop"] is not None: - stop_words = kwargs_with_defaults["stop"] - for idx, _ in enumerate(assistant_response): - for stop_word in stop_words: - assistant_response[idx] = assistant_response[idx].replace(stop_word, "").strip() - - return assistant_response - def _extract_token(self, event_data: Dict[str, Any]): delta = event_data["choices"][0]["delta"] if "content" in delta: @@ -141,3 +109,109 @@ def supports(cls, model_name_or_path: str, **kwargs) -> bool: and not "gpt-3.5-turbo-instruct" in model_name_or_path ) return valid_model and not has_azure_parameters(**kwargs) + + async def ainvoke(self, *args, **kwargs): + """ + Invokes a prompt on the model. Based on the model, it takes in a prompt (or either a prompt or a list of messages) + and returns a list of responses using a REST invocation. + + :return: The responses are being returned. + + Note: Only kwargs relevant to OpenAI are passed to OpenAI rest API. Others kwargs are ignored. + For more details, see OpenAI [documentation](https://platform.openai.com/docs/api-reference/completions/create). + """ + prompt, base_payload, kwargs_with_defaults, stream, moderation = self._prepare_invoke(*args, **kwargs) + + if moderation and await check_openai_async_policy_violation(input=prompt, headers=self.headers): + logger.info("Prompt '%s' will not be sent to OpenAI due to potential policy violation.", prompt) + return [] + + if isinstance(prompt, str): + messages = [{"role": "user", "content": prompt}] + elif isinstance(prompt, list) and len(prompt) > 0 and isinstance(prompt[0], dict): + messages = prompt + else: + raise ValueError( + f"The prompt format is different than what the model expects. " + f"The model {self.model_name_or_path} requires either a string or messages in the ChatML format. " + f"For more details, see this [GitHub discussion](https://github.com/openai/openai-python/blob/main/chatml.md)." 
+ ) + extra_payload = {"messages": messages} + payload = {**base_payload, **extra_payload} + if not stream: + response = await openai_async_request(url=self.url, headers=self.headers, payload=payload) + _check_openai_finish_reason(result=response, payload=payload) + assistant_response = [choice["message"]["content"].strip() for choice in response["choices"]] + else: + response = await openai_async_request( + url=self.url, headers=self.headers, payload=payload, read_response=False, stream=True + ) + handler: TokenStreamingHandler = kwargs_with_defaults.pop("stream_handler", DefaultTokenStreamingHandler()) + assistant_response = self._process_streaming_response(response=response, stream_handler=handler) + + # Although ChatGPT generates text until stop words are encountered, unfortunately it includes the stop word + # We want to exclude it to be consistent with other invocation layers + if "stop" in kwargs_with_defaults and kwargs_with_defaults["stop"] is not None: + stop_words = kwargs_with_defaults["stop"] + for idx, _ in enumerate(assistant_response): + for stop_word in stop_words: + assistant_response[idx] = assistant_response[idx].replace(stop_word, "").strip() + + if moderation and await check_openai_async_policy_violation(input=assistant_response, headers=self.headers): + logger.info("Response '%s' will not be returned due to potential policy violation.", assistant_response) + return [] + + return assistant_response + + def invoke(self, *args, **kwargs): + """ + Invokes a prompt on the model. Based on the model, it takes in a prompt (or either a prompt or a list of messages) + and returns a list of responses using a REST invocation. + + :return: The responses are being returned. + + Note: Only kwargs relevant to OpenAI are passed to OpenAI rest API. Others kwargs are ignored. + For more details, see OpenAI [documentation](https://platform.openai.com/docs/api-reference/completions/create). + """ + prompt, base_payload, kwargs_with_defaults, stream, moderation = self._prepare_invoke(*args, **kwargs) + + if moderation and check_openai_policy_violation(input=prompt, headers=self.headers): + logger.info("Prompt '%s' will not be sent to OpenAI due to potential policy violation.", prompt) + return [] + + if isinstance(prompt, str): + messages = [{"role": "user", "content": prompt}] + elif isinstance(prompt, list) and len(prompt) > 0 and isinstance(prompt[0], dict): + messages = prompt + else: + raise ValueError( + f"The prompt format is different than what the model expects. " + f"The model {self.model_name_or_path} requires either a string or messages in the ChatML format. " + f"For more details, see this [GitHub discussion](https://github.com/openai/openai-python/blob/main/chatml.md)." 
+ ) + extra_payload = {"messages": messages} + payload = {**base_payload, **extra_payload} + if not stream: + response = openai_request(url=self.url, headers=self.headers, payload=payload) + _check_openai_finish_reason(result=response, payload=payload) + assistant_response = [choice["message"]["content"].strip() for choice in response["choices"]] + else: + response = openai_request( + url=self.url, headers=self.headers, payload=payload, read_response=False, stream=True + ) + handler: TokenStreamingHandler = kwargs_with_defaults.pop("stream_handler", DefaultTokenStreamingHandler()) + assistant_response = self._process_streaming_response(response=response, stream_handler=handler) + + # Although ChatGPT generates text until stop words are encountered, unfortunately it includes the stop word + # We want to exclude it to be consistent with other invocation layers + if "stop" in kwargs_with_defaults and kwargs_with_defaults["stop"] is not None: + stop_words = kwargs_with_defaults["stop"] + for idx, _ in enumerate(assistant_response): + for stop_word in stop_words: + assistant_response[idx] = assistant_response[idx].replace(stop_word, "").strip() + + if moderation and check_openai_policy_violation(input=assistant_response, headers=self.headers): + logger.info("Response '%s' will not be returned due to potential policy violation.", assistant_response) + return [] + + return assistant_response diff --git a/haystack/nodes/prompt/invocation_layer/open_ai.py b/haystack/nodes/prompt/invocation_layer/open_ai.py index 759fd7a925..85bae198b0 100644 --- a/haystack/nodes/prompt/invocation_layer/open_ai.py +++ b/haystack/nodes/prompt/invocation_layer/open_ai.py @@ -7,11 +7,13 @@ from haystack.errors import OpenAIError from haystack.nodes.prompt.invocation_layer.utils import has_azure_parameters from haystack.utils.openai_utils import ( - openai_request, _openai_text_completion_tokenization_details, load_openai_tokenizer, _check_openai_finish_reason, + check_openai_async_policy_violation, check_openai_policy_violation, + openai_async_request, + openai_request, ) from haystack.nodes.prompt.invocation_layer.base import PromptModelInvocationLayer from haystack.nodes.prompt.invocation_layer.handlers import TokenStreamingHandler, DefaultTokenStreamingHandler @@ -112,17 +114,13 @@ def headers(self) -> Dict[str, str]: headers["OpenAI-Organization"] = self.openai_organization return headers - def invoke(self, *args, **kwargs): - """ - Invokes a prompt on the model. Based on the model, it takes in a prompt (or either a prompt or a list of messages) - and returns a list of responses using a REST invocation. - - :return: The responses are being returned. - - Note: Only kwargs relevant to OpenAI are passed to OpenAI rest API. Others kwargs are ignored. - For more details, see OpenAI [documentation](https://platform.openai.com/docs/api-reference/completions/create). - """ + def _prepare_invoke(self, *args, **kwargs): prompt = kwargs.get("prompt") + if not prompt: + raise ValueError( + f"No prompt provided. Model {self.model_name_or_path} requires prompt." + f"Make sure to provide prompt in kwargs." 
+ ) # either stream is True (will use default handler) or stream_handler is provided kwargs_with_defaults = self.model_input_kwargs if kwargs: @@ -150,23 +148,59 @@ def invoke(self, *args, **kwargs): "frequency_penalty": kwargs_with_defaults.get("frequency_penalty", 0), "logit_bias": kwargs_with_defaults.get("logit_bias", {}), } + + return (prompt, base_payload, kwargs_with_defaults, stream, moderation) + + def invoke(self, *args, **kwargs): + """ + Invokes a prompt on the model. Based on the model, it takes in a prompt (or either a prompt or a list of messages) + and returns a list of responses using a REST invocation. + + :return: The responses are being returned. + + Note: Only kwargs relevant to OpenAI are passed to OpenAI rest API. Others kwargs are ignored. + For more details, see OpenAI [documentation](https://platform.openai.com/docs/api-reference/completions/create). + """ + prompt, base_payload, kwargs_with_defaults, stream, moderation = self._prepare_invoke(*args, **kwargs) + if moderation and check_openai_policy_violation(input=prompt, headers=self.headers): logger.info("Prompt '%s' will not be sent to OpenAI due to potential policy violation.", prompt) return [] - responses = self._execute_openai_request( - prompt=prompt, base_payload=base_payload, kwargs_with_defaults=kwargs_with_defaults, stream=stream - ) + + extra_payload = { + "prompt": prompt, + "suffix": kwargs_with_defaults.get("suffix", None), + "logprobs": kwargs_with_defaults.get("logprobs", None), + "echo": kwargs_with_defaults.get("echo", False), + "best_of": kwargs_with_defaults.get("best_of", 1), + } + payload = {**base_payload, **extra_payload} + if not stream: + res = openai_request(url=self.url, headers=self.headers, payload=payload) + _check_openai_finish_reason(result=res, payload=payload) + responses = [ans["text"].strip() for ans in res["choices"]] + else: + response = openai_request( + url=self.url, headers=self.headers, payload=payload, read_response=False, stream=True + ) + handler: TokenStreamingHandler = kwargs_with_defaults.pop("stream_handler", DefaultTokenStreamingHandler()) + responses = self._process_streaming_response(response=response, stream_handler=handler) + if moderation and check_openai_policy_violation(input=responses, headers=self.headers): logger.info("Response '%s' will not be returned due to potential policy violation.", responses) return [] + return responses - def _execute_openai_request(self, prompt: str, base_payload: Dict, kwargs_with_defaults: Dict, stream: bool): - if not prompt: - raise ValueError( - f"No prompt provided. Model {self.model_name_or_path} requires prompt." - f"Make sure to provide prompt in kwargs." - ) + async def ainvoke(self, *args, **kwargs): + """ + asyncio version of the `invoke` method. 
+ """ + prompt, base_payload, kwargs_with_defaults, stream, moderation = self._prepare_invoke(*args, **kwargs) + if moderation and await check_openai_async_policy_violation(input=prompt, headers=self.headers): + logger.info("Prompt '%s' will not be sent to OpenAI due to potential policy violation.", prompt) + return [] + extra_payload = { "prompt": prompt, "suffix": kwargs_with_defaults.get("suffix", None), @@ -176,16 +210,21 @@ def _execute_openai_request(self, prompt: str, base_payload: Dict, kwargs_with_d } payload = {**base_payload, **extra_payload} if not stream: - res = openai_request(url=self.url, headers=self.headers, payload=payload) + res = await openai_async_request(url=self.url, headers=self.headers, payload=payload) _check_openai_finish_reason(result=res, payload=payload) responses = [ans["text"].strip() for ans in res["choices"]] - return responses else: - response = openai_request( + response = await openai_async_request( url=self.url, headers=self.headers, payload=payload, read_response=False, stream=True ) handler: TokenStreamingHandler = kwargs_with_defaults.pop("stream_handler", DefaultTokenStreamingHandler()) - return self._process_streaming_response(response=response, stream_handler=handler) + responses = self._process_streaming_response(response=response, stream_handler=handler) + + if moderation and await check_openai_async_policy_violation(input=responses, headers=self.headers): + logger.info("Response '%s' will not be returned due to potential policy violation.", responses) + return [] + + return responses def _process_streaming_response(self, response, stream_handler: TokenStreamingHandler): client = sseclient.SSEClient(response) diff --git a/haystack/nodes/prompt/prompt_model.py b/haystack/nodes/prompt/prompt_model.py index f8a406004e..100a2b46ea 100644 --- a/haystack/nodes/prompt/prompt_model.py +++ b/haystack/nodes/prompt/prompt_model.py @@ -111,6 +111,16 @@ def invoke(self, prompt: Union[str, List[str], List[Dict[str, str]]], **kwargs) output = self.model_invocation_layer.invoke(prompt=prompt, **kwargs) return output + async def ainvoke(self, prompt: Union[str, List[str], List[Dict[str, str]]], **kwargs) -> List[str]: + """ + Drop-in replacement asyncio version of the `invoke` method, see there for documentation. + """ + if hasattr(self.model_invocation_layer, "ainvoke"): + return await self.model_invocation_layer.ainvoke(prompt=prompt, **kwargs) + + # The underlying invocation layer doesn't support asyncio + return self.model_invocation_layer.invoke(prompt=prompt, **kwargs) + @overload def _ensure_token_limit(self, prompt: str) -> str: ... diff --git a/haystack/nodes/prompt/prompt_node.py b/haystack/nodes/prompt/prompt_node.py index 060bec69c5..2360d5b44c 100644 --- a/haystack/nodes/prompt/prompt_node.py +++ b/haystack/nodes/prompt/prompt_node.py @@ -232,6 +232,37 @@ def prompt_template_params(self, prompt_template: str) -> List[str]: return list(template.prompt_params) return [] + def _prepare( # type: ignore + self, query, file_paths, labels, documents, meta, invocation_context, prompt_template, generation_kwargs + ) -> Dict: + """ + Prepare prompt invocation. 
+ """ + invocation_context = invocation_context or {} + + if query and "query" not in invocation_context: + invocation_context["query"] = query + + if file_paths and "file_paths" not in invocation_context: + invocation_context["file_paths"] = file_paths + + if labels and "labels" not in invocation_context: + invocation_context["labels"] = labels + + if documents and "documents" not in invocation_context: + invocation_context["documents"] = documents + + if meta and "meta" not in invocation_context: + invocation_context["meta"] = meta + + if "prompt_template" not in invocation_context: + invocation_context["prompt_template"] = self.get_prompt_template(prompt_template) + + if generation_kwargs: + invocation_context.update(generation_kwargs) + + return invocation_context + def run( self, query: Optional[str] = None, @@ -272,29 +303,86 @@ def run( # so that they can be returned by `run()` as part of the pipeline's debug output. prompt_collector: List[str] = [] - invocation_context = invocation_context or {} - if query and "query" not in invocation_context.keys(): - invocation_context["query"] = query + invocation_context = self._prepare( + query, file_paths, labels, documents, meta, invocation_context, prompt_template, generation_kwargs + ) - if file_paths and "file_paths" not in invocation_context.keys(): - invocation_context["file_paths"] = file_paths + results = self(**invocation_context, prompt_collector=prompt_collector) - if labels and "labels" not in invocation_context.keys(): - invocation_context["labels"] = labels + prompt_template_resolved: PromptTemplate = invocation_context.pop("prompt_template") - if documents and "documents" not in invocation_context.keys(): - invocation_context["documents"] = documents + try: + output_variable = self.output_variable or prompt_template_resolved.output_variable or "results" + except: + output_variable = "results" - if meta and "meta" not in invocation_context.keys(): - invocation_context["meta"] = meta + invocation_context[output_variable] = results + invocation_context["prompts"] = prompt_collector + final_result: Dict[str, Any] = {output_variable: results, "invocation_context": invocation_context} - if "prompt_template" not in invocation_context.keys(): - invocation_context["prompt_template"] = self.get_prompt_template(prompt_template) + if self.debug: + final_result["_debug"] = {"prompts_used": prompt_collector} - if generation_kwargs: - invocation_context.update(generation_kwargs) + return final_result, "output_1" + + async def _aprompt(self, prompt_template: Optional[Union[str, PromptTemplate]], *args, **kwargs): + """ + Async version of the actual prompt invocation. 
+ """ + results = [] + # we pop the prompt_collector kwarg to avoid passing it to the model + prompt_collector: List[Union[str, List[Dict[str, str]]]] = kwargs.pop("prompt_collector", []) + + # kwargs override model kwargs + kwargs = {**self._prepare_model_kwargs(), **kwargs} + template_to_fill = self.get_prompt_template(prompt_template) + if template_to_fill: + # prompt template used, yield prompts from inputs args + for prompt in template_to_fill.fill(*args, **kwargs): + kwargs_copy = template_to_fill.remove_template_params(copy.copy(kwargs)) + # and pass the prepared prompt and kwargs copy to the model + prompt = self.prompt_model._ensure_token_limit(prompt) + prompt_collector.append(prompt) + logger.debug("Prompt being sent to LLM with prompt %s and kwargs %s", prompt, kwargs_copy) + output = await self.prompt_model.ainvoke(prompt, **kwargs_copy) + results.extend(output) + + kwargs["prompts"] = prompt_collector + results = template_to_fill.post_process(results, **kwargs) + else: + # straightforward prompt, no templates used + for prompt in list(args): + kwargs_copy = copy.copy(kwargs) + prompt = self.prompt_model._ensure_token_limit(prompt) + prompt_collector.append(prompt) + logger.debug("Prompt being sent to LLM with prompt %s and kwargs %s ", prompt, kwargs_copy) + output = await self.prompt_model.ainvoke(prompt, **kwargs_copy) + results.extend(output) + return results + + async def arun( + self, + query: Optional[str] = None, + file_paths: Optional[List[str]] = None, + labels: Optional[MultiLabel] = None, + documents: Optional[List[Document]] = None, + meta: Optional[dict] = None, + invocation_context: Optional[Dict[str, Any]] = None, + prompt_template: Optional[Union[str, PromptTemplate]] = None, + generation_kwargs: Optional[Dict[str, Any]] = None, + ) -> Tuple[Dict, str]: + """ + Drop-in replacement asyncio version of the `run` method, see there for documentation. + """ + prompt_collector: List[str] = [] + + invocation_context = self._prepare( + query, file_paths, labels, documents, meta, invocation_context, prompt_template, generation_kwargs + ) - results = self(prompt_collector=prompt_collector, **invocation_context) + # Let's skip the call to __call__, because all it does is injecting a prompt template + # if there isn't any, while we know for sure it'll be in `invocation_context`. + results = await self._aprompt(prompt_collector=prompt_collector, **invocation_context) prompt_template_resolved: PromptTemplate = invocation_context.pop("prompt_template") diff --git a/haystack/preview/README.md b/haystack/preview/README.md index 99fe7eb3e3..1dcd4c351b 100644 --- a/haystack/preview/README.md +++ b/haystack/preview/README.md @@ -23,5 +23,12 @@ pip install farm-haystack ``` The `farm-haystack` package includes all new features of Haystack 2.0. Note that updates to this package occur less frequently compared to `haystack-ai`. So, you might not get the all latest Haystack 2.0 features immediately when using `farm-haystack`. 
+## 🚗 Getting Started + +In our **end 2 end tests** you can find example code for the following pipelines: +- [RAG pipeline](https://github.com/deepset-ai/haystack/blob/main/e2e/preview/pipelines/test_rag_pipelines.py) +- [Extractive QA pipeline](https://github.com/deepset-ai/haystack/blob/main/e2e/preview/pipelines/test_extractive_qa_pipeline.py) +- more to come, check out the [folder](https://github.com/deepset-ai/haystack/blob/main/e2e/preview/) + ## 💙 Stay Updated To learn how and when components will be migrated to the new major version, have a look at the [Migrate Components to Pipeline v2](https://github.com/deepset-ai/haystack/issues/5265) roadmap item, where we keep track of issues and PRs about Haystack 2.0. When you have questions, you can always contact us using the [Shaping Haystack 2.0](https://github.com/deepset-ai/haystack/discussions/5568) discussion or [Haystack Discord server](https://discord.com/channels/993534733298450452/1141683185458094211). diff --git a/haystack/preview/components/builders/answer_builder.py b/haystack/preview/components/builders/answer_builder.py index af644d22a0..201d43b9c7 100644 --- a/haystack/preview/components/builders/answer_builder.py +++ b/haystack/preview/components/builders/answer_builder.py @@ -42,7 +42,7 @@ def run( self, query: str, replies: List[str], - metadata: List[Dict[str, Any]], + metadata: Optional[List[Dict[str, Any]]] = None, documents: Optional[List[Document]] = None, pattern: Optional[str] = None, reference_pattern: Optional[str] = None, @@ -52,7 +52,8 @@ def run( :param query: The query used in the prompts for the Generator. A strings. :param replies: The output of the Generator. A list of strings. - :param metadata: The metadata returned by the Generator. A list of dictionaries. + :param metadata: The metadata returned by the Generator. An optional list of dictionaries. If not specified, + the generated answer will contain no metadata. :param documents: The documents used as input to the Generator. A list of `Document` objects. If `documents` are specified, they are added to the `Answer` objects. If both `documents` and `reference_pattern` are specified, the documents referenced in the @@ -73,8 +74,11 @@ def run( If not specified, no parsing is done, and all documents are referenced. Default: `None`. 
""" - if len(replies) != len(metadata): + if not metadata: + metadata = [{}] * len(replies) + elif len(replies) != len(metadata): raise ValueError(f"Number of replies ({len(replies)}), and metadata ({len(metadata)}) must match.") + if pattern: AnswerBuilder._check_num_groups_in_regex(pattern) diff --git a/haystack/preview/components/classifiers/__init__.py b/haystack/preview/components/classifiers/__init__.py deleted file mode 100644 index 208ec92cbe..0000000000 --- a/haystack/preview/components/classifiers/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from haystack.preview.components.classifiers.file_classifier import FileExtensionClassifier - -__all__ = ["FileExtensionClassifier"] diff --git a/haystack/preview/components/embedders/__init__.py b/haystack/preview/components/embedders/__init__.py index de8b93958f..a0840d7e0a 100644 --- a/haystack/preview/components/embedders/__init__.py +++ b/haystack/preview/components/embedders/__init__.py @@ -2,5 +2,12 @@ from haystack.preview.components.embedders.sentence_transformers_document_embedder import ( SentenceTransformersDocumentEmbedder, ) +from haystack.preview.components.embedders.openai_document_embedder import OpenAIDocumentEmbedder +from haystack.preview.components.embedders.openai_text_embedder import OpenAITextEmbedder -__all__ = ["SentenceTransformersTextEmbedder", "SentenceTransformersDocumentEmbedder"] +__all__ = [ + "SentenceTransformersTextEmbedder", + "SentenceTransformersDocumentEmbedder", + "OpenAITextEmbedder", + "OpenAIDocumentEmbedder", +] diff --git a/haystack/preview/components/embedders/openai_document_embedder.py b/haystack/preview/components/embedders/openai_document_embedder.py new file mode 100644 index 0000000000..a2e51530bd --- /dev/null +++ b/haystack/preview/components/embedders/openai_document_embedder.py @@ -0,0 +1,164 @@ +from typing import List, Optional, Dict, Any, Tuple +import os + +import openai +from tqdm import tqdm + + +from haystack.preview import component, Document, default_to_dict, default_from_dict + + +@component +class OpenAIDocumentEmbedder: + """ + A component for computing Document embeddings using OpenAI models. + The embedding of each Document is stored in the `embedding` field of the Document. + """ + + def __init__( + self, + api_key: Optional[str] = None, + model_name: str = "text-embedding-ada-002", + organization: Optional[str] = None, + prefix: str = "", + suffix: str = "", + batch_size: int = 32, + progress_bar: bool = True, + metadata_fields_to_embed: Optional[List[str]] = None, + embedding_separator: str = "\n", + ): + """ + Create a OpenAIDocumentEmbedder component. + :param api_key: The OpenAI API key. It can be explicitly provided or automatically read from the + environment variable OPENAI_API_KEY (recommended). + :param model_name: The name of the model to use. + :param api_base_url: The OpenAI API Base url, defaults to `https://api.openai.com/v1`. + :param organization: The OpenAI-Organization ID, defaults to `None`. For more details, see OpenAI + [documentation](https://platform.openai.com/docs/api-reference/requesting-organization). + :param prefix: A string to add to the beginning of each text. + :param suffix: A string to add to the end of each text. + :param batch_size: Number of Documents to encode at once. + :param progress_bar: Whether to show a progress bar or not. Can be helpful to disable in production deployments + to keep the logs clean. + :param metadata_fields_to_embed: List of meta fields that should be embedded along with the Document text. 
+ :param embedding_separator: Separator used to concatenate the meta fields to the Document text. + """ + + if api_key is None: + try: + api_key = os.environ["OPENAI_API_KEY"] + except KeyError as e: + raise ValueError( + "OpenAIDocumentEmbedder expects an OpenAI API key. " + "Set the OPENAI_API_KEY environment variable (recommended) or pass it explicitly." + ) from e + + self.model_name = model_name + self.organization = organization + self.prefix = prefix + self.suffix = suffix + self.batch_size = batch_size + self.progress_bar = progress_bar + self.metadata_fields_to_embed = metadata_fields_to_embed or [] + self.embedding_separator = embedding_separator + + openai.api_key = api_key + if organization is not None: + openai.organization = organization + + def to_dict(self) -> Dict[str, Any]: + """ + This method overrides the default serializer in order to avoid leaking the `api_key` value passed + to the constructor. + """ + return default_to_dict( + self, + model_name=self.model_name, + organization=self.organization, + prefix=self.prefix, + suffix=self.suffix, + batch_size=self.batch_size, + progress_bar=self.progress_bar, + metadata_fields_to_embed=self.metadata_fields_to_embed, + embedding_separator=self.embedding_separator, + ) + + @classmethod + def from_dict(cls, data: Dict[str, Any]) -> "OpenAIDocumentEmbedder": + """ + Deserialize this component from a dictionary. + """ + return default_from_dict(cls, data) + + def _prepare_texts_to_embed(self, documents: List[Document]) -> List[str]: + """ + Prepare the texts to embed by concatenating the Document text with the metadata fields to embed. + """ + texts_to_embed = [] + for doc in documents: + meta_values_to_embed = [ + str(doc.metadata[key]) + for key in self.metadata_fields_to_embed + if key in doc.metadata and doc.metadata[key] is not None + ] + + text_to_embed = ( + self.prefix + self.embedding_separator.join(meta_values_to_embed + [doc.text or ""]) + self.suffix + ) + + # copied from OpenAI embedding_utils (https://github.com/openai/openai-python/blob/main/openai/embeddings_utils.py) + # replace newlines, which can negatively affect performance. + text_to_embed = text_to_embed.replace("\n", " ") + texts_to_embed.append(text_to_embed) + return texts_to_embed + + def _embed_batch(self, texts_to_embed: List[str], batch_size: int) -> Tuple[List[str], Dict[str, Any]]: + """ + Embed a list of texts in batches. + """ + + all_embeddings = [] + metadata = {} + for i in tqdm( + range(0, len(texts_to_embed), batch_size), disable=not self.progress_bar, desc="Calculating embeddings" + ): + batch = texts_to_embed[i : i + batch_size] + response = openai.Embedding.create(model=self.model_name, input=batch) + embeddings = [el["embedding"] for el in response.data] + all_embeddings.extend(embeddings) + + if "model" not in metadata: + metadata["model"] = response.model + if "usage" not in metadata: + metadata["usage"] = dict(response.usage.items()) + else: + metadata["usage"]["prompt_tokens"] += response.usage.prompt_tokens + metadata["usage"]["total_tokens"] += response.usage.total_tokens + + return all_embeddings, metadata + + @component.output_types(documents=List[Document], metadata=Dict[str, Any]) + def run(self, documents: List[Document]): + """ + Embed a list of Documents. + The embedding of each Document is stored in the `embedding` field of the Document. + + :param documents: A list of Documents to embed. 
+ """ + if not isinstance(documents, list) or documents and not isinstance(documents[0], Document): + raise TypeError( + "OpenAIDocumentEmbedder expects a list of Documents as input." + "In case you want to embed a string, please use the OpenAITextEmbedder." + ) + + texts_to_embed = self._prepare_texts_to_embed(documents=documents) + + embeddings, metadata = self._embed_batch(texts_to_embed=texts_to_embed, batch_size=self.batch_size) + + documents_with_embeddings = [] + for doc, emb in zip(documents, embeddings): + doc_as_dict = doc.to_dict() + doc_as_dict["embedding"] = emb + documents_with_embeddings.append(Document.from_dict(doc_as_dict)) + + return {"documents": documents_with_embeddings, "metadata": metadata} diff --git a/haystack/preview/components/file_converters/__init__.py b/haystack/preview/components/file_converters/__init__.py index 08f7a46132..ac75560757 100644 --- a/haystack/preview/components/file_converters/__init__.py +++ b/haystack/preview/components/file_converters/__init__.py @@ -1,5 +1,13 @@ from haystack.preview.components.file_converters.txt import TextFileToDocument from haystack.preview.components.file_converters.tika import TikaDocumentConverter from haystack.preview.components.file_converters.azure import AzureOCRDocumentConverter +from haystack.preview.components.file_converters.pypdf import PyPDFToDocument +from haystack.preview.components.file_converters.html import HTMLToDocument -__all__ = ["TextFileToDocument", "TikaDocumentConverter", "AzureOCRDocumentConverter"] +__all__ = [ + "TextFileToDocument", + "TikaDocumentConverter", + "AzureOCRDocumentConverter", + "PyPDFToDocument", + "HTMLToDocument", +] diff --git a/haystack/preview/components/file_converters/html.py b/haystack/preview/components/file_converters/html.py new file mode 100644 index 0000000000..3cb7c03377 --- /dev/null +++ b/haystack/preview/components/file_converters/html.py @@ -0,0 +1,70 @@ +import logging +from typing import List, Optional, Dict, Any, Union +from pathlib import Path + +from haystack.preview.lazy_imports import LazyImport +from haystack.preview import Document, component, default_to_dict, default_from_dict + +with LazyImport("Run 'pip install boilerpy3'") as boilerpy3_import: + from boilerpy3 import extractors + + +logger = logging.getLogger(__name__) + + +@component +class HTMLToDocument: + """ + A component for converting an HTML file to a Document. + """ + + def __init__(self, id_hash_keys: Optional[List[str]] = None): + """ + Create a HTMLToDocument component. + + :param id_hash_keys: Generate the Document ID from a custom list of strings that refer to the Document's + attributes. Default: `None` + """ + boilerpy3_import.check() + self.id_hash_keys = id_hash_keys or [] + + def to_dict(self) -> Dict[str, Any]: + """ + Serialize this component to a dictionary. + """ + return default_to_dict(self, id_hash_keys=self.id_hash_keys) + + @classmethod + def from_dict(cls, data: Dict[str, Any]) -> "HTMLToDocument": + """ + Deserialize this component from a dictionary. + """ + return default_from_dict(cls, data) + + @component.output_types(documents=List[Document]) + def run(self, paths: List[Union[str, Path]]): + """ + Convert HTML files to Documents. + + :param paths: A list of paths to HTML files. + :return: A list of Documents. + """ + documents = [] + extractor = extractors.ArticleExtractor(raise_on_failure=False) + for path in paths: + try: + file_content = extractor.read_from_file(path) + except Exception as e: + logger.warning("Could not read file %s. Skipping it. 
Error message: %s", path, e) + continue + # although raise_on_failure is set to False, the extractor can still raise an exception + try: + text = extractor.get_content(file_content) + except Exception as conversion_e: + logger.warning("Could not extract raw txt from %s. Skipping it. Error message: %s", path, conversion_e) + continue + + document = Document(text=text, id_hash_keys=self.id_hash_keys) + documents.append(document) + + return {"documents": documents} diff --git a/haystack/preview/components/generators/openai/gpt.py b/haystack/preview/components/generators/openai/gpt.py index 2aeb29f4e3..e5618f738e 100644 --- a/haystack/preview/components/generators/openai/gpt.py +++ b/haystack/preview/components/generators/openai/gpt.py @@ -121,7 +121,7 @@ def from_dict(cls, data: Dict[str, Any]) -> "GPTGenerator": """ init_params = data.get("init_parameters", {}) streaming_callback = None - if "streaming_callback" in init_params: + if "streaming_callback" in init_params and init_params["streaming_callback"]: parts = init_params["streaming_callback"].split(".") module_name = ".".join(parts[:-1]) function_name = parts[-1] diff --git a/haystack/preview/components/rankers/__init__.py b/haystack/preview/components/rankers/__init__.py new file mode 100644 index 0000000000..27337481eb --- /dev/null +++ b/haystack/preview/components/rankers/__init__.py @@ -0,0 +1,3 @@ +from haystack.preview.components.rankers.similarity import SimilarityRanker + +__all__ = ["SimilarityRanker"] diff --git a/haystack/preview/components/rankers/similarity.py b/haystack/preview/components/rankers/similarity.py new file mode 100644 index 0000000000..7abb370954 --- /dev/null +++ b/haystack/preview/components/rankers/similarity.py @@ -0,0 +1,108 @@ +import logging +from pathlib import Path +from typing import List, Union, Dict, Any + +from haystack.preview import ComponentError, Document, component, default_from_dict, default_to_dict +from haystack.preview.lazy_imports import LazyImport + +logger = logging.getLogger(__name__) + + +with LazyImport(message="Run 'pip install transformers[torch,sentencepiece]==4.32.1'") as torch_and_transformers_import: + import torch + from transformers import AutoModelForSequenceClassification, AutoTokenizer + + +@component +class SimilarityRanker: + """ + Ranks documents based on query similarity. + + Usage example: + ``` + from haystack.preview import Document + from haystack.preview.components.rankers import SimilarityRanker + + sampler = SimilarityRanker() + docs = [Document(text="Paris"), Document(text="Berlin")] + query = "City in Germany" + output = sampler.run(query=query, documents=docs) + docs = output["documents"] + assert len(docs) == 2 + assert docs[0].text == "Berlin" + ``` + """ + + def __init__( + self, model_name_or_path: Union[str, Path] = "cross-encoder/ms-marco-MiniLM-L-6-v2", device: str = "cpu" + ): + """ + Creates an instance of SimilarityRanker. + + :param model_name_or_path: Path to a pre-trained sentence-transformers model. + :param device: torch device (for example, cuda:0, cpu, mps) to limit model inference to a specific device. + """ + torch_and_transformers_import.check() + + self.model_name_or_path = model_name_or_path + self.device = device + self.model = None + self.tokenizer = None + + def warm_up(self): + """ + Warm up the model and tokenizer used in scoring the documents. 
+ """ + if self.model_name_or_path and not self.model: + self.model = AutoModelForSequenceClassification.from_pretrained(self.model_name_or_path) + self.model = self.model.to(self.device) + self.model.eval() + self.tokenizer = AutoTokenizer.from_pretrained(self.model_name_or_path) + + def to_dict(self) -> Dict[str, Any]: + """ + Serialize this component to a dictionary. + """ + return default_to_dict(self, device=self.device, model_name_or_path=self.model_name_or_path) + + @classmethod + def from_dict(cls, data: Dict[str, Any]) -> "SimilarityRanker": + """ + Deserialize this component from a dictionary. + """ + return default_from_dict(cls, data) + + @component.output_types(documents=List[Document]) + def run(self, query: str, documents: List[Document]): + """ + Returns a list of documents ranked by their similarity to the given query + + :param query: Query string. + :param documents: List of Documents. + :return: List of Documents sorted by (desc.) similarity with the query. + """ + if not documents: + return {"documents": []} + + # If a model path is provided but the model isn't loaded + if self.model_name_or_path and not self.model: + raise ComponentError( + f"The component {self.__class__.__name__} not warmed up. Run 'warm_up()' before calling 'run()'." + ) + + query_doc_pairs = [[query, doc.text] for doc in documents] + features = self.tokenizer( + query_doc_pairs, padding=True, truncation=True, return_tensors="pt" + ).to( # type: ignore + self.device + ) + with torch.inference_mode(): + similarity_scores = self.model(**features).logits.squeeze() # type: ignore + + _, sorted_indices = torch.sort(similarity_scores, descending=True) + ranked_docs = [] + for sorted_index_tensor in sorted_indices: + i = sorted_index_tensor.item() + documents[i].score = similarity_scores[i].item() + ranked_docs.append(documents[i]) + return {"documents": ranked_docs} diff --git a/haystack/preview/components/samplers/__init__.py b/haystack/preview/components/samplers/__init__.py new file mode 100644 index 0000000000..cab0e878e8 --- /dev/null +++ b/haystack/preview/components/samplers/__init__.py @@ -0,0 +1,3 @@ +from haystack.preview.components.samplers.top_p import TopPSampler + +__all__ = ["TopPSampler"] diff --git a/haystack/preview/components/samplers/top_p.py b/haystack/preview/components/samplers/top_p.py new file mode 100644 index 0000000000..f0c9d6d992 --- /dev/null +++ b/haystack/preview/components/samplers/top_p.py @@ -0,0 +1,140 @@ +import logging +from typing import List, Optional, Dict, Any + +from haystack.preview import ComponentError, Document, component, default_from_dict, default_to_dict +from haystack.preview.lazy_imports import LazyImport + +logger = logging.getLogger(__name__) + + +with LazyImport(message="Run 'pip install torch>=1.13'") as torch_import: + import torch + + +@component +class TopPSampler: + """ + Filters documents using top-p (nucleus) sampling based on their similarity scores' cumulative probability. 
+ + Usage example: + + ```python + from haystack.preview import Document + from haystack.preview.components.samplers import TopPSampler + + sampler = TopPSampler(top_p=0.95) + docs = [ + Document(text="Berlin", metadata={"similarity_score": -10.6}), + Document(text="Belgrade", metadata={"similarity_score": -8.9}), + Document(text="Sarajevo", metadata={"similarity_score": -4.6}), + ] + output = sampler.run(documents=docs) + docs = output["documents"] + assert len(docs) == 1 + assert docs[0].text == "Sarajevo" + ``` + """ + + def __init__(self, top_p: float = 1.0, score_field: Optional[str] = None): + """ + Creates an instance of TopPSampler. + + :param top_p: Cumulative probability threshold (usually between 0.9 and 0.99). + :param score_field: Field name in a document's metadata containing the scores. Defaults to the Document score + if not provided. + """ + torch_import.check() + + self.top_p = top_p + self.score_field = score_field + + def to_dict(self) -> Dict[str, Any]: + """ + Serialize this component to a dictionary. + """ + return default_to_dict(self, top_p=self.top_p, score_field=self.score_field) + + @classmethod + def from_dict(cls, data: Dict[str, Any]) -> "TopPSampler": + """ + Deserialize this component from a dictionary. + """ + return default_from_dict(cls, data) + + @component.output_types(documents=List[Document]) + def run(self, documents: List[Document], top_p: Optional[float] = None): + """ + Filter documents based on their similarity scores using top-p sampling. + + :param documents: List of Documents to filter. + :param top_p: Cumulative probability threshold. Defaults to the value set during initialization or 1.0 + if not set. + :return: List of filtered Documents. + """ + if not documents: + return {"documents": []} + + top_p = top_p or self.top_p or 1.0 # default to 1.0 if both are None + + if not 0 <= top_p <= 1: + raise ComponentError(f"top_p must be between 0 and 1. Got {top_p}.") + + similarity_scores = torch.tensor(self._collect_scores(documents), dtype=torch.float32) + + # Apply softmax normalization to the similarity scores + probs = torch.nn.functional.softmax(similarity_scores, dim=-1) + + # Sort the probabilities and calculate their cumulative sum + sorted_probs, sorted_indices = torch.sort(probs, descending=True) + cumulative_probs = torch.cumsum(sorted_probs, dim=-1) + + # Check if the cumulative probabilities are close to top_p with a 1e-6 tolerance + close_to_top_p = torch.isclose(cumulative_probs, torch.tensor(top_p, device=cumulative_probs.device), atol=1e-6) + + # Combine the close_to_top_p with original condition using logical OR + condition = (cumulative_probs <= top_p) | close_to_top_p + + # Find the indices with cumulative probabilities that exceed top_p + top_p_indices = torch.where(torch.BoolTensor(condition))[0] + + # Map the selected indices back to their original indices + original_indices = sorted_indices[top_p_indices] + selected_docs = [documents[i.item()] for i in original_indices] + + # If low p resulted in no documents being selected, then + # return at least one document + if not selected_docs: + logger.warning( + "Top-p sampling with p=%s resulted in no documents being selected. 
" + "Returning the document with the highest similarity score.", + top_p, + ) + highest_prob_indices = torch.argsort(probs, descending=True) + selected_docs = [documents[int(highest_prob_indices[0].item())]] + + return {"documents": selected_docs} + + def _collect_scores(self, documents: List[Document]) -> List[float]: + """ + Collect the scores from the documents' metadata. + :param documents: List of Documents. + :return: List of scores. + """ + if self.score_field: + missing_scores_docs = [d for d in documents if self.score_field not in d.metadata] + if missing_scores_docs: + missing_scores_docs_ids = [d.id for d in missing_scores_docs if d.id] + raise ComponentError( + f"Score field '{self.score_field}' not found in metadata of documents " + f"with IDs: {missing_scores_docs_ids}." + f"Make sure that all documents have a score field '{self.score_field}' in their metadata." + ) + return [d.metadata[self.score_field] for d in documents] + else: + missing_scores_docs = [d for d in documents if d.score is None] + if missing_scores_docs: + missing_scores_docs_ids = [d.id for d in missing_scores_docs if d.id] + raise ComponentError( + f"Ensure all documents have a valid score value. These docs {missing_scores_docs_ids} don't." + ) + return [d.score for d in documents] # type: ignore ## because Document score is Optional diff --git a/haystack/preview/components/writers/document_writer.py b/haystack/preview/components/writers/document_writer.py index aef0d823b3..5ce8c9d4c2 100644 --- a/haystack/preview/components/writers/document_writer.py +++ b/haystack/preview/components/writers/document_writer.py @@ -44,13 +44,14 @@ def from_dict(cls, data: Dict[str, Any]) -> "DocumentWriter": data["init_parameters"]["policy"] = DuplicatePolicy[data["init_parameters"]["policy"]] return default_from_dict(cls, data) + @component.output_types(documents_written=int) def run(self, documents: List[Document], policy: Optional[DuplicatePolicy] = None): """ Run DocumentWriter on the given input data. :param documents: A list of documents to write to the store. :param policy: The policy to use when encountering duplicate documents. - :return: None + :return: Number of documents written :raises ValueError: If the specified document store is not found. 
""" @@ -58,4 +59,4 @@ def run(self, documents: List[Document], policy: Optional[DuplicatePolicy] = Non policy = self.policy self.document_store.write_documents(documents=documents, policy=policy) - return {} + return {"documents_written": len(documents)} diff --git a/haystack/preview/dataclasses/__init__.py b/haystack/preview/dataclasses/__init__.py index 5a0d8489f8..6873ac0ccb 100644 --- a/haystack/preview/dataclasses/__init__.py +++ b/haystack/preview/dataclasses/__init__.py @@ -1,4 +1,5 @@ from haystack.preview.dataclasses.document import Document from haystack.preview.dataclasses.answer import ExtractedAnswer, GeneratedAnswer, Answer +from haystack.preview.dataclasses.byte_stream import ByteStream -__all__ = ["Document", "ExtractedAnswer", "GeneratedAnswer", "Answer"] +__all__ = ["Document", "ExtractedAnswer", "GeneratedAnswer", "Answer", "ByteStream"] diff --git a/haystack/preview/dataclasses/byte_stream.py b/haystack/preview/dataclasses/byte_stream.py new file mode 100644 index 0000000000..fe006fcf85 --- /dev/null +++ b/haystack/preview/dataclasses/byte_stream.py @@ -0,0 +1,37 @@ +from dataclasses import dataclass, field +from pathlib import Path +from typing import Dict, Any + + +@dataclass(frozen=True) +class ByteStream: + """ + Base data class representing a binary object in the Haystack API. + """ + + data: bytes + metadata: Dict[str, Any] = field(default_factory=dict, hash=False) + + def to_file(self, destination_path: Path): + with open(destination_path, "wb") as fd: + fd.write(self.data) + + @classmethod + def from_file_path(cls, filepath: Path) -> "ByteStream": + """ + Create a ByteStream from the contents read from a file. + + :param filepath: A valid path to a file. + """ + with open(filepath, "rb") as fd: + return cls(data=fd.read()) + + @classmethod + def from_string(cls, text: str, encoding: str = "utf-8") -> "ByteStream": + """ + Create a ByteStream encoding a string. + + :param text: The string to encode + :param encoding: The encoding used to convert the string into bytes + """ + return cls(data=text.encode(encoding)) diff --git a/haystack/preview/dataclasses/document.py b/haystack/preview/dataclasses/document.py index eb0704b867..76c3b78421 100644 --- a/haystack/preview/dataclasses/document.py +++ b/haystack/preview/dataclasses/document.py @@ -1,15 +1,13 @@ -from typing import List, Any, Dict, Optional, Type - -import json import hashlib +import json import logging +from dataclasses import asdict, dataclass, field, fields from pathlib import Path -from dataclasses import dataclass, field, fields, asdict +from typing import Any, Dict, List, Optional, Type import numpy import pandas - logger = logging.getLogger(__name__) @@ -50,7 +48,7 @@ def document_decoder(self, dictionary): return dictionary -@dataclass(frozen=True) +@dataclass class Document: """ Base data class containing some data to be queried. 
diff --git a/haystack/preview/testing/test_utils.py b/haystack/preview/testing/test_utils.py index 333685c8d3..596feb7001 100644 --- a/haystack/preview/testing/test_utils.py +++ b/haystack/preview/testing/test_utils.py @@ -1,7 +1,10 @@ import os import random +import logging import numpy as np -import torch + + +logger = logging.getLogger(__name__) def set_all_seeds(seed: int, deterministic_cudnn: bool = False) -> None: @@ -16,9 +19,16 @@ def set_all_seeds(seed: int, deterministic_cudnn: bool = False) -> None: """ random.seed(seed) np.random.seed(seed) - torch.manual_seed(seed) os.environ["PYTHONHASHSEED"] = str(seed) - torch.cuda.manual_seed_all(seed) - if deterministic_cudnn: - torch.backends.cudnn.deterministic = True - torch.backends.cudnn.benchmark = False + + try: + import torch + + torch.manual_seed(seed) + torch.cuda.manual_seed_all(seed) + if deterministic_cudnn: + torch.backends.cudnn.deterministic = True + torch.backends.cudnn.benchmark = False + + except (ImportError, ModuleNotFoundError) as exc: + logger.info("Could not set PyTorch seed because torch is not installed. Exception: %s", exc) diff --git a/haystack/utils/deepsetcloud.py b/haystack/utils/deepsetcloud.py index 24ab50b3c4..dfe1bc09f5 100644 --- a/haystack/utils/deepsetcloud.py +++ b/haystack/utils/deepsetcloud.py @@ -1,12 +1,11 @@ import json -from mimetypes import guess_type -from pathlib import Path -from typing import Any, Dict, Generator, List, Optional, Tuple, Union, Literal - import logging import os import time from enum import Enum +from mimetypes import guess_type +from pathlib import Path +from typing import Any, Dict, Generator, List, Literal, Optional, Tuple, Union import pandas as pd import requests @@ -560,10 +559,11 @@ def save_pipeline_config( :param workspace: Specifies the name of the workspace on deepset Cloud. :param headers: Headers to pass to the API call. """ - config["name"] = pipeline_config_name workspace_url = self._build_workspace_url(workspace=workspace) pipelines_url = f"{workspace_url}/pipelines" - response = self.client.post(url=pipelines_url, data=yaml.dump(config), headers=headers).json() + response = self.client.post( + url=pipelines_url, json={"name": pipeline_config_name, "config": yaml.dump(config)}, headers=headers + ).json() if "name" not in response or response["name"] != pipeline_config_name: logger.warning("Unexpected response from saving pipeline config: %s", response) @@ -582,7 +582,6 @@ def update_pipeline_config( :param workspace: Specifies the name of the workspace on deepset Cloud. :param headers: Headers to pass to the API call. """ - config["name"] = pipeline_config_name pipeline_url = self._build_pipeline_url(workspace=workspace, pipeline_config_name=pipeline_config_name) yaml_url = f"{pipeline_url}/yaml" response = self.client.put(url=yaml_url, data=yaml.dump(config), headers=headers).json() diff --git a/haystack/utils/getting_started.py b/haystack/utils/getting_started.py index 5a227b7213..cd54e7169d 100644 --- a/haystack/utils/getting_started.py +++ b/haystack/utils/getting_started.py @@ -65,7 +65,7 @@ def add_example_data(document_store, dir): output_dir=dir, ) files_to_index = [dir + "/" + f for f in os.listdir(dir)] - converter = TextConverter(remove_numeric_tables=True, valid_languages=["en"]) + converter = TextConverter(remove_numeric_tables=True) docs = [converter.convert(file_path=file, meta=None)[0] for file in files_to_index] else: # Here you can add a local folder with your files(.txt, .pdf, .docx). 
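The `save_pipeline_config` hunk above stops injecting the pipeline name into the YAML config and instead posts a JSON body that carries the name and the serialized config as separate fields. A minimal sketch of the new payload shape, assuming plain `requests` and placeholder URL, token, and config values rather than the SDK's internal client:

```python
import yaml
import requests

# Placeholder values for illustration only; the SDK builds the workspace URL internally.
pipelines_url = "https://api.cloud.deepset.ai/api/v1/workspaces/default/pipelines"
headers = {"Authorization": "Bearer <deepset-cloud-api-key>"}
config = {"components": [], "pipelines": []}

# New endpoint format: the name and the YAML config travel as separate JSON fields,
# instead of injecting "name" into the config and posting raw YAML.
payload = {"name": "my-pipeline", "config": yaml.dump(config)}
response = requests.post(pipelines_url, json=payload, headers=headers).json()
print(response)
```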
diff --git a/haystack/utils/openai_utils.py b/haystack/utils/openai_utils.py index fdde7306ce..439d045989 100644 --- a/haystack/utils/openai_utils.py +++ b/haystack/utils/openai_utils.py @@ -3,7 +3,9 @@ import logging import platform import json -from typing import Dict, Union, Tuple, Optional, List +from typing import Dict, Union, Tuple, Optional, List, cast + +import httpx import requests import tenacity import tiktoken @@ -143,6 +145,53 @@ def openai_request( return response +@tenacity.retry( + reraise=True, + retry=tenacity.retry_if_exception_type(OpenAIError) + and tenacity.retry_if_not_exception_type(OpenAIUnauthorizedError), + wait=tenacity.wait_exponential(multiplier=OPENAI_BACKOFF), + stop=tenacity.stop_after_attempt(OPENAI_MAX_RETRIES), +) +async def openai_async_request( + url: str, + headers: Dict, + payload: Dict, + timeout: Union[float, Tuple[float, float]] = OPENAI_TIMEOUT, + read_response: bool = True, + **kwargs, +): + """Make a request to the OpenAI API given a `url`, `headers`, `payload`, and `timeout`. + + See `openai_request`. + """ + async with httpx.AsyncClient() as client: + response = await client.request( + "POST", url, headers=headers, json=payload, timeout=cast(float, timeout), **kwargs + ) + + if read_response: + json_response = json.loads(response.text) + + if response.status_code != 200: + openai_error: OpenAIError + if response.status_code == 429: + openai_error = OpenAIRateLimitError(f"API rate limit exceeded: {response.text}") + elif response.status_code == 401: + openai_error = OpenAIUnauthorizedError(f"API key is invalid: {response.text}") + else: + openai_error = OpenAIError( + f"OpenAI returned an error.\n" + f"Status code: {response.status_code}\n" + f"Response body: {response.text}", + status_code=response.status_code, + ) + raise openai_error + if read_response: + return json_response + else: + return response + + def check_openai_policy_violation(input: Union[List[str], str], headers: Dict) -> bool: """ Calls the moderation endpoint to check if the text(s) violate the policy. @@ -163,6 +212,26 @@ def check_openai_policy_violation(input: Union[List[str], str], headers: Dict) - return flagged +async def check_openai_async_policy_violation(input: Union[List[str], str], headers: Dict) -> bool: + """ + Calls the moderation endpoint to check if the text(s) violate the policy. + See [OpenAI Moderation API](https://platform.openai.com/docs/guides/moderation) for more details. + Returns true if any of the input is flagged as any of ['sexual', 'hate', 'violence', 'self-harm', 'sexual/minors', 'hate/threatening', 'violence/graphic']. + """ + response = await openai_async_request(url=OPENAI_MODERATION_URL, headers=headers, payload={"input": input}) + results = response["results"] + flagged = any(res["flagged"] for res in results) + if flagged: + for result in results: + if result["flagged"]: + logger.debug( + "OpenAI Moderation API flagged the text '%s' as a potential policy violation of the following categories: %s", + input, + result["categories"], + ) + return flagged + + def _check_openai_finish_reason(result: Dict, payload: Dict) -> None: """Check the `finish_reason` the answers returned by OpenAI completions endpoint. If the `finish_reason` is `length` or `content_filter`, log a warning to the user. 
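Because `openai_async_request` mirrors the existing `openai_request` helper but awaits an `httpx` client, it can be called directly from async application code. A hedged usage sketch; the completions URL, model name, and prompt are illustrative assumptions, and `OPENAI_API_KEY` is read from the environment:

```python
import asyncio
import os

from haystack.utils.openai_utils import openai_async_request


async def main():
    headers = {
        "Authorization": f"Bearer {os.environ['OPENAI_API_KEY']}",
        "Content-Type": "application/json",
    }
    payload = {"model": "gpt-3.5-turbo-instruct", "prompt": "Say hello", "max_tokens": 5}
    # Retries with exponential backoff come from the tenacity decorator shown above.
    response = await openai_async_request(
        url="https://api.openai.com/v1/completions", headers=headers, payload=payload
    )
    print(response["choices"][0]["text"])


asyncio.run(main())
```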
diff --git a/proposals/text/0000-evaluation-haystack-2.md b/proposals/text/5794-evaluation-haystack-2.md similarity index 99% rename from proposals/text/0000-evaluation-haystack-2.md rename to proposals/text/5794-evaluation-haystack-2.md index 7f7507e78e..9093a19e0a 100644 --- a/proposals/text/0000-evaluation-haystack-2.md +++ b/proposals/text/5794-evaluation-haystack-2.md @@ -1,7 +1,7 @@ - Title: Evaluation in Haystack 2.0 - Decision driver: (Silvano Cerza, Julian Risch) - Start Date: 2023-08-23 -- Proposal PR: #5794 +- Proposal PR: [#5794](https://github.com/deepset-ai/haystack/pull/5794/) - Github Issue or Discussion: https://github.com/deepset-ai/haystack/issues/5628 # Summary diff --git a/proposals/text/0000-evaluation-haystack-2.py b/proposals/text/5794-evaluation-haystack-2.py similarity index 99% rename from proposals/text/0000-evaluation-haystack-2.py rename to proposals/text/5794-evaluation-haystack-2.py index 46dd8af4b3..faaaec377d 100644 --- a/proposals/text/0000-evaluation-haystack-2.py +++ b/proposals/text/5794-evaluation-haystack-2.py @@ -135,5 +135,5 @@ ] eval_result = eval(pipe, inputs=inputs, expected_output=expected_output) -metrics = result.calculate_metrics(Metric.SAS) +metrics = result.calculate_metrics(Metric.SAS) # noqa metrics.save("path/to/file.csv") diff --git a/pyproject.toml b/pyproject.toml index 601e2f385c..8ecde0c81b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -47,6 +47,7 @@ classifiers = [ ] dependencies = [ "requests", + "httpx", "pydantic<2", "transformers==4.32.1", "pandas", @@ -86,7 +87,7 @@ dependencies = [ [project.optional-dependencies] preview = [ - "canals==0.8.0", + "canals==0.8.1", "requests", "pandas", "rank_bm25", @@ -197,9 +198,6 @@ onnx-gpu = [ metrics = [ # for metrics "scipy>=1.3.2", "rapidfuzz>=2.0.15,<2.8.0", # FIXME https://github.com/deepset-ai/haystack/pull/3199 - # Pin setuptools_scm to prevent errors in Windows tests while installing seqeval - # https://github.com/deepset-ai/haystack/issues/5889 - "setuptools_scm<8.0; platform_system == 'Windows'", "seqeval", "mlflow", ] @@ -288,7 +286,7 @@ line-length = 120 skip_magic_trailing_comma = true # For compatibility with pydoc>=4.6, check if still needed. [tool.codespell] -ignore-words-list = "ans,astroid,nd,ned,nin,ue" +ignore-words-list = "ans,astroid,nd,ned,nin,ue,rouge" quiet-level = 3 skip = "test/nodes/*,test/others/*,test/samples/*" diff --git a/releasenotes/notes/add-blob-type-2a9476a39841f54d.yaml b/releasenotes/notes/add-blob-type-2a9476a39841f54d.yaml new file mode 100644 index 0000000000..163d9631ef --- /dev/null +++ b/releasenotes/notes/add-blob-type-2a9476a39841f54d.yaml @@ -0,0 +1,5 @@ +--- +preview: + - | + Add ByteStream type to send binary raw data across components + in a pipeline. diff --git a/releasenotes/notes/add-document-writer-number-of-documents-written-2c57f3a5d6ae2131.yaml b/releasenotes/notes/add-document-writer-number-of-documents-written-2c57f3a5d6ae2131.yaml new file mode 100644 index 0000000000..f742601594 --- /dev/null +++ b/releasenotes/notes/add-document-writer-number-of-documents-written-2c57f3a5d6ae2131.yaml @@ -0,0 +1,4 @@ +--- +enhancements: + - | + Document writer returns the number of documents written. 
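To illustrate the `DocumentWriter` change noted above (`run()` now returns the number of documents written instead of an empty dict), a minimal sketch; `FakeStore` is a hypothetical stand-in for any object exposing `write_documents()` and is not a Haystack class, assuming the writer only needs that method at run time:

```python
from typing import List, Optional

from haystack.preview import Document
from haystack.preview.components.writers.document_writer import DocumentWriter


class FakeStore:
    """Hypothetical in-memory stand-in for a document store."""

    def __init__(self):
        self.docs: List[Document] = []

    def write_documents(self, documents: List[Document], policy: Optional[object] = None) -> None:
        self.docs.extend(documents)


writer = DocumentWriter(document_store=FakeStore())
result = writer.run(documents=[Document(text="Hello"), Document(text="World")])
assert result == {"documents_written": 2}  # the writer now reports how many documents it wrote
```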
diff --git a/releasenotes/notes/add-html-to-document-21fe38b244388f4d.yaml b/releasenotes/notes/add-html-to-document-21fe38b244388f4d.yaml new file mode 100644 index 0000000000..1ff9b25b6b --- /dev/null +++ b/releasenotes/notes/add-html-to-document-21fe38b244388f4d.yaml @@ -0,0 +1,4 @@ +--- +preview: + - | + Adds HTMLToDocument component to convert HTML to a Document. diff --git a/releasenotes/notes/add-openai-async-1f65701142f77181.yaml b/releasenotes/notes/add-openai-async-1f65701142f77181.yaml new file mode 100644 index 0000000000..7d97d8064e --- /dev/null +++ b/releasenotes/notes/add-openai-async-1f65701142f77181.yaml @@ -0,0 +1,4 @@ +--- +enhancements: + - | + Add asyncio support to the OpenAI invocation layer. diff --git a/releasenotes/notes/add-promptnode-arun-bc4c2bcc9c653015.yaml b/releasenotes/notes/add-promptnode-arun-bc4c2bcc9c653015.yaml new file mode 100644 index 0000000000..e9e3305eab --- /dev/null +++ b/releasenotes/notes/add-promptnode-arun-bc4c2bcc9c653015.yaml @@ -0,0 +1,4 @@ +--- +enhancements: + - | + PromptNode can now be run asynchronously by calling the `arun` method. diff --git a/releasenotes/notes/add-similarity-ranker-401bf595cea7318a.yaml b/releasenotes/notes/add-similarity-ranker-401bf595cea7318a.yaml new file mode 100644 index 0000000000..a0d3217a67 --- /dev/null +++ b/releasenotes/notes/add-similarity-ranker-401bf595cea7318a.yaml @@ -0,0 +1,4 @@ +--- +preview: + - | + Adds SimilarityRanker, a component that ranks a list of Documents based on their similarity to the query. diff --git a/releasenotes/notes/add-top-p-sampler-ad6e0f5d623a6bb5.yaml b/releasenotes/notes/add-top-p-sampler-ad6e0f5d623a6bb5.yaml new file mode 100644 index 0000000000..729c8b6245 --- /dev/null +++ b/releasenotes/notes/add-top-p-sampler-ad6e0f5d623a6bb5.yaml @@ -0,0 +1,4 @@ +--- +preview: + - | + Adds TopPSampler, a component that selects documents based on the cumulative probability of the Document scores using top-p (nucleus) sampling. diff --git a/releasenotes/notes/deprecate-openai-answergenerator-537266612ba1ffff.yaml b/releasenotes/notes/deprecate-openai-answergenerator-537266612ba1ffff.yaml new file mode 100644 index 0000000000..3e42dfacab --- /dev/null +++ b/releasenotes/notes/deprecate-openai-answergenerator-537266612ba1ffff.yaml @@ -0,0 +1,5 @@ +--- +deprecations: + - | + Deprecate `OpenAIAnswerGenerator` in favor of `PromptNode`. + `OpenAIAnswerGenerator` will be removed in Haystack 1.23. diff --git a/releasenotes/notes/fix-chatgpt-invocation-layer-bc25d0ea5f77f05c.yaml b/releasenotes/notes/fix-chatgpt-invocation-layer-bc25d0ea5f77f05c.yaml new file mode 100644 index 0000000000..2be3084dd7 --- /dev/null +++ b/releasenotes/notes/fix-chatgpt-invocation-layer-bc25d0ea5f77f05c.yaml @@ -0,0 +1,6 @@ +--- +fixes: + - | + Fixed the bug that prevented the correct usage of the ChatGPT invocation layer + in 1.21.1. + Added async support for the ChatGPT invocation layer. diff --git a/releasenotes/notes/openai-document-embedder-d2f59ba1f21babcb.yaml b/releasenotes/notes/openai-document-embedder-d2f59ba1f21babcb.yaml new file mode 100644 index 0000000000..eaf44199c6 --- /dev/null +++ b/releasenotes/notes/openai-document-embedder-d2f59ba1f21babcb.yaml @@ -0,0 +1,6 @@ +--- +preview: + - | + Add OpenAI Document Embedder. + It computes embeddings of Documents using OpenAI models. + The embedding of each Document is stored in the `embedding` field of the Document.
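A hedged usage sketch for the new `OpenAIDocumentEmbedder` described in the release note above; the document texts and the `topic` metadata field are illustrative, and `OPENAI_API_KEY` is read from the environment:

```python
from haystack.preview import Document
from haystack.preview.components.embedders import OpenAIDocumentEmbedder

embedder = OpenAIDocumentEmbedder(model_name="text-embedding-ada-002", metadata_fields_to_embed=["topic"])

docs = [
    Document(text="Berlin is the capital of Germany", metadata={"topic": "geography"}),
    Document(text="A transformer is a deep learning architecture", metadata={"topic": "ML"}),
]

result = embedder.run(documents=docs)
# Each returned Document carries its embedding; model name and token usage come back as metadata.
print(len(result["documents"][0].embedding))  # 1536 for text-embedding-ada-002
print(result["metadata"])                      # e.g. {"model": "...", "usage": {...}}
```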
diff --git a/releasenotes/notes/update-deepset-cloud-sdk-save-pipeline-config-ff820838846f5f38.yaml b/releasenotes/notes/update-deepset-cloud-sdk-save-pipeline-config-ff820838846f5f38.yaml new file mode 100644 index 0000000000..a5ca3ac5e4 --- /dev/null +++ b/releasenotes/notes/update-deepset-cloud-sdk-save-pipeline-config-ff820838846f5f38.yaml @@ -0,0 +1,4 @@ +--- +enhancements: + - | + Update the deepset Cloud SDK to the new endpoint format for new saving pipeline configs. diff --git a/test/nodes/test_generator.py b/test/nodes/test_generator.py index 1fea0fc4d2..f69663aeba 100644 --- a/test/nodes/test_generator.py +++ b/test/nodes/test_generator.py @@ -6,9 +6,19 @@ from haystack.nodes.answer_generator import OpenAIAnswerGenerator from haystack.nodes import PromptTemplate +from ..conftest import fail_at_version + import logging +@pytest.mark.unit +@fail_at_version(1, 23) +@patch("haystack.nodes.answer_generator.openai.load_openai_tokenizer") +def test_openaianswergenerator_deprecation(mock_load_tokenizer): + with pytest.warns(DeprecationWarning): + OpenAIAnswerGenerator(api_key="fake_api_key") + + @pytest.mark.unit @patch("haystack.nodes.answer_generator.openai.openai_request") def test_no_openai_organization(mock_request): diff --git a/test/preview/components/builders/test_answer_builder.py b/test/preview/components/builders/test_answer_builder.py index 03b4a42f9a..e93ae2f030 100644 --- a/test/preview/components/builders/test_answer_builder.py +++ b/test/preview/components/builders/test_answer_builder.py @@ -36,7 +36,29 @@ def test_from_dict(self): def test_run_unmatching_input_len(self): component = AnswerBuilder() with pytest.raises(ValueError): - component.run(query="query", replies=["reply1", "reply2"], metadata=[]) + component.run(query="query", replies=["reply1"], metadata=[{"test": "meta"}, {"test": "meta2"}]) + + @pytest.mark.unit + def test_run_without_meta(self): + component = AnswerBuilder() + output = component.run(query="query", replies=["reply1"]) + answers = output["answers"] + assert answers[0].data == "reply1" + assert answers[0].metadata == {} + assert answers[0].query == "query" + assert answers[0].documents == [] + assert isinstance(answers[0], GeneratedAnswer) + + @pytest.mark.unit + def test_run_meta_is_an_empty_list(self): + component = AnswerBuilder() + output = component.run(query="query", replies=["reply1"], metadata=[]) + answers = output["answers"] + assert answers[0].data == "reply1" + assert answers[0].metadata == {} + assert answers[0].query == "query" + assert answers[0].documents == [] + assert isinstance(answers[0], GeneratedAnswer) def test_run_without_pattern(self): component = AnswerBuilder() diff --git a/test/preview/components/embedders/test_openai_document_embedder.py b/test/preview/components/embedders/test_openai_document_embedder.py new file mode 100644 index 0000000000..b6f3ea9b37 --- /dev/null +++ b/test/preview/components/embedders/test_openai_document_embedder.py @@ -0,0 +1,334 @@ +from unittest.mock import patch +import pytest +from typing import List +import numpy as np +import openai +from openai.util import convert_to_openai_object + +from haystack.preview import Document +from haystack.preview.components.embedders.openai_document_embedder import OpenAIDocumentEmbedder + + +def mock_openai_response( + input: List[str], model: str = "text-embedding-ada-002", **kwargs +) -> openai.openai_object.OpenAIObject: + dict_response = { + "object": "list", + "data": [ + {"object": "embedding", "index": i, "embedding": np.random.rand(1536).tolist()} 
for i in range(len(input)) + ], + "model": model, + "usage": {"prompt_tokens": 4, "total_tokens": 4}, + } + + return convert_to_openai_object(dict_response) + + +class TestOpenAIDocumentEmbedder: + @pytest.mark.unit + def test_init_default(self, monkeypatch): + monkeypatch.setenv("OPENAI_API_KEY", "fake-api-key") + embedder = OpenAIDocumentEmbedder() + + assert openai.api_key == "fake-api-key" + + assert embedder.model_name == "text-embedding-ada-002" + assert embedder.organization is None + assert embedder.prefix == "" + assert embedder.suffix == "" + assert embedder.batch_size == 32 + assert embedder.progress_bar is True + assert embedder.metadata_fields_to_embed == [] + assert embedder.embedding_separator == "\n" + + @pytest.mark.unit + def test_init_with_parameters(self): + embedder = OpenAIDocumentEmbedder( + api_key="fake-api-key", + model_name="model", + organization="my-org", + prefix="prefix", + suffix="suffix", + batch_size=64, + progress_bar=False, + metadata_fields_to_embed=["test_field"], + embedding_separator=" | ", + ) + assert openai.api_key == "fake-api-key" + assert openai.organization == "my-org" + + assert embedder.organization == "my-org" + assert embedder.model_name == "model" + assert embedder.prefix == "prefix" + assert embedder.suffix == "suffix" + assert embedder.batch_size == 64 + assert embedder.progress_bar is False + assert embedder.metadata_fields_to_embed == ["test_field"] + assert embedder.embedding_separator == " | " + + @pytest.mark.unit + def test_init_fail_wo_api_key(self, monkeypatch): + monkeypatch.delenv("OPENAI_API_KEY", raising=False) + with pytest.raises(ValueError, match="OpenAIDocumentEmbedder expects an OpenAI API key"): + OpenAIDocumentEmbedder() + + @pytest.mark.unit + def test_to_dict(self): + component = OpenAIDocumentEmbedder(api_key="fake-api-key") + data = component.to_dict() + assert data == { + "type": "OpenAIDocumentEmbedder", + "init_parameters": { + "model_name": "text-embedding-ada-002", + "organization": None, + "prefix": "", + "suffix": "", + "batch_size": 32, + "progress_bar": True, + "metadata_fields_to_embed": [], + "embedding_separator": "\n", + }, + } + + @pytest.mark.unit + def test_to_dict_with_custom_init_parameters(self): + component = OpenAIDocumentEmbedder( + api_key="fake-api-key", + model_name="model", + organization="my-org", + prefix="prefix", + suffix="suffix", + batch_size=64, + progress_bar=False, + metadata_fields_to_embed=["test_field"], + embedding_separator=" | ", + ) + data = component.to_dict() + assert data == { + "type": "OpenAIDocumentEmbedder", + "init_parameters": { + "model_name": "model", + "organization": "my-org", + "prefix": "prefix", + "suffix": "suffix", + "batch_size": 64, + "progress_bar": False, + "metadata_fields_to_embed": ["test_field"], + "embedding_separator": " | ", + }, + } + + @pytest.mark.unit + def test_from_dict(self, monkeypatch): + monkeypatch.setenv("OPENAI_API_KEY", "fake-api-key") + data = { + "type": "OpenAIDocumentEmbedder", + "init_parameters": { + "model_name": "model", + "organization": "my-org", + "prefix": "prefix", + "suffix": "suffix", + "batch_size": 64, + "progress_bar": False, + "metadata_fields_to_embed": ["test_field"], + "embedding_separator": " | ", + }, + } + component = OpenAIDocumentEmbedder.from_dict(data) + assert openai.api_key == "fake-api-key" + assert component.model_name == "model" + assert component.organization == "my-org" + assert openai.organization == "my-org" + assert component.prefix == "prefix" + assert component.suffix == "suffix" + assert 
component.batch_size == 64 + assert component.progress_bar is False + assert component.metadata_fields_to_embed == ["test_field"] + assert component.embedding_separator == " | " + + @pytest.mark.unit + def test_from_dict_fail_wo_env_var(self, monkeypatch): + monkeypatch.delenv("OPENAI_API_KEY", raising=False) + data = { + "type": "OpenAIDocumentEmbedder", + "init_parameters": { + "model_name": "model", + "organization": "my-org", + "prefix": "prefix", + "suffix": "suffix", + "batch_size": 64, + "progress_bar": False, + "metadata_fields_to_embed": ["test_field"], + "embedding_separator": " | ", + }, + } + with pytest.raises(ValueError, match="OpenAIDocumentEmbedder expects an OpenAI API key"): + OpenAIDocumentEmbedder.from_dict(data) + + @pytest.mark.unit + def test_prepare_texts_to_embed_w_metadata(self): + documents = [ + Document(text=f"document number {i}:\ncontent", metadata={"meta_field": f"meta_value {i}"}) + for i in range(5) + ] + + embedder = OpenAIDocumentEmbedder( + api_key="fake-api-key", metadata_fields_to_embed=["meta_field"], embedding_separator=" | " + ) + + prepared_texts = embedder._prepare_texts_to_embed(documents) + + # note that newline is replaced by space + assert prepared_texts == [ + "meta_value 0 | document number 0: content", + "meta_value 1 | document number 1: content", + "meta_value 2 | document number 2: content", + "meta_value 3 | document number 3: content", + "meta_value 4 | document number 4: content", + ] + + @pytest.mark.unit + def test_prepare_texts_to_embed_w_suffix(self): + documents = [Document(text=f"document number {i}") for i in range(5)] + + embedder = OpenAIDocumentEmbedder(api_key="fake-api-key", prefix="my_prefix ", suffix=" my_suffix") + + prepared_texts = embedder._prepare_texts_to_embed(documents) + + assert prepared_texts == [ + "my_prefix document number 0 my_suffix", + "my_prefix document number 1 my_suffix", + "my_prefix document number 2 my_suffix", + "my_prefix document number 3 my_suffix", + "my_prefix document number 4 my_suffix", + ] + + @pytest.mark.unit + def test_embed_batch(self): + texts = ["text 1", "text 2", "text 3", "text 4", "text 5"] + + with patch( + "haystack.preview.components.embedders.openai_document_embedder.openai.Embedding" + ) as openai_embedding_patch: + openai_embedding_patch.create.side_effect = mock_openai_response + embedder = OpenAIDocumentEmbedder(api_key="fake-api-key", model_name="model") + + embeddings, metadata = embedder._embed_batch(texts_to_embed=texts, batch_size=2) + + assert openai_embedding_patch.create.call_count == 3 + + assert isinstance(embeddings, list) + assert len(embeddings) == len(texts) + for embedding in embeddings: + assert isinstance(embedding, list) + assert len(embedding) == 1536 + assert all(isinstance(x, float) for x in embedding) + + # openai.Embedding.create is called 3 times + assert metadata == {"model": "model", "usage": {"prompt_tokens": 3 * 4, "total_tokens": 3 * 4}} + + @pytest.mark.unit + def test_run(self): + docs = [ + Document(text="I love cheese", metadata={"topic": "Cuisine"}), + Document(text="A transformer is a deep learning architecture", metadata={"topic": "ML"}), + ] + + model = "text-similarity-ada-001" + with patch( + "haystack.preview.components.embedders.openai_document_embedder.openai.Embedding" + ) as openai_embedding_patch: + openai_embedding_patch.create.side_effect = mock_openai_response + embedder = OpenAIDocumentEmbedder( + api_key="fake-api-key", + model_name=model, + prefix="prefix ", + suffix=" suffix", + metadata_fields_to_embed=["topic"], + 
embedding_separator=" | ", + ) + + result = embedder.run(documents=docs) + + openai_embedding_patch.create.assert_called_once_with( + model=model, + input=[ + "prefix Cuisine | I love cheese suffix", + "prefix ML | A transformer is a deep learning architecture suffix", + ], + ) + documents_with_embeddings = result["documents"] + metadata = result["metadata"] + + assert isinstance(documents_with_embeddings, list) + assert len(documents_with_embeddings) == len(docs) + for doc in documents_with_embeddings: + assert isinstance(doc, Document) + assert isinstance(doc.embedding, list) + assert len(doc.embedding) == 1536 + assert all(isinstance(x, float) for x in doc.embedding) + assert metadata == {"model": model, "usage": {"prompt_tokens": 4, "total_tokens": 4}} + + @pytest.mark.unit + def test_run_custom_batch_size(self): + docs = [ + Document(text="I love cheese", metadata={"topic": "Cuisine"}), + Document(text="A transformer is a deep learning architecture", metadata={"topic": "ML"}), + ] + + model = "text-similarity-ada-001" + with patch( + "haystack.preview.components.embedders.openai_document_embedder.openai.Embedding" + ) as openai_embedding_patch: + openai_embedding_patch.create.side_effect = mock_openai_response + embedder = OpenAIDocumentEmbedder( + api_key="fake-api-key", + model_name=model, + prefix="prefix ", + suffix=" suffix", + metadata_fields_to_embed=["topic"], + embedding_separator=" | ", + batch_size=1, + ) + + result = embedder.run(documents=docs) + + assert openai_embedding_patch.create.call_count == 2 + + documents_with_embeddings = result["documents"] + metadata = result["metadata"] + + assert isinstance(documents_with_embeddings, list) + assert len(documents_with_embeddings) == len(docs) + for doc in documents_with_embeddings: + assert isinstance(doc, Document) + assert isinstance(doc.embedding, list) + assert len(doc.embedding) == 1536 + assert all(isinstance(x, float) for x in doc.embedding) + + # openai.Embedding.create is called 2 times + assert metadata == {"model": model, "usage": {"prompt_tokens": 2 * 4, "total_tokens": 2 * 4}} + + @pytest.mark.unit + def test_run_wrong_input_format(self): + embedder = OpenAIDocumentEmbedder(api_key="fake-api-key") + + # wrong formats + string_input = "text" + list_integers_input = [1, 2, 3] + + with pytest.raises(TypeError, match="OpenAIDocumentEmbedder expects a list of Documents as input"): + embedder.run(documents=string_input) + + with pytest.raises(TypeError, match="OpenAIDocumentEmbedder expects a list of Documents as input"): + embedder.run(documents=list_integers_input) + + @pytest.mark.unit + def test_run_on_empty_list(self): + embedder = OpenAIDocumentEmbedder(api_key="fake-api-key") + + empty_list_input = [] + result = embedder.run(documents=empty_list_input) + + assert result["documents"] is not None + assert not result["documents"] # empty list diff --git a/test/preview/components/file_converters/test_html_to_document.py b/test/preview/components/file_converters/test_html_to_document.py new file mode 100644 index 0000000000..b46f841282 --- /dev/null +++ b/test/preview/components/file_converters/test_html_to_document.py @@ -0,0 +1,63 @@ +import logging + +import pytest + +from haystack.preview.components.file_converters import HTMLToDocument + + +class TestHTMLToDocument: + @pytest.mark.unit + def test_to_dict(self): + component = HTMLToDocument() + data = component.to_dict() + assert data == {"type": "HTMLToDocument", "init_parameters": {"id_hash_keys": []}} + + @pytest.mark.unit + def 
test_to_dict_with_custom_init_parameters(self): + component = HTMLToDocument(id_hash_keys=["name"]) + data = component.to_dict() + assert data == {"type": "HTMLToDocument", "init_parameters": {"id_hash_keys": ["name"]}} + + @pytest.mark.unit + def test_from_dict(self): + data = {"type": "HTMLToDocument", "init_parameters": {"id_hash_keys": ["name"]}} + component = HTMLToDocument.from_dict(data) + assert component.id_hash_keys == ["name"] + + @pytest.mark.unit + def test_run(self, preview_samples_path): + """ + Test if the component runs correctly. + """ + paths = [preview_samples_path / "html" / "what_is_haystack.html"] + converter = HTMLToDocument() + output = converter.run(paths=paths) + docs = output["documents"] + assert len(docs) == 1 + assert "Haystack" in docs[0].text + + @pytest.mark.unit + def test_run_wrong_file_type(self, preview_samples_path, caplog): + """ + Test if the component runs correctly when an input file is not of the expected type. + """ + paths = [preview_samples_path / "audio" / "answer.wav"] + converter = HTMLToDocument() + with caplog.at_level(logging.WARNING): + output = converter.run(paths=paths) + assert "codec can't decode byte" in caplog.text + + docs = output["documents"] + assert docs == [] + + @pytest.mark.unit + def test_run_error_handling(self, preview_samples_path, caplog): + """ + Test if the component correctly handles errors. + """ + paths = ["non_existing_file.html"] + converter = HTMLToDocument() + with caplog.at_level(logging.WARNING): + result = converter.run(paths=paths) + assert "Could not read file non_existing_file.html" in caplog.text + assert result["documents"] == [] diff --git a/test/preview/components/rankers/test_similarity.py b/test/preview/components/rankers/test_similarity.py new file mode 100644 index 0000000000..5ddb3b18dc --- /dev/null +++ b/test/preview/components/rankers/test_similarity.py @@ -0,0 +1,74 @@ +import pytest + +from haystack.preview import Document, ComponentError +from haystack.preview.components.rankers.similarity import SimilarityRanker + + +class TestSimilarityRanker: + @pytest.mark.unit + def test_to_dict(self): + component = SimilarityRanker(model_name_or_path="cross-encoder/ms-marco-MiniLM-L-6-v2") + data = component.to_dict() + assert data == { + "type": "SimilarityRanker", + "init_parameters": {"device": "cpu", "model_name_or_path": "cross-encoder/ms-marco-MiniLM-L-6-v2"}, + } + + @pytest.mark.unit + def test_to_dict_with_custom_init_parameters(self): + component = SimilarityRanker() + data = component.to_dict() + assert data == { + "type": "SimilarityRanker", + "init_parameters": {"device": "cpu", "model_name_or_path": "cross-encoder/ms-marco-MiniLM-L-6-v2"}, + } + + @pytest.mark.integration + def test_from_dict(self): + data = { + "type": "SimilarityRanker", + "init_parameters": {"device": "cpu", "model_name_or_path": "cross-encoder/ms-marco-MiniLM-L-6-v2"}, + } + component = SimilarityRanker.from_dict(data) + assert component.model_name_or_path == "cross-encoder/ms-marco-MiniLM-L-6-v2" + + @pytest.mark.integration + @pytest.mark.parametrize( + "query,docs_before_texts,expected_first_text", + [ + ("City in Bosnia and Herzegovina", ["Berlin", "Belgrade", "Sarajevo"], "Sarajevo"), + ("Machine learning", ["Python", "Bakery in Paris", "Tesla Giga Berlin"], "Python"), + ("Cubist movement", ["Nirvana", "Pablo Picasso", "Coffee"], "Pablo Picasso"), + ], + ) + def test_run(self, query, docs_before_texts, expected_first_text): + """ + Test if the component ranks documents correctly. 
+ """ + ranker = SimilarityRanker(model_name_or_path="cross-encoder/ms-marco-MiniLM-L-6-v2") + ranker.warm_up() + docs_before = [Document(text=text) for text in docs_before_texts] + output = ranker.run(query=query, documents=docs_before) + docs_after = output["documents"] + + assert len(docs_after) == 3 + assert docs_after[0].text == expected_first_text + + sorted_scores = sorted([doc.score for doc in docs_after], reverse=True) + assert [doc.score for doc in docs_after] == sorted_scores + + # Returns an empty list if no documents are provided + @pytest.mark.integration + def test_returns_empty_list_if_no_documents_are_provided(self): + sampler = SimilarityRanker() + sampler.warm_up() + output = sampler.run(query="City in Germany", documents=[]) + assert output["documents"] == [] + + # Raises ComponentError if model is not warmed up + @pytest.mark.integration + def test_raises_component_error_if_model_not_warmed_up(self): + sampler = SimilarityRanker() + + with pytest.raises(ComponentError): + sampler.run(query="query", documents=[Document(text="document")]) diff --git a/test/preview/components/samplers/__init__.py b/test/preview/components/samplers/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/test/preview/components/samplers/test_top_p.py b/test/preview/components/samplers/test_top_p.py new file mode 100644 index 0000000000..111dffa25c --- /dev/null +++ b/test/preview/components/samplers/test_top_p.py @@ -0,0 +1,107 @@ +import random + +import pytest + +from haystack.preview import Document, ComponentError +from haystack.preview.components.samplers.top_p import TopPSampler + + +class TestTopPSampler: + @pytest.mark.unit + def test_to_dict(self): + component = TopPSampler() + data = component.to_dict() + assert data == {"type": "TopPSampler", "init_parameters": {"top_p": 1.0, "score_field": None}} + + @pytest.mark.unit + def test_to_dict_with_custom_init_parameters(self): + component = TopPSampler(top_p=0.92) + data = component.to_dict() + assert data == {"type": "TopPSampler", "init_parameters": {"top_p": 0.92, "score_field": None}} + + @pytest.mark.unit + def test_from_dict(self): + data = {"type": "TopPSampler", "init_parameters": {"top_p": 0.9, "score_field": None}} + component = TopPSampler.from_dict(data) + assert component.top_p == 0.9 + + @pytest.mark.unit + def test_run_scores_from_metadata(self): + """ + Test if the component runs correctly with scores already in the metadata. + """ + sampler = TopPSampler(top_p=0.95, score_field="similarity_score") + docs = [ + Document(text="Berlin", metadata={"similarity_score": -10.6}), + Document(text="Belgrade", metadata={"similarity_score": -8.9}), + Document(text="Sarajevo", metadata={"similarity_score": -4.6}), + ] + output = sampler.run(documents=docs) + docs = output["documents"] + assert len(docs) == 1 + assert docs[0].text == "Sarajevo" + + @pytest.mark.unit + def test_run_scores(self): + """ + Test if the component runs correctly with scores in the Document score field. 
+ """ + sampler = TopPSampler(top_p=0.99) + docs = [ + Document(text="Berlin", score=-10.6), + Document(text="Belgrade", score=-8.9), + Document(text="Sarajevo", score=-4.6), + ] + + random.shuffle(docs) + sorted_scores = sorted([doc.score for doc in docs], reverse=True) + + # top_p = 0.99 will get the top 1 document + output = sampler.run(documents=docs) + docs_filtered = output["documents"] + assert len(docs_filtered) == 1 + assert docs_filtered[0].text == "Sarajevo" + + assert [doc.score for doc in docs_filtered] == sorted_scores[:1] + + @pytest.mark.unit + def test_run_scores_top_p_1(self): + """ + Test if the component runs correctly top_p=1. + """ + sampler = TopPSampler(top_p=1.0) + docs = [ + Document(text="Berlin", score=-10.6), + Document(text="Belgrade", score=-8.9), + Document(text="Sarajevo", score=-4.6), + ] + + random.shuffle(docs) + output = sampler.run(documents=docs) + docs_filtered = output["documents"] + assert len(docs_filtered) == len(docs) + assert docs_filtered[0].text == "Sarajevo" + + assert [doc.score for doc in docs_filtered] == sorted([doc.score for doc in docs], reverse=True) + + # Returns an empty list if no documents are provided + @pytest.mark.unit + def test_returns_empty_list_if_no_documents_are_provided(self): + sampler = TopPSampler() + output = sampler.run(documents=[]) + assert output["documents"] == [] + + @pytest.mark.unit + def test_run_scores_no_metadata_present(self): + """ + Test if the component runs correctly with scores missing from the metadata yet being specified in the + score_field. + """ + sampler = TopPSampler(top_p=0.95, score_field="similarity_score") + docs = [ + Document(text="Berlin", score=-10.6), + Document(text="Belgrade", score=-8.9), + Document(text="Sarajevo", score=-4.6), + ] + with pytest.raises(ComponentError, match="Score field 'similarity_score' not found"): + sampler.run(documents=docs) diff --git a/test/preview/dataclasses/test_byte_stream.py b/test/preview/dataclasses/test_byte_stream.py new file mode 100644 index 0000000000..05d40eb79e --- /dev/null +++ b/test/preview/dataclasses/test_byte_stream.py @@ -0,0 +1,33 @@ +import io + +from haystack.preview.dataclasses import ByteStream + +import pytest + + +@pytest.mark.unit +def test_from_file_path(tmp_path, request): + test_bytes = "Hello, world!\n".encode() + test_path = tmp_path / request.node.name + with open(test_path, "wb") as fd: + assert fd.write(test_bytes) + + b = ByteStream.from_file_path(test_path) + assert b.data == test_bytes + + +@pytest.mark.unit +def test_from_string(): + test_string = "Hello, world!" 
+ b = ByteStream.from_string(test_string) + assert b.data.decode() == test_string + + +@pytest.mark.unit +def test_to_file(tmp_path, request): + test_str = "Hello, world!\n" + test_path = tmp_path / request.node.name + + ByteStream(test_str.encode()).to_file(test_path) + with open(test_path, "rb") as fd: + assert fd.read().decode() == test_str diff --git a/test/preview/dataclasses/test_document.py b/test/preview/dataclasses/test_document.py index f53b146fea..1f2610c849 100644 --- a/test/preview/dataclasses/test_document.py +++ b/test/preview/dataclasses/test_document.py @@ -1,21 +1,14 @@ -from pathlib import Path import dataclasses -import textwrap import json +import textwrap +from pathlib import Path -import pytest -import pandas as pd import numpy as np +import pandas as pd +import pytest from haystack.preview import Document -from haystack.preview.dataclasses.document import DocumentEncoder, DocumentDecoder - - -@pytest.mark.unit -def test_document_is_immutable(): - doc = Document(text="test text") - with pytest.raises(dataclasses.FrozenInstanceError): - doc.text = "won't work" +from haystack.preview.dataclasses.document import DocumentDecoder, DocumentEncoder @pytest.mark.unit diff --git a/test/preview/test_files/html/what_is_haystack.html b/test/preview/test_files/html/what_is_haystack.html new file mode 100644 index 0000000000..2d62b206c0 --- /dev/null +++ b/test/preview/test_files/html/what_is_haystack.html @@ -0,0 +1,1634 @@ + + + + + + + + + + What is Haystack? | Haystack + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
What is Haystack?


Haystack is the open source Python framework by deepset for building custom apps with large language models (LLMs). It lets you quickly try out the latest models in natural language processing (NLP) while being flexible and easy to use. Our inspiring community of users and builders has helped shape Haystack into what it is today: a complete framework for building production-ready NLP apps.


Building with Haystack


Haystack offers comprehensive tooling for developing state-of-the-art NLP systems that use LLMs (such as GPT-4, Falcon, and similar) and Transformer models. With Haystack, you can effortlessly experiment with various models hosted on platforms like Hugging Face, OpenAI, or Cohere, with models deployed on SageMaker, or with your own local models to find the perfect fit for your use case.
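As a rough sketch of that flexibility (the model names and API key below are placeholders, not recommendations), the same PromptNode interface can point either at a hosted API model or at a local Hugging Face model:

```python
from haystack.nodes import PromptNode

# Hosted model behind an API (assumes a valid OpenAI API key)
hosted_llm = PromptNode(model_name_or_path="gpt-3.5-turbo", api_key="YOUR_OPENAI_API_KEY")

# Local Hugging Face model, downloaded and run on your own hardware
local_llm = PromptNode(model_name_or_path="google/flan-t5-base")

# Both are called the same way, so swapping providers means changing a single argument
print(hosted_llm("Summarize: Haystack is an open source NLP framework."))
print(local_llm("Summarize: Haystack is an open source NLP framework."))
```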

[Image: Model Providers]

Some examples of what you can build include:

  • Semantic search on a large collection of documents in any language
  • Generative question answering on a knowledge base containing mixed types of information: images, text, and tables
  • Natural language chatbots powered by cutting-edge generative models like GPT-4
  • An LLM-based Haystack Agent capable of resolving complex queries
  • Information extraction from documents to populate your database or build a knowledge graph

This is just a small subset of the kinds of systems that can be created in Haystack.


Functionality for all stages of an NLP project


A successful NLP project requires more than just the language models. As an end-to-end framework, Haystack assists you in building your system every step of the way, offering tooling for each stage of the NLP project life cycle:

[Overview of the stages in the NLP project life cycle]

But that’s not all: metadata filtering, model distillation, the prompt hub, and more. Whatever your NLP heart desires, you’re likely to find it in Haystack. And if not? We’ll build it together.

[Image: Rest API]

Building blocks


Haystack uses a few simple but effective concepts to help you build fully functional and customized end-to-end NLP systems.


Components


At the core of Haystack are its components—fundamental building blocks that can perform tasks like document retrieval, text generation, or summarization. A single component is already quite powerful. It can manage local language models or communicate with a hosted model through an API.
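A single preview component really is usable on its own. The minimal sketch below mirrors the TopPSampler unit tests added in this diff; the documents and scores are made up for illustration:

```python
from haystack.preview import Document
from haystack.preview.components.samplers.top_p import TopPSampler

sampler = TopPSampler(top_p=0.95)
docs = [
    Document(text="Berlin", score=-10.6),
    Document(text="Belgrade", score=-8.9),
    Document(text="Sarajevo", score=-4.6),
]

# Keeps the highest-scored documents whose softmax probability mass fits within top_p
output = sampler.run(documents=docs)
print([doc.text for doc in output["documents"]])  # ["Sarajevo"] for these scores
```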

While Haystack offers a bunch of components you can use out of the box, it also lets you create your own custom components. Explore the collection of integrations that includes custom components developed by our community, which you can freely use.
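What a custom component could look like is sketched below, assuming the preview `@component` decorator and `output_types` API; the class and its names are purely illustrative:

```python
from haystack.preview import component


@component
class WhitespaceWordCounter:
    """Toy custom component that counts whitespace-separated words."""

    @component.output_types(count=int)
    def run(self, text: str):
        return {"count": len(text.split())}


counter = WhitespaceWordCounter()
print(counter.run(text="Haystack components are small and composable"))  # {"count": 6}
```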


You can chain components together to build pipelines, which are the foundation of the NLP app architecture in Haystack.
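A minimal sketch of that chaining with the preview Pipeline and the two components added in this diff; note that the SimilarityRanker import path and the exact Pipeline.run signature are assumptions here, not verified API:

```python
from haystack.preview import Pipeline
from haystack.preview.components.rankers.similarity import SimilarityRanker  # import path assumed
from haystack.preview.components.samplers.top_p import TopPSampler

pipeline = Pipeline()
pipeline.add_component("ranker", SimilarityRanker(model_name_or_path="cross-encoder/ms-marco-MiniLM-L-6-v2"))
pipeline.add_component("sampler", TopPSampler(top_p=0.95))

# The ranker's re-scored documents feed straight into the sampler
pipeline.connect("ranker.documents", "sampler.documents")

# Running the pipeline: inputs are addressed per component (exact signature may differ)
# result = pipeline.run({"ranker": {"query": "City in Germany", "documents": [...]}})
```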


Pipelines


Pipelines are powerful structures made up of components, such as a Retriever and a Reader, connected to infrastructure building blocks, such as a DocumentStore (for example, Elasticsearch or Weaviate), to form complex systems.


Haystack offers ready-made pipelines for most common tasks, such as question answering, document retrieval, or summarization. But it’s just as easy to design and create a custom pipeline for NLP scenarios that are way more complex than question answering.
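For example, a ready-made extractive question answering pipeline in Haystack 1.x can be assembled in a few lines; the document text and model name below are placeholders:

```python
from haystack import Document
from haystack.document_stores import InMemoryDocumentStore
from haystack.nodes import BM25Retriever, FARMReader
from haystack.pipelines import ExtractiveQAPipeline

# Index a document into an in-memory store with BM25 enabled
document_store = InMemoryDocumentStore(use_bm25=True)
document_store.write_documents([Document(content="Haystack is an open source NLP framework built by deepset.")])

pipeline = ExtractiveQAPipeline(
    reader=FARMReader(model_name_or_path="deepset/roberta-base-squad2"),
    retriever=BM25Retriever(document_store=document_store),
)

result = pipeline.run(query="Who built Haystack?", params={"Retriever": {"top_k": 5}, "Reader": {"top_k": 1}})
print(result["answers"][0].answer)
```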


Agents

The Haystack Agent makes use of a large language model to resolve complex tasks. When initializing the Agent, you give it a set of tools, which can be pipeline components or whole pipelines. The Agent can use those tools iteratively to arrive at an answer. When given a query, the Agent determines which tools are useful for answering it and calls them in a loop until it gets the answer. This way, it can achieve much more than extractive or generative question answering pipelines.
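A rough, self-contained sketch of that loop with the Haystack 1.x Agent API; the tool name, its description, the document text, and the API key are placeholders:

```python
from haystack import Document
from haystack.agents import Agent, Tool
from haystack.document_stores import InMemoryDocumentStore
from haystack.nodes import BM25Retriever, FARMReader, PromptNode
from haystack.pipelines import ExtractiveQAPipeline

# A small QA pipeline the Agent can call as a tool
document_store = InMemoryDocumentStore(use_bm25=True)
document_store.write_documents([Document(content="Haystack is an open source NLP framework built by deepset.")])
qa_pipeline = ExtractiveQAPipeline(
    reader=FARMReader(model_name_or_path="deepset/roberta-base-squad2"),
    retriever=BM25Retriever(document_store=document_store),
)

# The Agent drives the tool-calling loop with an LLM; stop_words keeps it from
# writing the tool's observation itself instead of actually calling the tool
agent = Agent(
    prompt_node=PromptNode(
        model_name_or_path="gpt-3.5-turbo", api_key="YOUR_OPENAI_API_KEY", stop_words=["Observation:"]
    )
)
agent.add_tool(
    Tool(
        name="DocumentSearch",
        pipeline_or_node=qa_pipeline,
        description="Useful for answering questions about the documents in the store",
    )
)

result = agent.run("Who built Haystack, and what is it for?")
```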

[Image: Agent Tools]

Who’s it for?


Haystack is for everyone looking to build natural language apps—NLP enthusiasts and newbies alike. You don’t need to understand how the models work under the hood. With Haystack’s modular and flexible components, pipelines, and agents, all you need is some basic knowledge of Python to dive right in.


Our community


At the heart of Haystack is the vibrant open source community that thrives on the diverse backgrounds and skill sets of its members. We value collaboration greatly and encourage our users to shape Haystack actively through GitHub contributions. Our Discord channel is a space where community members can connect, seek help, and learn from each other.


We also organize live online and in-person events, webinars, and office hours, which are an opportunity to learn and grow.

+ + + + + + + + +
+ + + +
+ Join Discord +
+ + + +
+
+ +

Enter the Haystack universe

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/test/prompt/invocation_layer/test_chatgpt.py b/test/prompt/invocation_layer/test_chatgpt.py index 9036353901..c1b816d493 100644 --- a/test/prompt/invocation_layer/test_chatgpt.py +++ b/test/prompt/invocation_layer/test_chatgpt.py @@ -1,6 +1,6 @@ +import logging from unittest.mock import patch -import logging import pytest from haystack.nodes.prompt.invocation_layer import ChatGPTInvocationLayer diff --git a/test/prompt/test_prompt_model.py b/test/prompt/test_prompt_model.py index 1ac1702a1d..9e7c2b0f51 100644 --- a/test/prompt/test_prompt_model.py +++ b/test/prompt/test_prompt_model.py @@ -1,4 +1,5 @@ -from unittest.mock import patch, Mock +import asyncio +from unittest.mock import patch, MagicMock import pytest @@ -36,3 +37,24 @@ def test_construtor_with_custom_model(): def test_constructor_with_no_supported_model(): with pytest.raises(ValueError, match="Model some-random-model is not supported"): PromptModel("some-random-model") + + +@pytest.mark.asyncio +async def test_ainvoke(): + def async_return(result): + f = asyncio.Future() + f.set_result(result) + return f + + mock_layer = MagicMock() # no async-defined methods, await will fail and fall back to regular `invoke` + mock_layer.return_value.invoke.return_value = async_return("Async Bar!") + model = PromptModel(invocation_layer_class=mock_layer) + assert await model.ainvoke("Foo") == "Async Bar!" + + +@pytest.mark.asyncio +async def test_ainvoke_falls_back_to_sync(): + mock_layer = MagicMock() # no async-defined methods, await will fail and fall back to regular `invoke` + mock_layer.return_value.invoke.return_value = "Bar!" + model = PromptModel(invocation_layer_class=mock_layer) + assert await model.ainvoke("Foo") == "Bar!" 
diff --git a/test/prompt/test_prompt_node.py b/test/prompt/test_prompt_node.py index fe9d034f1d..972a04be18 100644 --- a/test/prompt/test_prompt_node.py +++ b/test/prompt/test_prompt_node.py @@ -1,23 +1,20 @@ -import os import logging -from typing import Optional, Union, List, Dict, Any, Tuple -from unittest.mock import patch, Mock, MagicMock +import os +from typing import Any, Dict, List, Optional, Tuple, Union +from unittest.mock import AsyncMock, MagicMock, Mock, patch import pytest from prompthub import Prompt -from transformers import GenerationConfig, TextStreamer -from haystack import Document, Pipeline, BaseComponent, MultiLabel -from haystack.nodes.prompt import PromptTemplate, PromptNode, PromptModel -from haystack.nodes.prompt.prompt_template import LEGACY_DEFAULT_TEMPLATES +from haystack import BaseComponent, Document, MultiLabel, Pipeline +from haystack.nodes.prompt import PromptModel, PromptNode, PromptTemplate from haystack.nodes.prompt.invocation_layer import ( - HFLocalInvocationLayer, - DefaultTokenStreamingHandler, AzureChatGPTInvocationLayer, AzureOpenAIInvocationLayer, - OpenAIInvocationLayer, ChatGPTInvocationLayer, + OpenAIInvocationLayer, ) +from haystack.nodes.prompt.prompt_template import LEGACY_DEFAULT_TEMPLATES @pytest.fixture @@ -1049,36 +1046,63 @@ def test_chatgpt_direct_prompting_w_messages(chatgpt_prompt_model): @pytest.mark.unit @patch("haystack.nodes.prompt.invocation_layer.open_ai.load_openai_tokenizer", lambda tokenizer_name: None) @patch("haystack.nodes.prompt.prompt_model.PromptModel._ensure_token_limit", lambda self, prompt: prompt) -def test_content_moderation_gpt_3_and_gpt_3_5(): +def test_content_moderation_gpt_3(): """ - Check all possible cases of the moderation checks passing / failing in a PromptNode call - for both ChatGPTInvocationLayer and OpenAIInvocationLayer. + Check all possible cases of the moderation checks passing / failing in a PromptNode uses + OpenAIInvocationLayer. 
""" - prompt_node_gpt_3_5 = PromptNode( - model_name_or_path="gpt-3.5-turbo", api_key="key", model_kwargs={"moderate_content": True} - ) - prompt_node_gpt_3 = PromptNode( + prompt_node = PromptNode( model_name_or_path="text-davinci-003", api_key="key", model_kwargs={"moderate_content": True} ) with patch("haystack.nodes.prompt.invocation_layer.open_ai.check_openai_policy_violation") as mock_check, patch( - "haystack.nodes.prompt.invocation_layer.chatgpt.ChatGPTInvocationLayer._execute_openai_request" - ) as mock_execute_gpt_3_5, patch( - "haystack.nodes.prompt.invocation_layer.open_ai.OpenAIInvocationLayer._execute_openai_request" - ) as mock_execute_gpt_3: + "haystack.nodes.prompt.invocation_layer.open_ai.openai_request" + ) as mock_request: VIOLENT_TEXT = "some violent text" mock_check.side_effect = lambda input, headers: input == VIOLENT_TEXT or input == [VIOLENT_TEXT] # case 1: prompt fails the moderation check # prompt should not be sent to OpenAi & function should return an empty list mock_check.return_value = True - assert prompt_node_gpt_3_5(VIOLENT_TEXT) == prompt_node_gpt_3(VIOLENT_TEXT) == [] + assert prompt_node(VIOLENT_TEXT) == [] # case 2: prompt passes the moderation check but the generated output fails the check # function should also return an empty list - mock_execute_gpt_3_5.return_value = mock_execute_gpt_3.return_value = [VIOLENT_TEXT] - assert prompt_node_gpt_3_5("normal prompt") == prompt_node_gpt_3("normal prompt") == [] + mock_request.return_value = {"choices": [{"text": VIOLENT_TEXT, "finish_reason": ""}]} + assert prompt_node("normal prompt") == [] # case 3: both prompt and output pass the moderation check # function should return the output - mock_execute_gpt_3_5.return_value = mock_execute_gpt_3.return_value = ["normal output"] - assert prompt_node_gpt_3_5("normal prompt") == prompt_node_gpt_3("normal prompt") == ["normal output"] + mock_request.return_value = {"choices": [{"text": "normal output", "finish_reason": ""}]} + assert prompt_node("normal prompt") == ["normal output"] + + +@pytest.mark.unit +@patch("haystack.nodes.prompt.invocation_layer.open_ai.load_openai_tokenizer", lambda tokenizer_name: None) +@patch("haystack.nodes.prompt.prompt_model.PromptModel._ensure_token_limit", lambda self, prompt: prompt) +def test_content_moderation_gpt_35(): + """ + Check all possible cases of the moderation checks passing / failing in a PromptNode uses + ChatGPTInvocationLayer. 
+ """ + prompt_node = PromptNode(model_name_or_path="gpt-3.5-turbo", api_key="key", model_kwargs={"moderate_content": True}) + with patch("haystack.nodes.prompt.invocation_layer.chatgpt.check_openai_policy_violation") as mock_check, patch( + "haystack.nodes.prompt.invocation_layer.chatgpt.openai_request" + ) as mock_request: + VIOLENT_TEXT = "some violent text" + mock_check.side_effect = lambda input, headers: input == VIOLENT_TEXT or input == [VIOLENT_TEXT] + # case 1: prompt fails the moderation check + # prompt should not be sent to OpenAi & function should return an empty list + mock_check.return_value = True + assert prompt_node(VIOLENT_TEXT) == [] + # case 2: prompt passes the moderation check but the generated output fails the check + # function should also return an empty list + mock_request.return_value = { + "choices": [{"message": {"content": VIOLENT_TEXT, "role": "assistant"}, "finish_reason": ""}] + } + assert prompt_node("normal prompt") == [] + # case 3: both prompt and output pass the moderation check + # function should return the output + mock_request.return_value = { + "choices": [{"message": {"content": "normal output", "role": "assistant"}, "finish_reason": ""}] + } + assert prompt_node("normal prompt") == ["normal output"] @patch("haystack.nodes.prompt.prompt_node.PromptModel") @@ -1098,3 +1122,83 @@ def test_prompt_node_warns_about_missing_documents(mock_model, caplog): "Expected prompt parameter 'documents' to be provided but it is missing. " "Continuing with an empty list of documents." in caplog.text ) + + +@pytest.mark.unit +@patch("haystack.nodes.prompt.prompt_node.PromptModel") +def test__prepare_invocation_context_is_empty(mock_model): + node = PromptNode() + node.get_prompt_template = MagicMock(return_value="Test Template") + + kwargs = { + "query": "query", + "file_paths": ["foo", "bar"], + "labels": ["label", "another"], + "documents": ["A", "B"], + "meta": {"meta_key": "meta_value"}, + "prompt_template": "my-test-prompt", + "invocation_context": None, + "generation_kwargs": {"gen_key": "gen_value"}, + } + + invocation_context = node._prepare(**kwargs) + + node.get_prompt_template.assert_called_once_with("my-test-prompt") + assert invocation_context == { + "query": "query", + "file_paths": ["foo", "bar"], + "labels": ["label", "another"], + "documents": ["A", "B"], + "meta": {"meta_key": "meta_value"}, + "prompt_template": "Test Template", + "gen_key": "gen_value", + } + + +@pytest.mark.unit +@patch("haystack.nodes.prompt.prompt_node.PromptModel") +def test__prepare_invocation_context_was_passed(mock_model): + node = PromptNode() + + # Test invocation_context is left untouched + invocation_context = { + "query": "query", + "file_paths": ["foo", "bar"], + "labels": ["label", "another"], + "documents": ["A", "B"], + "meta": {"meta_key": "meta_value"}, + "prompt_template": "my-test-prompt", + "invocation_context": None, + } + kwargs = { + "query": None, + "file_paths": None, + "labels": None, + "documents": None, + "meta": None, + "prompt_template": None, + "invocation_context": invocation_context, + "generation_kwargs": None, + } + + assert node._prepare(**kwargs) == invocation_context + + +@pytest.mark.unit +@pytest.mark.asyncio +@patch("haystack.nodes.prompt.prompt_node.PromptModel") +async def test_arun(mock_model): + node = PromptNode() + node._aprompt = AsyncMock() + await node.arun("a query") + node._aprompt.assert_awaited_once_with(prompt_collector=[], query="a query", prompt_template=None) + + +@pytest.mark.unit +@pytest.mark.asyncio 
+@patch("haystack.nodes.prompt.prompt_node.PromptModel") +async def test_aprompt(mock_model): + node = PromptNode() + mock_model.return_value.ainvoke = AsyncMock() + await node._aprompt(PromptTemplate("test template")) + mock_model.return_value.ainvoke.assert_awaited_once() diff --git a/test/samples/dc/pipeline_config.json b/test/samples/dc/pipeline_config.json index 25d4b29e5a..f10fd32190 100644 --- a/test/samples/dc/pipeline_config.json +++ b/test/samples/dc/pipeline_config.json @@ -1,6 +1,5 @@ { "version": "ignore", - "name": "document_retrieval_1", "components": [ { "name": "DocumentStore", diff --git a/test/utils/test_deepset_cloud.py b/test/utils/test_deepset_cloud.py new file mode 100644 index 0000000000..e4951a1fcf --- /dev/null +++ b/test/utils/test_deepset_cloud.py @@ -0,0 +1,70 @@ +import json +from pathlib import Path +from typing import Any, Dict +from unittest.mock import Mock + +import pytest +import yaml + +from haystack.utils.deepsetcloud import DeepsetCloudClient, PipelineClient + + +@pytest.fixture +def pipeline_config(samples_path: Path) -> Dict[str, Any]: + with (samples_path / "dc" / "pipeline_config.json").open() as f: + return json.load(f) + + +@pytest.fixture() +def mocked_client() -> Mock: + api_client = Mock(spec=DeepsetCloudClient) + + api_client.build_workspace_url.return_value = "https://dc" + + return api_client + + +@pytest.fixture() +def mock_success_response() -> Mock: + mock_response = Mock() + mock_response.json.return_value = {"name": "test_pipeline"} + + return mock_response + + +class TestSaveConfig: + def test_save_config( + self, pipeline_config: Dict[str, Any], mocked_client: Mock, mock_success_response: Mock + ) -> None: + mocked_client.post.return_value = mock_success_response + + pipeline_name = "test_pipeline" + workspace_name = "test_workspace" + + pipeline_client = PipelineClient(client=mocked_client) + + pipeline_client.save_pipeline_config( + config=pipeline_config, pipeline_config_name=pipeline_name, workspace=workspace_name + ) + + expected_payload = {"name": pipeline_name, "config": yaml.dump(pipeline_config)} + mocked_client.post.assert_called_once_with(url="https://dc/pipelines", json=expected_payload, headers=None) + + +class TestUpdateConfig: + def test_update_config( + self, pipeline_config: Dict[str, Any], mocked_client: Mock, mock_success_response: Mock + ) -> None: + mocked_client.put.return_value = mock_success_response + pipeline_name = "test_pipeline" + workspace_name = "test_workspace" + + pipeline_client = PipelineClient(client=mocked_client) + + pipeline_client.update_pipeline_config( + config=pipeline_config, pipeline_config_name=pipeline_name, workspace=workspace_name + ) + + mocked_client.put.assert_called_once_with( + url=f"https://dc/pipelines/{pipeline_name}/yaml", data=yaml.dump(pipeline_config), headers=None + )