From 4d3fd160c51e005fee05a811d0db5c945e5fcad9 Mon Sep 17 00:00:00 2001 From: Joseph McElroy Date: Mon, 4 Dec 2023 10:58:23 +0000 Subject: [PATCH 1/7] notebook wip --- .../langchain-parent-retriever.ipynb | 109 ++++++++++++++++++ 1 file changed, 109 insertions(+) create mode 100644 notebooks/langchain/langchain-parent-retriever.ipynb diff --git a/notebooks/langchain/langchain-parent-retriever.ipynb b/notebooks/langchain/langchain-parent-retriever.ipynb new file mode 100644 index 00000000..d048b850 --- /dev/null +++ b/notebooks/langchain/langchain-parent-retriever.ipynb @@ -0,0 +1,109 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "!python3 -m pip install -qU langchain elasticsearch " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "# import modules\n", + "from getpass import getpass\n", + "from langchain.vectorstores import ElasticsearchStore\n", + "from langchain.embeddings.openai import OpenAIEmbeddings\n", + "\n", + "# https://www.elastic.co/search-labs/tutorials/install-elasticsearch/elastic-cloud#finding-your-cloud-id\n", + "ELASTIC_CLOUD_ID = getpass(\"Elastic Cloud ID: \")\n", + "\n", + "# https://www.elastic.co/search-labs/tutorials/install-elasticsearch/elastic-cloud#creating-an-api-key\n", + "ELASTIC_API_KEY = getpass(\"Elastic Api Key: \")\n", + "\n", + "# https://platform.openai.com/api-keys\n", + "OPENAI_API_KEY = getpass(\"OpenAI API key: \")\n", + "\n", + "embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)\n", + "\n", + "vector_store = ElasticsearchStore(\n", + " es_cloud_id=ELASTIC_CLOUD_ID, \n", + " es_api_key=ELASTIC_API_KEY,\n", + " index_name= \"workplace_index\", \n", + " 
embedding=embeddings\n", + ")\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.text_splitter import RecursiveCharacterTextSplitter\n", + "\n", + "\n", + "def parent_child_splitter(data, id_key=PARENT_DOC_ID_KEY):\n", + " parent_splitter = RecursiveCharacterTextSplitter(chunk_size=2000)\n", + " # This text splitter is used to create the child documents\n", + " # It should create documents smaller than the parent\n", + " child_splitter = RecursiveCharacterTextSplitter(chunk_size=400)\n", + " documents = parent_splitter.split_documents(data)\n", + " doc_ids = [str(uuid.uuid4()) for _ in documents]\n", + "\n", + " docs = []\n", + " for i, doc in enumerate(documents):\n", + " _id = doc_ids[i]\n", + " sub_docs = child_splitter.split_documents([doc])\n", + " for _doc in sub_docs:\n", + " _doc.metadata[id_key] = _id\n", + " _doc.metadata[\"doc_level\"] = \"child\"\n", + " docs.extend(sub_docs)\n", + " doc.metadata[id_key] = _id\n", + " doc.metadata[\"doc_level\"] = \"parent\"\n", + " return documents, docs\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.3" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From 16dea980723e12fd4a154d009a49610aed7ee93f Mon Sep 17 00:00:00 2001 From: Joseph McElroy Date: Wed, 6 Dec 2023 14:27:11 +0000 Subject: [PATCH 2/7] remove explicitly declaring the shard settings --- notebooks/document-chunking/with-index-pipelines.ipynb | 2 -- 1 file changed, 2 deletions(-) diff --git a/notebooks/document-chunking/with-index-pipelines.ipynb b/notebooks/document-chunking/with-index-pipelines.ipynb index cd5e92fa..c4fced89 100644 --- 
a/notebooks/document-chunking/with-index-pipelines.ipynb +++ b/notebooks/document-chunking/with-index-pipelines.ipynb @@ -309,8 +309,6 @@ " index=INDEX_NAME, \n", " settings={\n", " \"index\": {\n", - " \"number_of_replicas\": \"1\",\n", - " \"number_of_shards\": \"1\",\n", " \"default_pipeline\": \"chunk_text_to_passages\"\n", " }\n", " },\n", From b242f690498880289f97d04ee3bdc302c223a126 Mon Sep 17 00:00:00 2001 From: Joseph McElroy Date: Wed, 6 Dec 2023 14:27:25 +0000 Subject: [PATCH 3/7] remove unused fn --- .../chatbot-with-bm25-only-example.ipynb | 27 ------------------- 1 file changed, 27 deletions(-) diff --git a/notebooks/langchain/self-query-retriever-examples/chatbot-with-bm25-only-example.ipynb b/notebooks/langchain/self-query-retriever-examples/chatbot-with-bm25-only-example.ipynb index 5ca8cf9b..04acd6ec 100644 --- a/notebooks/langchain/self-query-retriever-examples/chatbot-with-bm25-only-example.ipynb +++ b/notebooks/langchain/self-query-retriever-examples/chatbot-with-bm25-only-example.ipynb @@ -303,33 +303,6 @@ "from langchain.prompts import ChatPromptTemplate, PromptTemplate\n", "from langchain.schema import format_document\n", "\n", - "def custom_query(query_body, query):\n", - " filters = query_body.get(\"knn\", {}).get(\"filter\", [])\n", - " \n", - " print(f\"filters: {filters}\")\n", - " print(f\"query: {query}\")\n", - "\n", - " if query.strip() != \"\":\n", - " query_clause = [{\n", - " \"multi_match\": {\n", - " \"query\": query,\n", - " \"fields\": [\"text\"],\n", - " \"fuzziness\": \"AUTO\",\n", - " }\n", - " }]\n", - " else:\n", - " query_clause = []\n", - "\n", - "\n", - " return {\n", - " \"query\": {\n", - " \"bool\": {\n", - " \"filter\": filters,\n", - " \"must\": query_clause\n", - " }\n", - " },\n", - " }\n", - "\n", "retriever = SelfQueryRetriever.from_llm(\n", " llm, \n", " vectorstore, \n", From 7930341240a8615634299a847c3b75c90f33fa0a Mon Sep 17 00:00:00 2001 From: Joseph McElroy Date: Wed, 6 Dec 2023 14:27:43 +0000 Subject: 
[PATCH 4/7] notebook for parent / child capability --- .../langchain-parent-retriever.ipynb | 809 +++++++++++++++++- 1 file changed, 766 insertions(+), 43 deletions(-) diff --git a/notebooks/langchain/langchain-parent-retriever.ipynb b/notebooks/langchain/langchain-parent-retriever.ipynb index d048b850..a18f3ced 100644 --- a/notebooks/langchain/langchain-parent-retriever.ipynb +++ b/notebooks/langchain/langchain-parent-retriever.ipynb @@ -1,87 +1,810 @@ { "cells": [ { - "cell_type": "code", - "execution_count": null, + "cell_type": "markdown", "metadata": {}, - "outputs": [], - "source": [] + "source": [ + "# Parent Child Retriever Examples\n", + "**Using Elasticsearch Nested Dense Vector Support**\n", + "\n", + "When splitting documents for retrieval, there are often conflicting desires:\n", + "\n", + "- You may want to have small documents, so that their embeddings can most accurately reflect their meaning. If too long, then the embeddings can lose meaning.\n", + "- You want to have long enough documents that the context of each chunk is retained.\n", + "\n", + "We can take advantage of Nested Dense Vector capability in Elasticsearch to store both large passages and smaller linked passages in one document. During retrieval, we query for small passages which link back to a larger parent passage.\n", + "\n", + "Note that “parent document” refers to the document that a small chunk originated from. This can either be the whole raw document OR a larger chunk." + ] }, { - "cell_type": "code", - "execution_count": null, + "cell_type": "markdown", "metadata": {}, - "outputs": [], - "source": [] + "source": [ + "## Dependencies\n", + "In this notebook, we're going to use Langchain and the Elasticsearch python client.\n", + "\n", + "We will also require a running Elasticsearch instance with an ML node and model deployed to it." 
+ ] }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 36, "metadata": {}, "outputs": [], "source": [ "!python3 -m pip install -qU langchain elasticsearch " ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Connect to Elasticsearch\n", + "\n", + "ℹ️ We're using an Elastic Cloud deployment of Elasticsearch for this notebook. If you don't have an Elastic Cloud deployment, sign up [here](https://cloud.elastic.co/registration?utm_source=github&utm_content=elasticsearch-labs-notebook) for a free trial. \n", + "\n", + "We'll use the **Cloud ID** to identify our deployment, because we are using an Elastic Cloud deployment. To find the Cloud ID for your deployment, go to https://cloud.elastic.co/deployments and select your deployment." + ] + }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "\n", - "# import modules\n", "from getpass import getpass\n", - "from langchain.vectorstores import ElasticsearchStore\n", - "from langchain.embeddings.openai import OpenAIEmbeddings\n", "\n", "# https://www.elastic.co/search-labs/tutorials/install-elasticsearch/elastic-cloud#finding-your-cloud-id\n", "ELASTIC_CLOUD_ID = getpass(\"Elastic Cloud ID: \")\n", "\n", "# https://www.elastic.co/search-labs/tutorials/install-elasticsearch/elastic-cloud#creating-an-api-key\n", - "ELASTIC_API_KEY = getpass(\"Elastic Api Key: \")\n", + "ELASTIC_API_KEY = getpass(\"Elastic Api Key: \")" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [], + "source": [ + "from elasticsearch import Elasticsearch\n", + "\n", + "client = Elasticsearch(cloud_id=ELASTIC_CLOUD_ID, api_key=ELASTIC_API_KEY)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Download our example Dataset\n", + "We are going to use Langchain's tooling to ingest and split raw documents into smaller chunks. 
We are using our example workplace search dataset.\n", + "\n", + "LangChain has a number of other loaders to ingest data from other sources. See their [core loaders](https://python.langchain.com/docs/modules/data_connection/document_loaders/) or [loaders integration](https://python.langchain.com/docs/integrations/document_loaders) for more information. " + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "from urllib.request import urlopen\n", + "import json\n", "\n", - "# https://platform.openai.com/api-keys\n", - "OPENAI_API_KEY = getpass(\"OpenAI API key: \")\n", + "url = \"https://raw.githubusercontent.com/elastic/elasticsearch-labs/main/datasets/workplace-documents.json\"\n", "\n", - "embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)\n", + "response = urlopen(url)\n", + "data = json.load(response)\n", "\n", - "vector_store = ElasticsearchStore(\n", - " es_cloud_id=ELASTIC_CLOUD_ID, \n", - " es_api_key=ELASTIC_API_KEY,\n", - " index_name= \"workplace_index\", \n", - " embedding=embeddings\n", - ")\n" + "with open('temp.json', 'w') as json_file:\n", + " json.dump(data, json_file)\n" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ - "from langchain.text_splitter import RecursiveCharacterTextSplitter\n", + "from langchain.document_loaders import JSONLoader \n", "\n", + "def metadata_func(record: dict, metadata: dict) -> dict:\n", + " metadata[\"name\"] = record.get(\"name\")\n", + " metadata[\"summary\"] = record.get(\"summary\")\n", + " metadata[\"url\"] = record.get(\"url\")\n", + " metadata[\"category\"] = record.get(\"category\")\n", + " metadata[\"updated_at\"] = record.get(\"updated_at\")\n", "\n", - "def parent_child_splitter(data, id_key=PARENT_DOC_ID_KEY):\n", - " parent_splitter = RecursiveCharacterTextSplitter(chunk_size=2000)\n", - " # This text splitter is used to create the child documents\n", - " # It 
should create documents smaller than the parent\n", - " child_splitter = RecursiveCharacterTextSplitter(chunk_size=400)\n", + " return metadata\n", + "\n", + "# For more loaders https://python.langchain.com/docs/modules/data_connection/document_loaders/\n", + "# And 3rd party loaders https://python.langchain.com/docs/modules/data_connection/document_loaders/#third-party-loaders\n", + "loader = JSONLoader(\n", + " file_path=\"temp.json\",\n", + " jq_schema=\".[]\",\n", + " content_key=\"content\",\n", + " metadata_func=metadata_func,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Setting up our Elasticsearch Index\n", + "In this example we're going to use a pipeline to do the inference and store the embeddings in our index. \n", + "\n", + "In this example, we are using the sentence transformers minilm-l6-v2 model, which you will need to have running on the ML node. With this model, we are setting up an index_pipeline to do the inference and store the embeddings in our index." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'nb_parent_retriever_index'})" + ] + }, + "execution_count": 43, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "PIPELINE_ID = \"chunk_text_to_passages\"\n", + "MODEL_ID = \"sentence-transformers__all-minilm-l6-v2\"\n", + "MODEL_DIMS = 384\n", + "INDEX_NAME = \"nb_parent_retriever_index\"\n", + "\n", + "# Create the pipeline\n", + "client.ingest.put_pipeline(\n", + " id=PIPELINE_ID, \n", + " processors=[\n", + " {\n", + " \"foreach\": {\n", + " \"field\": \"passages\",\n", + " \"processor\": {\n", + " \"inference\": {\n", + " \"field_map\": {\n", + " \"_ingest._value.text\": \"text_field\"\n", + " },\n", + " \"model_id\": MODEL_ID,\n", + " \"target_field\": \"_ingest._value.vector\",\n", + " \"on_failure\": [\n", + " {\n", + " \"append\": {\n", + " \"field\": \"_source._ingest.inference_errors\",\n", + " \"value\": [\n", + " {\n", + " \"message\": \"Processor 'inference' in pipeline 'ml-inference-title-vector' failed with message '{{ _ingest.on_failure_message }}'\",\n", + " \"pipeline\": \"ml-inference-title-vector\",\n", + " \"timestamp\": \"{{{ _ingest.timestamp }}}\"\n", + " }\n", + " ]\n", + " }\n", + " }\n", + " ]\n", + " }\n", + " }\n", + " }\n", + " }\n", + " ]\n", + ")\n", + "\n", + "# Create the index\n", + "client.indices.create( \n", + " index=INDEX_NAME, \n", + " settings={\n", + " \"index\": {\n", + " \"default_pipeline\": PIPELINE_ID\n", + " }\n", + " },\n", + " mappings={\n", + " \"dynamic\": \"true\",\n", + " \"properties\": {\n", + " \"passages\": {\n", + " \"type\": \"nested\",\n", + " \"properties\": {\n", + " \"vector\": {\n", + " \"properties\": {\n", + " \"predicted_value\": {\n", + " \"type\": \"dense_vector\",\n", + " \"index\": True,\n", + " \"dims\": MODEL_DIMS,\n", + " \"similarity\": 
\"dot_product\"\n", + " }\n", + " }\n", + " }\n", + " }\n", + " }\n", + " }\n", + " }\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Utils: Parent Child Splitter Function\n", + "This function will split a document into multiple passages, and return the parent document with the child passages. \n", + "\n", + "It also has an option to chunk the parent document into smaller documents, meaning the parent document will be split into multiple index documents. We will use this in example 2." + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.text_splitter import RecursiveCharacterTextSplitter\n", + "\n", + "def parent_child_splitter(data, parent_chunk_size: int | None = None, child_chunk_size: int = 200):\n", + " if parent_chunk_size:\n", + " parent_splitter = RecursiveCharacterTextSplitter(chunk_size=parent_chunk_size)\n", " documents = parent_splitter.split_documents(data)\n", - " doc_ids = [str(uuid.uuid4()) for _ in documents]\n", - "\n", - " docs = []\n", - " for i, doc in enumerate(documents):\n", - " _id = doc_ids[i]\n", - " sub_docs = child_splitter.split_documents([doc])\n", - " for _doc in sub_docs:\n", - " _doc.metadata[id_key] = _id\n", - " _doc.metadata[\"doc_level\"] = \"child\"\n", - " docs.extend(sub_docs)\n", - " doc.metadata[id_key] = _id\n", - " doc.metadata[\"doc_level\"] = \"parent\"\n", - " return documents, docs\n" + " else:\n", + " documents = data\n", + "\n", + " child_splitter = RecursiveCharacterTextSplitter(chunk_size=child_chunk_size)\n", + "\n", + " docs = []\n", + " for i, doc in enumerate(documents):\n", + " passages = []\n", + "\n", + " for _doc in child_splitter.split_documents([doc]):\n", + " passages.append({\n", + " \"text\": _doc.page_content,\n", + " })\n", + "\n", + " doc = {\n", + " \"content\": doc.page_content,\n", + " \"metadata\": doc.metadata,\n", + " \"passages\": passages\n", + " }\n", + " 
docs.append(doc)\n", + " \n", + " return docs\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Utils: Pretty Response\n", + "This function will print out the response from Elasticsearch in an easier to read format." + ] + }, + { + "cell_type": "code", + "execution_count": 58, + "metadata": {}, + "outputs": [], + "source": [ + "def pretty_response(response, show_parent_text=False):\n", + " if len(response['hits']['hits']) == 0:\n", + " print('Your search returned no results.')\n", + " else:\n", + " for hit in response['hits']['hits']:\n", + " id = hit['_id']\n", + " score = hit['_score']\n", + " doc_title = hit['_source'][\"metadata\"]['name']\n", + " parent_text = \"\"\n", + "\n", + " if show_parent_text:\n", + " parent_text = hit['_source'][\"content\"]\n", + "\n", + " passage_text = \"\"\n", + "\n", + " for passage in hit['inner_hits']['passages']['hits']['hits']:\n", + " passage_text += passage[\"fields\"][\"passages\"][0]['text'][0] + \"\\n\\n\"\n", + "\n", + " pretty_output = (f\"\\nID: {id}\\nDoc Title: {doc_title}\\nparent text:\\n{parent_text}\\nPassage Text:\\n{passage_text}\\nScore: {score}\\n\")\n", + " print(pretty_output)\n", + " print(\"---\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Example 1: Full Document, nested passages\n", + "In this example we will split a document into passages, and store the full document as a parent document. We will then store the passages as nested documents, with a link back to the parent document.\n", + "\n", + "Below we are using the parent child splitter to split the full documents into passages. The `parent_child_splitter` fn returns a list of documents, with an array of nested passages. \n", + "\n", + "We then index these documents into Elasticsearch. This will index the full document and the passages will be stored in a nested field. 
\n", + "\n", + "Our index pipeline processor will then run the inference on the passages, and store the embeddings in the index." + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Indexed 15 documents with [] errors\n" + ] + } + ], + "source": [ + "from elasticsearch import helpers\n", + "\n", + "chunked_docs = parent_child_splitter(loader.load(), parent_chunk_size=None)\n", + "\n", + "count, errors = helpers.bulk(\n", + " client, \n", + " chunked_docs,\n", + " index=INDEX_NAME\n", + ")\n", + "\n", + "print(f\"Indexed {count} documents with {errors} errors\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Perform a Nested Search\n", + "We can now perform a nested search, to find the passages that match our query, which will be returned in `inner_hits`." + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "ID: AvgyPowBeCQuLJUsS_Tv\n", + "Doc Title: Work From Home Policy\n", + "Passage Text:\n", + "Effective: March 2020\n", + "Purpose\n", + "\n", + "The purpose of this full-time work-from-home policy is to provide guidelines and support for employees to conduct their work remotely, ensuring the continuity and productivity of business operations during the COVID-19 pandemic and beyond.\n", + "Scope\n", + "\n", + "\n", + "Score: 0.84295774\n", + "\n", + "---\n", + "\n", + "ID: CfgyPowBeCQuLJUsS_Tv\n", + "Doc Title: Intellectual Property Policy\n", + "Passage Text:\n", + "Scope\n", + "This policy applies to all employees, including full-time, part-time, temporary, and contract employees.\n", + "\n", + "\n", + "Score: 0.7304177\n", + "\n", + "---\n", + "\n", + "ID: BvgyPowBeCQuLJUsS_Tv\n", + "Doc Title: Company Vacation Policy\n", + "Passage Text:\n", + "Purpose\n", + "\n", + "The purpose of this vacation policy is to 
outline the guidelines and procedures for requesting and taking time off from work for personal and leisure purposes. This policy aims to promote a healthy work-life balance and encourage employees to take time to rest and recharge.\n", + "Scope\n", + "\n", + "\n", + "Score: 0.71928245\n", + "\n", + "---\n", + "\n", + "ID: BPgyPowBeCQuLJUsS_Tv\n", + "Doc Title: Wfh Policy Update May 2023\n", + "Passage Text:\n", + "As we continue to prioritize the well-being of our employees, we are making a slight adjustment to our hybrid work policy. Starting May 1, 2023, employees will be required to work from the office three days a week, with two days designated for remote work. Please communicate with your supervisor and HR department to establish your updated in-office workdays.\n", + "\n", + "\n", + "Score: 0.70840263\n", + "\n", + "---\n", + "\n", + "ID: EPgyPowBeCQuLJUsS_Tv\n", + "Doc Title: New Employee Onboarding Guide\n", + "Passage Text:\n", + "Designate beneficiaries: If applicable, designate beneficiaries for your life insurance and retirement plans.\n", + "Getting Settled in Your Workspace\n", + "To help you feel comfortable and productive in your new workspace, take the following steps:\n", + "\n", + "\n", + "Score: 0.69544923\n", + "\n", + "---\n" + ] + } + ], + "source": [ + "response = client.search(\n", + " index=INDEX_NAME, \n", + " knn={\n", + " \"inner_hits\": {\n", + " \"_source\": False,\n", + " \"fields\": [\n", + " \"passages.text\"\n", + " ]\n", + " },\n", + " \"field\": \"passages.vector.predicted_value\",\n", + " \"k\": 5,\n", + " \"num_candidates\": 100,\n", + " \"query_vector_builder\": {\n", + " \"text_embedding\": {\n", + " \"model_id\": \"sentence-transformers__all-minilm-l6-v2\",\n", + " \"model_text\": \"Whats the work from home policy?\"\n", + " }\n", + " }\n", + " }\n", + ")\n", + "\n", + "pretty_response(response)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### With Langchain\n", + "We can also perform this 
search within Langchain with an adjustment to the query.\n", + "\n", + "We also override the `doc_builder` to populate the `page_content` with the passages rather than the full document." + ] + }, + { + "cell_type": "code", + "execution_count": 62, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[Document(page_content='Effective: March 2020\\nPurpose\\n\\nThe purpose of this full-time work-from-home policy is to provide guidelines and support for employees to conduct their work remotely, ensuring the continuity and productivity of business operations during the COVID-19 pandemic and beyond.\\nScope\\n\\n', metadata={'summary': 'This policy outlines the guidelines for full-time remote work, including eligibility, equipment and resources, workspace requirements, communication expectations, performance expectations, time tracking and overtime, confidentiality and data security, health and well-being, and policy reviews and updates. Employees are encouraged to direct any questions or concerns', 'updated_at': '2020-03-01', 'name': 'Work From Home Policy', 'source': '/Users/joe/projects/elastic/elasticsearch-labs/notebooks/langchain/temp.json', 'category': 'teams', 'seq_num': 1, 'url': './sharepoint/Work from home policy.txt'}),\n", + " Document(page_content='Scope\\nThis policy applies to all employees, including full-time, part-time, temporary, and contract employees.\\n\\n', metadata={'summary': \"This Intellectual Property Policy outlines guidelines and procedures for the ownership, protection, and utilization of intellectual property generated by employees during their employment. It establishes the company's ownership of work generated on company time, while recognizing employee ownership of work generated outside of company time without the use of company resources. 
The policy\", 'updated_at': None, 'name': 'Intellectual Property Policy', 'source': '/Users/joe/projects/elastic/elasticsearch-labs/notebooks/langchain/temp.json', 'category': 'sharepoint', 'seq_num': 8, 'url': 'https://enterprisesearch.sharepoint.com/:t:/s/MSBuilddemo/EWz3cYEVdzBNsiHsYbKhms4BVYGhravyrUw3T3lzxL4pTg?e=mPIgbO'}),\n", + " Document(page_content='Purpose\\n\\nThe purpose of this vacation policy is to outline the guidelines and procedures for requesting and taking time off from work for personal and leisure purposes. This policy aims to promote a healthy work-life balance and encourage employees to take time to rest and recharge.\\nScope\\n\\n', metadata={'summary': ': This policy outlines the guidelines and procedures for requesting and taking time off from work for personal and leisure purposes. Full-time employees accrue vacation time at a rate of [X hours] per month, equivalent to [Y days] per year. Vacation requests must be submitted to supervisors at least', 'updated_at': '2018-04-16', 'name': 'Company Vacation Policy', 'source': '/Users/joe/projects/elastic/elasticsearch-labs/notebooks/langchain/temp.json', 'category': 'sharepoint', 'seq_num': 5, 'url': 'https://enterprisesearch.sharepoint.com/:t:/s/MSBuilddemo/ES6rw9bKZxVBobG1WUoJpikBF9Bhx1pw_GvJWbsg-Z_HNA?e=faSHVt'}),\n", + " Document(page_content='As we continue to prioritize the well-being of our employees, we are making a slight adjustment to our hybrid work policy. Starting May 1, 2023, employees will be required to work from the office three days a week, with two days designated for remote work. 
Please communicate with your supervisor and HR department to establish your updated in-office workdays.\\n\\n', metadata={'summary': 'Starting May 1, 2023, our hybrid work policy will require employees to work from the office three days a week and two days remotely.', 'updated_at': '2023-05-01', 'name': 'Wfh Policy Update May 2023', 'source': '/Users/joe/projects/elastic/elasticsearch-labs/notebooks/langchain/temp.json', 'category': 'teams', 'seq_num': 3, 'url': './sharepoint/WFH policy update May 2023.txt'})]" + ] + }, + "execution_count": 62, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from langchain.vectorstores.elasticsearch import ElasticsearchStore, ApproxRetrievalStrategy\n", + "from typing import List, Union\n", + "from langchain_core.documents import Document\n", + "\n", + "class CustomRetrievalStrategy(ApproxRetrievalStrategy):\n", + "\n", + " def query(\n", + " self,\n", + " query: Union[str, None],\n", + " filter: List[dict],\n", + " **kwargs,\n", + " ):\n", + " \n", + " es_query = {\n", + " \"knn\": {\n", + " \"inner_hits\": {\n", + " \"_source\": False,\n", + " \"fields\": [\n", + " \"passages.text\"\n", + " ]\n", + " },\n", + " \"field\": \"passages.vector.predicted_value\",\n", + " \"filter\": filter,\n", + " \"k\": 5,\n", + " \"num_candidates\": 100,\n", + " \"query_vector_builder\": {\n", + " \"text_embedding\": {\n", + " \"model_id\": \"sentence-transformers__all-minilm-l6-v2\",\n", + " \"model_text\": query\n", + " }\n", + " }\n", + " }\n", + " }\n", + "\n", + " return es_query\n", + " \n", + "\n", + "vector_store = ElasticsearchStore(\n", + " index_name=INDEX_NAME,\n", + " es_connection=client,\n", + " query_field=\"content\",\n", + " strategy=CustomRetrievalStrategy(),\n", + ")\n", + "\n", + "def doc_builder(hit):\n", + " passage_hits = hit.get(\"inner_hits\", {}).get(\"passages\", {}).get(\"hits\", {}).get(\"hits\", [])\n", + " page_content = \"\"\n", + " for passage_hit in passage_hits:\n", + " 
passage_fields = passage_hit.get(\"fields\", {}).get(\"passages\", [])[0]\n", + " page_content += passage_fields.get(\"text\", [])[0] + \"\\n\\n\"\n", + "\n", + " return Document(\n", + " page_content=page_content,\n", + " metadata=hit[\"_source\"][\"metadata\"],\n", + " )\n", + "\n", + "vector_store.similarity_search(query=\"Whats the work from home policy?\", doc_builder=doc_builder)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Example 2: Parent Child Retriever\n", + "In the above example, we are storing the full document in the parent document. You can also still chunk the document into chunks that are large enough to retain context, but split the chunk into many small passages and store them in the parent chunk. This allows you to retrieve the parent chunk, while the passage embeddings, which link back to the parent chunk, can be very precise.\n", + "\n", + "Below we are using the same parent_child_splitter, but we are specifying the `parent_chunk_size` to be 2000 characters. This means that the parent chunk will be up to 2000 characters long, and the passages will be up to 200 characters long.\n", + "\n", + "You can see from the response we have now stored 32 documents in our index, representing the 15 documents from our dataset." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 64, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Indexed 32 documents with [] errors\n" + ] + } + ], + "source": [ + "# delete documents in the index\n", + "client.delete_by_query(index=INDEX_NAME, query={\"match_all\": {}})\n", + "\n", + "chunked_docs = parent_child_splitter(loader.load(), parent_chunk_size=2000, child_chunk_size=200)\n", + "\n", + "count, errors = helpers.bulk(\n", + " client, \n", + " chunked_docs,\n", + " index=INDEX_NAME\n", + ")\n", + "\n", + "print(f\"Indexed {count} documents with {errors} errors\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Retrieving the parent Chunks\n", + "We can perform a normal nested dense vector query to retrieve the parent chunks. We can see that the parent chunks are returned, but the passages are not." + ] + }, + { + "cell_type": "code", + "execution_count": 67, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "ID: FfltP4wBeCQuLJUsARJC\n", + "Doc Title: Work From Home Policy\n", + "parent text:\n", + "Effective: March 2020\n", + "Purpose\n", + "\n", + "The purpose of this full-time work-from-home policy is to provide guidelines and support for employees to conduct their work remotely, ensuring the continuity and productivity of business operations during the COVID-19 pandemic and beyond.\n", + "Scope\n", + "\n", + "This policy applies to all employees who are eligible for remote work as determined by their role and responsibilities. 
It is designed to allow employees to work from home full time while maintaining the same level of performance and collaboration as they would in the office.\n", + "Eligibility\n", + "\n", + "Employees who can perform their work duties remotely and have received approval from their direct supervisor and the HR department are eligible for this work-from-home arrangement.\n", + "Equipment and Resources\n", + "\n", + "The necessary equipment and resources will be provided to employees for remote work, including a company-issued laptop, software licenses, and access to secure communication tools. Employees are responsible for maintaining and protecting the company's equipment and data.\n", + "Workspace\n", + "\n", + "Employees working from home are responsible for creating a comfortable and safe workspace that is conducive to productivity. This includes ensuring that their home office is ergonomically designed, well-lit, and free from distractions.\n", + "Communication\n", + "\n", + "Effective communication is vital for successful remote work. Employees are expected to maintain regular communication with their supervisors, colleagues, and team members through email, phone calls, video conferences, and other approved communication tools.\n", + "Work Hours and Availability\n", + "\n", + "Employees are expected to maintain their regular work hours and be available during normal business hours, unless otherwise agreed upon with their supervisor. 
Any changes to work hours or availability must be communicated to the employee's supervisor and the HR department.\n", + "Performance Expectations\n", + "Passage Text:\n", + "The purpose of this full-time work-from-home policy is to provide guidelines and support for employees to conduct their work remotely, ensuring the continuity and productivity of business operations\n", + "\n", + "\n", + "Score: 0.8779937\n", + "\n", + "---\n", + "\n", + "ID: FvltP4wBeCQuLJUsARJC\n", + "Doc Title: Work From Home Policy\n", + "parent text:\n", + "Employees working from home are expected to maintain the same level of performance and productivity as if they were working in the office. Supervisors and team members will collaborate to establish clear expectations and goals for remote work.\n", + "Time Tracking and Overtime\n", + "\n", + "Employees are required to accurately track their work hours using the company's time tracking system. Non-exempt employees must obtain approval from their supervisor before working overtime.\n", + "Confidentiality and Data Security\n", + "\n", + "Employees must adhere to the company's confidentiality and data security policies while working from home. This includes safeguarding sensitive information, securing personal devices and internet connections, and reporting any security breaches to the IT department.\n", + "Health and Well-being\n", + "\n", + "The company encourages employees to prioritize their health and well-being while working from home. 
This includes taking regular breaks, maintaining a work-life balance, and seeking support from supervisors and colleagues when needed.\n", + "Policy Review and Updates\n", + "\n", + "This work-from-home policy will be reviewed periodically and updated as necessary, taking into account changes in public health guidance, business needs, and employee feedback.\n", + "Questions and Concerns\n", + "\n", + "Employees are encouraged to direct any questions or concerns about this policy to their supervisor or the HR department.\n", + "Passage Text:\n", + "policies while working from home. This includes safeguarding sensitive information, securing personal devices and internet connections, and reporting any security breaches to the IT department.\n", + "\n", + "\n", + "Score: 0.8585499\n", + "\n", + "---\n", + "\n", + "ID: GPltP4wBeCQuLJUsARJC\n", + "Doc Title: Wfh Policy Update May 2023\n", + "parent text:\n", + "As we continue to prioritize the well-being of our employees, we are making a slight adjustment to our hybrid work policy. Starting May 1, 2023, employees will be required to work from the office three days a week, with two days designated for remote work. Please communicate with your supervisor and HR department to establish your updated in-office workdays.\n", + "Passage Text:\n", + "adjustment to our hybrid work policy. Starting May 1, 2023, employees will be required to work from the office three days a week, with two days designated for remote work. Please communicate with\n", + "\n", + "\n", + "Score: 0.7466323\n", + "\n", + "---\n", + "\n", + "ID: IvltP4wBeCQuLJUsARJC\n", + "Doc Title: Intellectual Property Policy\n", + "parent text:\n", + "Purpose\n", + "The purpose of this Intellectual Property Policy is to establish guidelines and procedures for the ownership, protection, and utilization of intellectual property generated by employees during their employment. 
This policy aims to encourage creativity and innovation while ensuring that the interests of both the company and its employees are protected.\n", + "\n", + "Scope\n", + "This policy applies to all employees, including full-time, part-time, temporary, and contract employees.\n", + "\n", + "Definitions\n", + "a. Intellectual Property (IP): Refers to creations of the mind, such as inventions, literary and artistic works, designs, symbols, and images, that are protected by copyright, trademark, patent, or other forms of legal protection.\n", + "b. Company Time: Refers to the time during which an employee is actively engaged in performing their job duties.\n", + "c. Outside Company Time: Refers to the time during which an employee is not engaged in performing their job duties.\n", + "Passage Text:\n", + "property generated by employees during their employment. This policy aims to encourage creativity and innovation while ensuring that the interests of both the company and its employees are protected.\n", + "\n", + "\n", + "Score: 0.74160063\n", + "\n", + "---\n", + "\n", + "ID: G_ltP4wBeCQuLJUsARJC\n", + "Doc Title: Company Vacation Policy\n", + "parent text:\n", + "Purpose\n", + "\n", + "The purpose of this vacation policy is to outline the guidelines and procedures for requesting and taking time off from work for personal and leisure purposes. This policy aims to promote a healthy work-life balance and encourage employees to take time to rest and recharge.\n", + "Scope\n", + "\n", + "This policy applies to all full-time and part-time employees who have completed their probationary period.\n", + "Vacation Accrual\n", + "\n", + "Full-time employees accrue vacation time at a rate of [X hours] per month, equivalent to [Y days] per year. 
Part-time employees accrue vacation time on a pro-rata basis, calculated according to their scheduled work hours.\n", + "\n", + "Vacation time will begin to accrue from the first day of employment, but employees are eligible to take vacation time only after completing their probationary period. Unused vacation time will be carried over to the next year, up to a maximum of [Z days]. Any additional unused vacation time will be forfeited.\n", + "Vacation Scheduling\n", + "\n", + "Employees are required to submit vacation requests to their supervisor at least [A weeks] in advance, specifying the start and end dates of their vacation. Supervisors will review and approve vacation requests based on business needs, ensuring adequate coverage during the employee's absence.\n", + "\n", + "Employees are encouraged to plan their vacations around the company's peak and non-peak periods to minimize disruptions. Vacation requests during peak periods may be subject to limitations and require additional advance notice.\n", + "Vacation Pay\n", + "\n", + "Employees will receive their regular pay during their approved vacation time. Vacation pay will be calculated based on the employee's average earnings over the [B weeks] preceding their vacation.\n", + "Unplanned Absences and Vacation Time\n", + "Passage Text:\n", + "of this vacation policy is to outline the guidelines and procedures for requesting and taking time off from work for personal and leisure purposes. 
This policy aims to promote a healthy work-life\n", + "\n", + "\n", + "Score: 0.7381881\n", + "\n", + "---\n" + ] + } + ], + "source": [ + "response = client.search(\n", + " index=INDEX_NAME,\n", + " source_includes=[\"content\", \"metadata\"],\n", + " knn={\n", + " \"inner_hits\": {\n", + " \"_source\": False,\n", + " \"fields\": [\n", + " \"passages.text\"\n", + " ]\n", + " },\n", + " \"field\": \"passages.vector.predicted_value\",\n", + " \"k\": 5,\n", + " \"num_candidates\": 100,\n", + " \"query_vector_builder\": {\n", + " \"text_embedding\": {\n", + " \"model_id\": \"sentence-transformers__all-minilm-l6-v2\",\n", + " \"model_text\": \"Whats the work from home policy?\"\n", + " }\n", + " }\n", + " }\n", + ")\n", + "\n", + "pretty_response(response, show_parent_text=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### With Langchain\n", + "You can also use Langchain to retrieve the passages from the parent chunks. In combination with the nested query search configured in the ElasticsearchStore strategy, we retrieve the parent chunks that are relevant to one or more chunked passages. " + ] + }, + { + "cell_type": "code", + "execution_count": 68, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[Document(page_content=\"Effective: March 2020\\nPurpose\\n\\nThe purpose of this full-time work-from-home policy is to provide guidelines and support for employees to conduct their work remotely, ensuring the continuity and productivity of business operations during the COVID-19 pandemic and beyond.\\nScope\\n\\nThis policy applies to all employees who are eligible for remote work as determined by their role and responsibilities. 
It is designed to allow employees to work from home full time while maintaining the same level of performance and collaboration as they would in the office.\\nEligibility\\n\\nEmployees who can perform their work duties remotely and have received approval from their direct supervisor and the HR department are eligible for this work-from-home arrangement.\\nEquipment and Resources\\n\\nThe necessary equipment and resources will be provided to employees for remote work, including a company-issued laptop, software licenses, and access to secure communication tools. Employees are responsible for maintaining and protecting the company's equipment and data.\\nWorkspace\\n\\nEmployees working from home are responsible for creating a comfortable and safe workspace that is conducive to productivity. This includes ensuring that their home office is ergonomically designed, well-lit, and free from distractions.\\nCommunication\\n\\nEffective communication is vital for successful remote work. Employees are expected to maintain regular communication with their supervisors, colleagues, and team members through email, phone calls, video conferences, and other approved communication tools.\\nWork Hours and Availability\\n\\nEmployees are expected to maintain their regular work hours and be available during normal business hours, unless otherwise agreed upon with their supervisor. Any changes to work hours or availability must be communicated to the employee's supervisor and the HR department.\\nPerformance Expectations\\n\\nEmployees working from home are expected to maintain the same level of performance and productivity as if they were working in the office. Supervisors and team members will collaborate to establish clear expectations and goals for remote work.\\nTime Tracking and Overtime\\n\\nEmployees are required to accurately track their work hours using the company's time tracking system. 
Non-exempt employees must obtain approval from their supervisor before working overtime.\\nConfidentiality and Data Security\\n\\nEmployees must adhere to the company's confidentiality and data security policies while working from home. This includes safeguarding sensitive information, securing personal devices and internet connections, and reporting any security breaches to the IT department.\\nHealth and Well-being\\n\\nThe company encourages employees to prioritize their health and well-being while working from home. This includes taking regular breaks, maintaining a work-life balance, and seeking support from supervisors and colleagues when needed.\\nPolicy Review and Updates\\n\\nThis work-from-home policy will be reviewed periodically and updated as necessary, taking into account changes in public health guidance, business needs, and employee feedback.\\nQuestions and Concerns\\n\\nEmployees are encouraged to direct any questions or concerns about this policy to their supervisor or the HR department.\", metadata={'summary': 'This policy outlines the guidelines for full-time remote work, including eligibility, equipment and resources, workspace requirements, communication expectations, performance expectations, time tracking and overtime, confidentiality and data security, health and well-being, and policy reviews and updates. Employees are encouraged to direct any questions or concerns', 'updated_at': '2020-03-01', 'name': 'Work From Home Policy', 'source': '/Users/joe/projects/elastic/elasticsearch-labs/notebooks/langchain/temp.json', 'category': 'teams', 'seq_num': 1, 'url': './sharepoint/Work from home policy.txt'}),\n", + " Document(page_content=\"Purpose\\nThe purpose of this Intellectual Property Policy is to establish guidelines and procedures for the ownership, protection, and utilization of intellectual property generated by employees during their employment. 
This policy aims to encourage creativity and innovation while ensuring that the interests of both the company and its employees are protected.\\n\\nScope\\nThis policy applies to all employees, including full-time, part-time, temporary, and contract employees.\\n\\nDefinitions\\na. Intellectual Property (IP): Refers to creations of the mind, such as inventions, literary and artistic works, designs, symbols, and images, that are protected by copyright, trademark, patent, or other forms of legal protection.\\nb. Company Time: Refers to the time during which an employee is actively engaged in performing their job duties.\\nc. Outside Company Time: Refers to the time during which an employee is not engaged in performing their job duties.\\n\\nOwnership of Intellectual Property\\na. Work Generated on Company Time\\ni. Any intellectual property created, conceived, or developed by an employee during company time or using company resources, equipment, or facilities shall be considered the property of the Company.\\nii. Employees are required to promptly disclose any such intellectual property to their supervisor or the appropriate department head.\\nb. Work Generated Outside Company Time\\ni. Intellectual property created, conceived, or developed by an employee outside of company time and without the use of company resources, equipment, or facilities shall generally remain the property of the employee.\\nii. However, if the intellectual property is directly related to the employee's job responsibilities, or if the employee has used company resources, equipment, or facilities in its creation, it may be considered the property of the Company.\\nProtection and Utilization of Intellectual Property\\na. The Company shall have the right to protect, license, and commercialize any intellectual property owned by the company as it deems appropriate.\\nb. 
Employees are expected to cooperate with the Company in obtaining any necessary legal protection for intellectual property owned by the company, including by signing any documents or providing any necessary information or assistance.\\nConfidentiality\\nEmployees are expected to maintain the confidentiality of any intellectual property owned by the Company and not disclose it to any third parties without the express written consent of an authorized representative of the company.\\nEmployee Acknowledgment\\nAll employees are required to sign an acknowledgment of this Intellectual Property Policy as a condition of their employment with [Company Name]. By signing the acknowledgment, employees agree to abide by the terms of this policy and understand that any violations may result in disciplinary action, up to and including termination of employment.\\nPolicy Review\\nThis Intellectual Property Policy shall be reviewed periodically and may be amended as necessary to ensure its continued effectiveness and compliance with applicable laws and regulations. Employees will be notified of any significant changes to this policy.\", metadata={'summary': \"This Intellectual Property Policy outlines guidelines and procedures for the ownership, protection, and utilization of intellectual property generated by employees during their employment. It establishes the company's ownership of work generated on company time, while recognizing employee ownership of work generated outside of company time without the use of company resources. 
The policy\", 'updated_at': None, 'name': 'Intellectual Property Policy', 'source': '/Users/joe/projects/elastic/elasticsearch-labs/notebooks/langchain/temp.json', 'category': 'sharepoint', 'seq_num': 8, 'url': 'https://enterprisesearch.sharepoint.com/:t:/s/MSBuilddemo/EWz3cYEVdzBNsiHsYbKhms4BVYGhravyrUw3T3lzxL4pTg?e=mPIgbO'}),\n", + " Document(page_content=\"Purpose\\n\\nThe purpose of this vacation policy is to outline the guidelines and procedures for requesting and taking time off from work for personal and leisure purposes. This policy aims to promote a healthy work-life balance and encourage employees to take time to rest and recharge.\\nScope\\n\\nThis policy applies to all full-time and part-time employees who have completed their probationary period.\\nVacation Accrual\\n\\nFull-time employees accrue vacation time at a rate of [X hours] per month, equivalent to [Y days] per year. Part-time employees accrue vacation time on a pro-rata basis, calculated according to their scheduled work hours.\\n\\nVacation time will begin to accrue from the first day of employment, but employees are eligible to take vacation time only after completing their probationary period. Unused vacation time will be carried over to the next year, up to a maximum of [Z days]. Any additional unused vacation time will be forfeited.\\nVacation Scheduling\\n\\nEmployees are required to submit vacation requests to their supervisor at least [A weeks] in advance, specifying the start and end dates of their vacation. Supervisors will review and approve vacation requests based on business needs, ensuring adequate coverage during the employee's absence.\\n\\nEmployees are encouraged to plan their vacations around the company's peak and non-peak periods to minimize disruptions. Vacation requests during peak periods may be subject to limitations and require additional advance notice.\\nVacation Pay\\n\\nEmployees will receive their regular pay during their approved vacation time. 
Vacation pay will be calculated based on the employee's average earnings over the [B weeks] preceding their vacation.\\nUnplanned Absences and Vacation Time\\n\\nIn the event of an unplanned absence due to illness or personal emergencies, employees may use their accrued vacation time, subject to supervisor approval. Employees must inform their supervisor as soon as possible and provide any required documentation upon their return to work.\\nVacation Time and Termination of Employment\\n\\nIf an employee's employment is terminated, they will be paid out for any unused vacation time, calculated based on their current rate of pay.\\nPolicy Review and Updates\\n\\nThis vacation policy will be reviewed periodically and updated as necessary, taking into account changes in labor laws, business needs, and employee feedback.\\nQuestions and Concerns\\n\\nEmployees are encouraged to direct any questions or concerns about this policy to their supervisor or the HR department.\", metadata={'summary': ': This policy outlines the guidelines and procedures for requesting and taking time off from work for personal and leisure purposes. Full-time employees accrue vacation time at a rate of [X hours] per month, equivalent to [Y days] per year. Vacation requests must be submitted to supervisors at least', 'updated_at': '2018-04-16', 'name': 'Company Vacation Policy', 'source': '/Users/joe/projects/elastic/elasticsearch-labs/notebooks/langchain/temp.json', 'category': 'sharepoint', 'seq_num': 5, 'url': 'https://enterprisesearch.sharepoint.com/:t:/s/MSBuilddemo/ES6rw9bKZxVBobG1WUoJpikBF9Bhx1pw_GvJWbsg-Z_HNA?e=faSHVt'}),\n", + " Document(page_content='As we continue to prioritize the well-being of our employees, we are making a slight adjustment to our hybrid work policy. Starting May 1, 2023, employees will be required to work from the office three days a week, with two days designated for remote work. 
Please communicate with your supervisor and HR department to establish your updated in-office workdays.', metadata={'summary': 'Starting May 1, 2023, our hybrid work policy will require employees to work from the office three days a week and two days remotely.', 'updated_at': '2023-05-01', 'name': 'Wfh Policy Update May 2023', 'source': '/Users/joe/projects/elastic/elasticsearch-labs/notebooks/langchain/temp.json', 'category': 'teams', 'seq_num': 3, 'url': './sharepoint/WFH policy update May 2023.txt'})]" + ] + }, + "execution_count": 68, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "vector_store.similarity_search(query=\"Whats the work from home policy?\")" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "ObjectApiResponse({'acknowledged': True})" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "client.indices.delete(index=INDEX_NAME)" ] } ], From 36100f50f451d825cd0576df0ccf1ffd7c4ac64e Mon Sep 17 00:00:00 2001 From: Joseph McElroy Date: Wed, 6 Dec 2023 16:37:37 +0000 Subject: [PATCH 5/7] updates --- .../langchain-parent-retriever.ipynb | 86 ++++++++++++------- 1 file changed, 55 insertions(+), 31 deletions(-) diff --git a/notebooks/langchain/langchain-parent-retriever.ipynb b/notebooks/langchain/langchain-parent-retriever.ipynb index a18f3ced..eb7e3cde 100644 --- a/notebooks/langchain/langchain-parent-retriever.ipynb +++ b/notebooks/langchain/langchain-parent-retriever.ipynb @@ -326,7 +326,7 @@ }, { "cell_type": "code", - "execution_count": 45, + "execution_count": 71, "metadata": {}, "outputs": [ { @@ -340,7 +340,7 @@ "source": [ "from elasticsearch import helpers\n", "\n", - "chunked_docs = parent_child_splitter(loader.load(), parent_chunk_size=None)\n", + "chunked_docs = parent_child_splitter(loader.load(), parent_chunk_size=None, child_chunk_size=600)\n", "\n", "count, errors = 
helpers.bulk(\n", " client, \n", @@ -361,7 +361,7 @@ }, { "cell_type": "code", - "execution_count": 47, + "execution_count": 72, "metadata": {}, "outputs": [ { @@ -369,8 +369,10 @@ "output_type": "stream", "text": [ "\n", - "ID: AvgyPowBeCQuLJUsS_Tv\n", + "ID: 2vn8P4wBeCQuLJUsMR_I\n", "Doc Title: Work From Home Policy\n", + "parent text:\n", + "\n", "Passage Text:\n", "Effective: March 2020\n", "Purpose\n", @@ -378,37 +380,52 @@ "The purpose of this full-time work-from-home policy is to provide guidelines and support for employees to conduct their work remotely, ensuring the continuity and productivity of business operations during the COVID-19 pandemic and beyond.\n", "Scope\n", "\n", + "This policy applies to all employees who are eligible for remote work as determined by their role and responsibilities. It is designed to allow employees to work from home full time while maintaining the same level of performance and collaboration as they would in the office.\n", + "Eligibility\n", + "\n", "\n", - "Score: 0.84295774\n", + "Score: 0.8483097\n", "\n", "---\n", "\n", - "ID: CfgyPowBeCQuLJUsS_Tv\n", + "ID: 4fn8P4wBeCQuLJUsMR_I\n", "Doc Title: Intellectual Property Policy\n", + "parent text:\n", + "\n", "Passage Text:\n", + "Purpose\n", + "The purpose of this Intellectual Property Policy is to establish guidelines and procedures for the ownership, protection, and utilization of intellectual property generated by employees during their employment. 
This policy aims to encourage creativity and innovation while ensuring that the interests of both the company and its employees are protected.\n", + "\n", "Scope\n", "This policy applies to all employees, including full-time, part-time, temporary, and contract employees.\n", "\n", "\n", - "Score: 0.7304177\n", + "Score: 0.7292882\n", "\n", "---\n", "\n", - "ID: BvgyPowBeCQuLJUsS_Tv\n", + "ID: 3vn8P4wBeCQuLJUsMR_I\n", "Doc Title: Company Vacation Policy\n", + "parent text:\n", + "\n", "Passage Text:\n", "Purpose\n", "\n", "The purpose of this vacation policy is to outline the guidelines and procedures for requesting and taking time off from work for personal and leisure purposes. This policy aims to promote a healthy work-life balance and encourage employees to take time to rest and recharge.\n", "Scope\n", "\n", + "This policy applies to all full-time and part-time employees who have completed their probationary period.\n", + "Vacation Accrual\n", + "\n", "\n", - "Score: 0.71928245\n", + "Score: 0.7137784\n", "\n", "---\n", "\n", - "ID: BPgyPowBeCQuLJUsS_Tv\n", + "ID: 3Pn8P4wBeCQuLJUsMR_I\n", "Doc Title: Wfh Policy Update May 2023\n", + "parent text:\n", + "\n", "Passage Text:\n", "As we continue to prioritize the well-being of our employees, we are making a slight adjustment to our hybrid work policy. Starting May 1, 2023, employees will be required to work from the office three days a week, with two days designated for remote work. Please communicate with your supervisor and HR department to establish your updated in-office workdays.\n", "\n", @@ -417,15 +434,19 @@ "\n", "---\n", "\n", - "ID: EPgyPowBeCQuLJUsS_Tv\n", + "ID: 6Pn8P4wBeCQuLJUsMR_I\n", "Doc Title: New Employee Onboarding Guide\n", + "parent text:\n", + "\n", "Passage Text:\n", + "Review benefits options: Carefully review the benefits package and choose the options that best meet your needs.\n", + "Complete enrollment forms: Fill out the necessary forms to enroll in your chosen benefits. 
Submit these forms to the HR department within 30 days of your start date.\n", "Designate beneficiaries: If applicable, designate beneficiaries for your life insurance and retirement plans.\n", "Getting Settled in Your Workspace\n", "To help you feel comfortable and productive in your new workspace, take the following steps:\n", "\n", "\n", - "Score: 0.69544923\n", + "Score: 0.6890813\n", "\n", "---\n" ] @@ -468,19 +489,19 @@ }, { "cell_type": "code", - "execution_count": 62, + "execution_count": 73, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "[Document(page_content='Effective: March 2020\\nPurpose\\n\\nThe purpose of this full-time work-from-home policy is to provide guidelines and support for employees to conduct their work remotely, ensuring the continuity and productivity of business operations during the COVID-19 pandemic and beyond.\\nScope\\n\\n', metadata={'summary': 'This policy outlines the guidelines for full-time remote work, including eligibility, equipment and resources, workspace requirements, communication expectations, performance expectations, time tracking and overtime, confidentiality and data security, health and well-being, and policy reviews and updates. Employees are encouraged to direct any questions or concerns', 'updated_at': '2020-03-01', 'name': 'Work From Home Policy', 'source': '/Users/joe/projects/elastic/elasticsearch-labs/notebooks/langchain/temp.json', 'category': 'teams', 'seq_num': 1, 'url': './sharepoint/Work from home policy.txt'}),\n", - " Document(page_content='Scope\\nThis policy applies to all employees, including full-time, part-time, temporary, and contract employees.\\n\\n', metadata={'summary': \"This Intellectual Property Policy outlines guidelines and procedures for the ownership, protection, and utilization of intellectual property generated by employees during their employment. 
It establishes the company's ownership of work generated on company time, while recognizing employee ownership of work generated outside of company time without the use of company resources. The policy\", 'updated_at': None, 'name': 'Intellectual Property Policy', 'source': '/Users/joe/projects/elastic/elasticsearch-labs/notebooks/langchain/temp.json', 'category': 'sharepoint', 'seq_num': 8, 'url': 'https://enterprisesearch.sharepoint.com/:t:/s/MSBuilddemo/EWz3cYEVdzBNsiHsYbKhms4BVYGhravyrUw3T3lzxL4pTg?e=mPIgbO'}),\n", - " Document(page_content='Purpose\\n\\nThe purpose of this vacation policy is to outline the guidelines and procedures for requesting and taking time off from work for personal and leisure purposes. This policy aims to promote a healthy work-life balance and encourage employees to take time to rest and recharge.\\nScope\\n\\n', metadata={'summary': ': This policy outlines the guidelines and procedures for requesting and taking time off from work for personal and leisure purposes. Full-time employees accrue vacation time at a rate of [X hours] per month, equivalent to [Y days] per year. Vacation requests must be submitted to supervisors at least', 'updated_at': '2018-04-16', 'name': 'Company Vacation Policy', 'source': '/Users/joe/projects/elastic/elasticsearch-labs/notebooks/langchain/temp.json', 'category': 'sharepoint', 'seq_num': 5, 'url': 'https://enterprisesearch.sharepoint.com/:t:/s/MSBuilddemo/ES6rw9bKZxVBobG1WUoJpikBF9Bhx1pw_GvJWbsg-Z_HNA?e=faSHVt'}),\n", + "[Document(page_content='Effective: March 2020\\nPurpose\\n\\nThe purpose of this full-time work-from-home policy is to provide guidelines and support for employees to conduct their work remotely, ensuring the continuity and productivity of business operations during the COVID-19 pandemic and beyond.\\nScope\\n\\nThis policy applies to all employees who are eligible for remote work as determined by their role and responsibilities. 
It is designed to allow employees to work from home full time while maintaining the same level of performance and collaboration as they would in the office.\\nEligibility\\n\\n', metadata={'summary': 'This policy outlines the guidelines for full-time remote work, including eligibility, equipment and resources, workspace requirements, communication expectations, performance expectations, time tracking and overtime, confidentiality and data security, health and well-being, and policy reviews and updates. Employees are encouraged to direct any questions or concerns', 'updated_at': '2020-03-01', 'name': 'Work From Home Policy', 'source': '/Users/joe/projects/elastic/elasticsearch-labs/notebooks/langchain/temp.json', 'category': 'teams', 'seq_num': 1, 'url': './sharepoint/Work from home policy.txt'}),\n", + " Document(page_content='Purpose\\nThe purpose of this Intellectual Property Policy is to establish guidelines and procedures for the ownership, protection, and utilization of intellectual property generated by employees during their employment. This policy aims to encourage creativity and innovation while ensuring that the interests of both the company and its employees are protected.\\n\\nScope\\nThis policy applies to all employees, including full-time, part-time, temporary, and contract employees.\\n\\n', metadata={'summary': \"This Intellectual Property Policy outlines guidelines and procedures for the ownership, protection, and utilization of intellectual property generated by employees during their employment. It establishes the company's ownership of work generated on company time, while recognizing employee ownership of work generated outside of company time without the use of company resources. 
The policy\", 'updated_at': None, 'name': 'Intellectual Property Policy', 'source': '/Users/joe/projects/elastic/elasticsearch-labs/notebooks/langchain/temp.json', 'category': 'sharepoint', 'seq_num': 8, 'url': 'https://enterprisesearch.sharepoint.com/:t:/s/MSBuilddemo/EWz3cYEVdzBNsiHsYbKhms4BVYGhravyrUw3T3lzxL4pTg?e=mPIgbO'}),\n", + " Document(page_content='Purpose\\n\\nThe purpose of this vacation policy is to outline the guidelines and procedures for requesting and taking time off from work for personal and leisure purposes. This policy aims to promote a healthy work-life balance and encourage employees to take time to rest and recharge.\\nScope\\n\\nThis policy applies to all full-time and part-time employees who have completed their probationary period.\\nVacation Accrual\\n\\n', metadata={'summary': ': This policy outlines the guidelines and procedures for requesting and taking time off from work for personal and leisure purposes. Full-time employees accrue vacation time at a rate of [X hours] per month, equivalent to [Y days] per year. Vacation requests must be submitted to supervisors at least', 'updated_at': '2018-04-16', 'name': 'Company Vacation Policy', 'source': '/Users/joe/projects/elastic/elasticsearch-labs/notebooks/langchain/temp.json', 'category': 'sharepoint', 'seq_num': 5, 'url': 'https://enterprisesearch.sharepoint.com/:t:/s/MSBuilddemo/ES6rw9bKZxVBobG1WUoJpikBF9Bhx1pw_GvJWbsg-Z_HNA?e=faSHVt'}),\n", " Document(page_content='As we continue to prioritize the well-being of our employees, we are making a slight adjustment to our hybrid work policy. Starting May 1, 2023, employees will be required to work from the office three days a week, with two days designated for remote work. 
Please communicate with your supervisor and HR department to establish your updated in-office workdays.\\n\\n', metadata={'summary': 'Starting May 1, 2023, our hybrid work policy will require employees to work from the office three days a week and two days remotely.', 'updated_at': '2023-05-01', 'name': 'Wfh Policy Update May 2023', 'source': '/Users/joe/projects/elastic/elasticsearch-labs/notebooks/langchain/temp.json', 'category': 'teams', 'seq_num': 3, 'url': './sharepoint/WFH policy update May 2023.txt'})]" ] }, - "execution_count": 62, + "execution_count": 73, "metadata": {}, "output_type": "execute_result" } @@ -559,7 +580,7 @@ }, { "cell_type": "code", - "execution_count": 64, + "execution_count": 74, "metadata": {}, "outputs": [ { @@ -574,6 +595,7 @@ "# delete documents in the index\n", "client.delete_by_query(index=INDEX_NAME, query={\"match_all\": {}})\n", "\n", + "# Index the documents, this time with parent-child splitting\n", "chunked_docs = parent_child_splitter(loader.load(), parent_chunk_size=2000, child_chunk_size=200)\n", "\n", "count, errors = helpers.bulk(\n", @@ -590,12 +612,14 @@ "metadata": {}, "source": [ "### Retrieving the parent Chunks\n", - "We can perform a normal nested dense vector query to retrieve the parent chunks. We can see that the parent chunks are returned, but the passages are not." + "We can perform a normal nested dense vector query to retrieve the parent chunks. We can see that the parent chunks are returned, but the passages are not.\n", + "\n", + "In this example we can see the content is much larger, giving us more context and the embeddings remain as precise as the previous example." 
] }, { "cell_type": "code", - "execution_count": 67, + "execution_count": 75, "metadata": {}, "outputs": [ { @@ -603,7 +627,7 @@ "output_type": "stream", "text": [ "\n", - "ID: FfltP4wBeCQuLJUsARJC\n", + "ID: 7_n8P4wBeCQuLJUsox-g\n", "Doc Title: Work From Home Policy\n", "parent text:\n", "Effective: March 2020\n", @@ -637,7 +661,7 @@ "\n", "---\n", "\n", - "ID: FvltP4wBeCQuLJUsARJC\n", + "ID: 8Pn8P4wBeCQuLJUsox-g\n", "Doc Title: Work From Home Policy\n", "parent text:\n", "Employees working from home are expected to maintain the same level of performance and productivity as if they were working in the office. Supervisors and team members will collaborate to establish clear expectations and goals for remote work.\n", @@ -664,7 +688,7 @@ "\n", "---\n", "\n", - "ID: GPltP4wBeCQuLJUsARJC\n", + "ID: 8vn8P4wBeCQuLJUsox-g\n", "Doc Title: Wfh Policy Update May 2023\n", "parent text:\n", "As we continue to prioritize the well-being of our employees, we are making a slight adjustment to our hybrid work policy. Starting May 1, 2023, employees will be required to work from the office three days a week, with two days designated for remote work. 
Please communicate with your supervisor and HR department to establish your updated in-office workdays.\n", @@ -676,7 +700,7 @@ "\n", "---\n", "\n", - "ID: IvltP4wBeCQuLJUsARJC\n", + "ID: _Pn8P4wBeCQuLJUsox-g\n", "Doc Title: Intellectual Property Policy\n", "parent text:\n", "Purpose\n", @@ -697,7 +721,7 @@ "\n", "---\n", "\n", - "ID: G_ltP4wBeCQuLJUsARJC\n", + "ID: 9fn8P4wBeCQuLJUsox-g\n", "Doc Title: Company Vacation Policy\n", "parent text:\n", "Purpose\n", @@ -766,19 +790,19 @@ }, { "cell_type": "code", - "execution_count": 68, + "execution_count": 76, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "[Document(page_content=\"Effective: March 2020\\nPurpose\\n\\nThe purpose of this full-time work-from-home policy is to provide guidelines and support for employees to conduct their work remotely, ensuring the continuity and productivity of business operations during the COVID-19 pandemic and beyond.\\nScope\\n\\nThis policy applies to all employees who are eligible for remote work as determined by their role and responsibilities. It is designed to allow employees to work from home full time while maintaining the same level of performance and collaboration as they would in the office.\\nEligibility\\n\\nEmployees who can perform their work duties remotely and have received approval from their direct supervisor and the HR department are eligible for this work-from-home arrangement.\\nEquipment and Resources\\n\\nThe necessary equipment and resources will be provided to employees for remote work, including a company-issued laptop, software licenses, and access to secure communication tools. Employees are responsible for maintaining and protecting the company's equipment and data.\\nWorkspace\\n\\nEmployees working from home are responsible for creating a comfortable and safe workspace that is conducive to productivity. 
This includes ensuring that their home office is ergonomically designed, well-lit, and free from distractions.\\nCommunication\\n\\nEffective communication is vital for successful remote work. Employees are expected to maintain regular communication with their supervisors, colleagues, and team members through email, phone calls, video conferences, and other approved communication tools.\\nWork Hours and Availability\\n\\nEmployees are expected to maintain their regular work hours and be available during normal business hours, unless otherwise agreed upon with their supervisor. Any changes to work hours or availability must be communicated to the employee's supervisor and the HR department.\\nPerformance Expectations\\n\\nEmployees working from home are expected to maintain the same level of performance and productivity as if they were working in the office. Supervisors and team members will collaborate to establish clear expectations and goals for remote work.\\nTime Tracking and Overtime\\n\\nEmployees are required to accurately track their work hours using the company's time tracking system. Non-exempt employees must obtain approval from their supervisor before working overtime.\\nConfidentiality and Data Security\\n\\nEmployees must adhere to the company's confidentiality and data security policies while working from home. This includes safeguarding sensitive information, securing personal devices and internet connections, and reporting any security breaches to the IT department.\\nHealth and Well-being\\n\\nThe company encourages employees to prioritize their health and well-being while working from home. 
This includes taking regular breaks, maintaining a work-life balance, and seeking support from supervisors and colleagues when needed.\\nPolicy Review and Updates\\n\\nThis work-from-home policy will be reviewed periodically and updated as necessary, taking into account changes in public health guidance, business needs, and employee feedback.\\nQuestions and Concerns\\n\\nEmployees are encouraged to direct any questions or concerns about this policy to their supervisor or the HR department.\", metadata={'summary': 'This policy outlines the guidelines for full-time remote work, including eligibility, equipment and resources, workspace requirements, communication expectations, performance expectations, time tracking and overtime, confidentiality and data security, health and well-being, and policy reviews and updates. Employees are encouraged to direct any questions or concerns', 'updated_at': '2020-03-01', 'name': 'Work From Home Policy', 'source': '/Users/joe/projects/elastic/elasticsearch-labs/notebooks/langchain/temp.json', 'category': 'teams', 'seq_num': 1, 'url': './sharepoint/Work from home policy.txt'}),\n", - " Document(page_content=\"Purpose\\nThe purpose of this Intellectual Property Policy is to establish guidelines and procedures for the ownership, protection, and utilization of intellectual property generated by employees during their employment. This policy aims to encourage creativity and innovation while ensuring that the interests of both the company and its employees are protected.\\n\\nScope\\nThis policy applies to all employees, including full-time, part-time, temporary, and contract employees.\\n\\nDefinitions\\na. Intellectual Property (IP): Refers to creations of the mind, such as inventions, literary and artistic works, designs, symbols, and images, that are protected by copyright, trademark, patent, or other forms of legal protection.\\nb. 
Company Time: Refers to the time during which an employee is actively engaged in performing their job duties.\\nc. Outside Company Time: Refers to the time during which an employee is not engaged in performing their job duties.\\n\\nOwnership of Intellectual Property\\na. Work Generated on Company Time\\ni. Any intellectual property created, conceived, or developed by an employee during company time or using company resources, equipment, or facilities shall be considered the property of the Company.\\nii. Employees are required to promptly disclose any such intellectual property to their supervisor or the appropriate department head.\\nb. Work Generated Outside Company Time\\ni. Intellectual property created, conceived, or developed by an employee outside of company time and without the use of company resources, equipment, or facilities shall generally remain the property of the employee.\\nii. However, if the intellectual property is directly related to the employee's job responsibilities, or if the employee has used company resources, equipment, or facilities in its creation, it may be considered the property of the Company.\\nProtection and Utilization of Intellectual Property\\na. The Company shall have the right to protect, license, and commercialize any intellectual property owned by the company as it deems appropriate.\\nb. 
Employees are expected to cooperate with the Company in obtaining any necessary legal protection for intellectual property owned by the company, including by signing any documents or providing any necessary information or assistance.\\nConfidentiality\\nEmployees are expected to maintain the confidentiality of any intellectual property owned by the Company and not disclose it to any third parties without the express written consent of an authorized representative of the company.\\nEmployee Acknowledgment\\nAll employees are required to sign an acknowledgment of this Intellectual Property Policy as a condition of their employment with [Company Name]. By signing the acknowledgment, employees agree to abide by the terms of this policy and understand that any violations may result in disciplinary action, up to and including termination of employment.\\nPolicy Review\\nThis Intellectual Property Policy shall be reviewed periodically and may be amended as necessary to ensure its continued effectiveness and compliance with applicable laws and regulations. Employees will be notified of any significant changes to this policy.\", metadata={'summary': \"This Intellectual Property Policy outlines guidelines and procedures for the ownership, protection, and utilization of intellectual property generated by employees during their employment. It establishes the company's ownership of work generated on company time, while recognizing employee ownership of work generated outside of company time without the use of company resources. 
The policy\", 'updated_at': None, 'name': 'Intellectual Property Policy', 'source': '/Users/joe/projects/elastic/elasticsearch-labs/notebooks/langchain/temp.json', 'category': 'sharepoint', 'seq_num': 8, 'url': 'https://enterprisesearch.sharepoint.com/:t:/s/MSBuilddemo/EWz3cYEVdzBNsiHsYbKhms4BVYGhravyrUw3T3lzxL4pTg?e=mPIgbO'}),\n", - " Document(page_content=\"Purpose\\n\\nThe purpose of this vacation policy is to outline the guidelines and procedures for requesting and taking time off from work for personal and leisure purposes. This policy aims to promote a healthy work-life balance and encourage employees to take time to rest and recharge.\\nScope\\n\\nThis policy applies to all full-time and part-time employees who have completed their probationary period.\\nVacation Accrual\\n\\nFull-time employees accrue vacation time at a rate of [X hours] per month, equivalent to [Y days] per year. Part-time employees accrue vacation time on a pro-rata basis, calculated according to their scheduled work hours.\\n\\nVacation time will begin to accrue from the first day of employment, but employees are eligible to take vacation time only after completing their probationary period. Unused vacation time will be carried over to the next year, up to a maximum of [Z days]. Any additional unused vacation time will be forfeited.\\nVacation Scheduling\\n\\nEmployees are required to submit vacation requests to their supervisor at least [A weeks] in advance, specifying the start and end dates of their vacation. Supervisors will review and approve vacation requests based on business needs, ensuring adequate coverage during the employee's absence.\\n\\nEmployees are encouraged to plan their vacations around the company's peak and non-peak periods to minimize disruptions. Vacation requests during peak periods may be subject to limitations and require additional advance notice.\\nVacation Pay\\n\\nEmployees will receive their regular pay during their approved vacation time. 
Vacation pay will be calculated based on the employee's average earnings over the [B weeks] preceding their vacation.\\nUnplanned Absences and Vacation Time\\n\\nIn the event of an unplanned absence due to illness or personal emergencies, employees may use their accrued vacation time, subject to supervisor approval. Employees must inform their supervisor as soon as possible and provide any required documentation upon their return to work.\\nVacation Time and Termination of Employment\\n\\nIf an employee's employment is terminated, they will be paid out for any unused vacation time, calculated based on their current rate of pay.\\nPolicy Review and Updates\\n\\nThis vacation policy will be reviewed periodically and updated as necessary, taking into account changes in labor laws, business needs, and employee feedback.\\nQuestions and Concerns\\n\\nEmployees are encouraged to direct any questions or concerns about this policy to their supervisor or the HR department.\", metadata={'summary': ': This policy outlines the guidelines and procedures for requesting and taking time off from work for personal and leisure purposes. Full-time employees accrue vacation time at a rate of [X hours] per month, equivalent to [Y days] per year. Vacation requests must be submitted to supervisors at least', 'updated_at': '2018-04-16', 'name': 'Company Vacation Policy', 'source': '/Users/joe/projects/elastic/elasticsearch-labs/notebooks/langchain/temp.json', 'category': 'sharepoint', 'seq_num': 5, 'url': 'https://enterprisesearch.sharepoint.com/:t:/s/MSBuilddemo/ES6rw9bKZxVBobG1WUoJpikBF9Bhx1pw_GvJWbsg-Z_HNA?e=faSHVt'}),\n", - " Document(page_content='As we continue to prioritize the well-being of our employees, we are making a slight adjustment to our hybrid work policy. Starting May 1, 2023, employees will be required to work from the office three days a week, with two days designated for remote work. 
Please communicate with your supervisor and HR department to establish your updated in-office workdays.', metadata={'summary': 'Starting May 1, 2023, our hybrid work policy will require employees to work from the office three days a week and two days remotely.', 'updated_at': '2023-05-01', 'name': 'Wfh Policy Update May 2023', 'source': '/Users/joe/projects/elastic/elasticsearch-labs/notebooks/langchain/temp.json', 'category': 'teams', 'seq_num': 3, 'url': './sharepoint/WFH policy update May 2023.txt'})]" + "[Document(page_content=\"Effective: March 2020\\nPurpose\\n\\nThe purpose of this full-time work-from-home policy is to provide guidelines and support for employees to conduct their work remotely, ensuring the continuity and productivity of business operations during the COVID-19 pandemic and beyond.\\nScope\\n\\nThis policy applies to all employees who are eligible for remote work as determined by their role and responsibilities. It is designed to allow employees to work from home full time while maintaining the same level of performance and collaboration as they would in the office.\\nEligibility\\n\\nEmployees who can perform their work duties remotely and have received approval from their direct supervisor and the HR department are eligible for this work-from-home arrangement.\\nEquipment and Resources\\n\\nThe necessary equipment and resources will be provided to employees for remote work, including a company-issued laptop, software licenses, and access to secure communication tools. Employees are responsible for maintaining and protecting the company's equipment and data.\\nWorkspace\\n\\nEmployees working from home are responsible for creating a comfortable and safe workspace that is conducive to productivity. This includes ensuring that their home office is ergonomically designed, well-lit, and free from distractions.\\nCommunication\\n\\nEffective communication is vital for successful remote work. 
Employees are expected to maintain regular communication with their supervisors, colleagues, and team members through email, phone calls, video conferences, and other approved communication tools.\\nWork Hours and Availability\\n\\nEmployees are expected to maintain their regular work hours and be available during normal business hours, unless otherwise agreed upon with their supervisor. Any changes to work hours or availability must be communicated to the employee's supervisor and the HR department.\\nPerformance Expectations\", metadata={'summary': 'This policy outlines the guidelines for full-time remote work, including eligibility, equipment and resources, workspace requirements, communication expectations, performance expectations, time tracking and overtime, confidentiality and data security, health and well-being, and policy reviews and updates. Employees are encouraged to direct any questions or concerns', 'updated_at': '2020-03-01', 'name': 'Work From Home Policy', 'source': '/Users/joe/projects/elastic/elasticsearch-labs/notebooks/langchain/temp.json', 'category': 'teams', 'seq_num': 1, 'url': './sharepoint/Work from home policy.txt'}),\n", + " Document(page_content=\"Employees working from home are expected to maintain the same level of performance and productivity as if they were working in the office. Supervisors and team members will collaborate to establish clear expectations and goals for remote work.\\nTime Tracking and Overtime\\n\\nEmployees are required to accurately track their work hours using the company's time tracking system. Non-exempt employees must obtain approval from their supervisor before working overtime.\\nConfidentiality and Data Security\\n\\nEmployees must adhere to the company's confidentiality and data security policies while working from home. 
This includes safeguarding sensitive information, securing personal devices and internet connections, and reporting any security breaches to the IT department.\\nHealth and Well-being\\n\\nThe company encourages employees to prioritize their health and well-being while working from home. This includes taking regular breaks, maintaining a work-life balance, and seeking support from supervisors and colleagues when needed.\\nPolicy Review and Updates\\n\\nThis work-from-home policy will be reviewed periodically and updated as necessary, taking into account changes in public health guidance, business needs, and employee feedback.\\nQuestions and Concerns\\n\\nEmployees are encouraged to direct any questions or concerns about this policy to their supervisor or the HR department.\", metadata={'summary': 'This policy outlines the guidelines for full-time remote work, including eligibility, equipment and resources, workspace requirements, communication expectations, performance expectations, time tracking and overtime, confidentiality and data security, health and well-being, and policy reviews and updates. Employees are encouraged to direct any questions or concerns', 'updated_at': '2020-03-01', 'name': 'Work From Home Policy', 'source': '/Users/joe/projects/elastic/elasticsearch-labs/notebooks/langchain/temp.json', 'category': 'teams', 'seq_num': 1, 'url': './sharepoint/Work from home policy.txt'}),\n", + " Document(page_content='As we continue to prioritize the well-being of our employees, we are making a slight adjustment to our hybrid work policy. Starting May 1, 2023, employees will be required to work from the office three days a week, with two days designated for remote work. 
Please communicate with your supervisor and HR department to establish your updated in-office workdays.', metadata={'summary': 'Starting May 1, 2023, our hybrid work policy will require employees to work from the office three days a week and two days remotely.', 'updated_at': '2023-05-01', 'name': 'Wfh Policy Update May 2023', 'source': '/Users/joe/projects/elastic/elasticsearch-labs/notebooks/langchain/temp.json', 'category': 'teams', 'seq_num': 3, 'url': './sharepoint/WFH policy update May 2023.txt'}),\n", + " Document(page_content='Purpose\\nThe purpose of this Intellectual Property Policy is to establish guidelines and procedures for the ownership, protection, and utilization of intellectual property generated by employees during their employment. This policy aims to encourage creativity and innovation while ensuring that the interests of both the company and its employees are protected.\\n\\nScope\\nThis policy applies to all employees, including full-time, part-time, temporary, and contract employees.\\n\\nDefinitions\\na. Intellectual Property (IP): Refers to creations of the mind, such as inventions, literary and artistic works, designs, symbols, and images, that are protected by copyright, trademark, patent, or other forms of legal protection.\\nb. Company Time: Refers to the time during which an employee is actively engaged in performing their job duties.\\nc. Outside Company Time: Refers to the time during which an employee is not engaged in performing their job duties.', metadata={'summary': \"This Intellectual Property Policy outlines guidelines and procedures for the ownership, protection, and utilization of intellectual property generated by employees during their employment. It establishes the company's ownership of work generated on company time, while recognizing employee ownership of work generated outside of company time without the use of company resources. 
The policy\", 'updated_at': None, 'name': 'Intellectual Property Policy', 'source': '/Users/joe/projects/elastic/elasticsearch-labs/notebooks/langchain/temp.json', 'category': 'sharepoint', 'seq_num': 8, 'url': 'https://enterprisesearch.sharepoint.com/:t:/s/MSBuilddemo/EWz3cYEVdzBNsiHsYbKhms4BVYGhravyrUw3T3lzxL4pTg?e=mPIgbO'})]" ] }, - "execution_count": 68, + "execution_count": 76, "metadata": {}, "output_type": "execute_result" } From 0792ec65636263998298351ec4e6d54bfefc0d39 Mon Sep 17 00:00:00 2001 From: Joseph McElroy Date: Wed, 6 Dec 2023 16:39:06 +0000 Subject: [PATCH 6/7] add colab link --- notebooks/langchain/langchain-parent-retriever.ipynb | 2 ++ 1 file changed, 2 insertions(+) diff --git a/notebooks/langchain/langchain-parent-retriever.ipynb b/notebooks/langchain/langchain-parent-retriever.ipynb index eb7e3cde..b61f58ef 100644 --- a/notebooks/langchain/langchain-parent-retriever.ipynb +++ b/notebooks/langchain/langchain-parent-retriever.ipynb @@ -5,6 +5,8 @@ "metadata": {}, "source": [ "# Parent Child Retriever Examples\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/elastic/elasticsearch-labs/blob/main/notebooks/langchain/langchain-parent-retriever.ipynb)\n", + "\n", "**Using Elasticsearch Nested Dense Vector Support**\n", "\n", "When splitting documents for retrieval, there are often conflicting desires:\n", From 1ae09e742f0253db158ef7bb400bcab51ab3a31f Mon Sep 17 00:00:00 2001 From: Joseph McElroy Date: Mon, 18 Dec 2023 11:04:01 +0000 Subject: [PATCH 7/7] rename to langchain splitters --- README.md | 2 + .../with-langchain-splitters.ipynb} | 361 ++++-------------- 2 files changed, 74 insertions(+), 289 deletions(-) rename notebooks/{langchain/langchain-parent-retriever.ipynb => document-chunking/with-langchain-splitters.ipynb} (50%) diff --git a/README.md b/README.md index 8671f74a..ba49ad63 100644 --- a/README.md +++ b/README.md @@ -38,7 +38,9 @@ The 
[`notebooks`](notebooks/README.md) folder contains a range of executable Pyt - [`langchain-using-own-model.ipynb`](./notebooks/langchain/langchain-using-own-model.ipynb) ### Document Chunking + - [`Document Chunking with Ingest Pipelines`](./notebooks/document-chunking/with-index-pipelines.ipynb) +- [`Document Chunking with LangChain Splitters`](./notebooks/document-chunking/with-langchain-splitters.ipynb) ### Search diff --git a/notebooks/langchain/langchain-parent-retriever.ipynb b/notebooks/document-chunking/with-langchain-splitters.ipynb similarity index 50% rename from notebooks/langchain/langchain-parent-retriever.ipynb rename to notebooks/document-chunking/with-langchain-splitters.ipynb index b61f58ef..cb28df95 100644 --- a/notebooks/langchain/langchain-parent-retriever.ipynb +++ b/notebooks/document-chunking/with-langchain-splitters.ipynb @@ -4,19 +4,15 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Parent Child Retriever Examples\n", - "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/elastic/elasticsearch-labs/blob/main/notebooks/langchain/langchain-parent-retriever.ipynb)\n", + "# Document Chunking With LangChain Document Splitters\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/elastic/elasticsearch-labs/blob/main/notebooks/document-chunking/with-langchain-splitters.ipynb)\n", "\n", "**Using Elasticsearch Nested Dense Vector Support**\n", "\n", "When splitting documents for retrieval, there are often conflicting desires:\n", "\n", - "- You may want to have small documents, so that their embeddings can most accurately reflect their meaning. 
If too long, then the embeddings can lose meaning.\n", - "- You want to have long enough documents that the context of each chunk is retained.\n", - "\n", - "We can take advantage of Nested Dense Vector capability in Elasticsearch to store both large passages and smaller linked passages in one document. During retrieval, we query for small passages which link back to a larger parent passage.\n", - "\n", - "Note that “parent document” refers to the document that a small chunk originated from. This can either be the whole raw document OR a larger chunk." + "This interactive notebook will:\n", + "- Load the model \"sentence-transformers__all-minilm-l6-v2\" from Hugging Face into an Elasticsearch ML node\n", + "- Use LangChain splitters to chunk the documents into passages and index them into Elasticsearch with nested dense vectors\n", + "- Perform a search and return docs with the most relevant passages" ] }, { @@ -31,9 +27,21 @@ }, { "cell_type": "code", - "execution_count": 36, + "execution_count": 1, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. 
This behaviour is the source of the following dependency conflicts.\n", + "langserve 0.0.21 requires pydantic<2,>=1, but you have pydantic 2.3.0 which is incompatible.\n", + "poetry 1.6.1 requires build<0.11.0,>=0.10.0, but you have build 1.0.3 which is incompatible.\n", + "poetry 1.6.1 requires jsonschema<4.18.0,>=4.10.0, but you have jsonschema 4.19.1 which is incompatible.\u001b[0m\u001b[31m\n", + "\u001b[0m" + ] + } + ], "source": [ "!python3 -m pip install -qU langchain elasticsearch " ] @@ -51,7 +59,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -67,7 +75,7 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -88,7 +96,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ @@ -131,6 +139,32 @@ ")" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Load Model from Hugging Face\n", + "The first thing you will need is a model to create the text embeddings from the chunks. You can use any model you like, but this example runs end to end on the all-MiniLM-L6-v2 model. With an Elastic Cloud cluster created or another Elasticsearch cluster ready, we can upload the text embedding model using the eland library (installable with `pip install 'eland[pytorch]'`)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "MODEL_ID = \"sentence-transformers__all-minilm-l6-v2\"\n", + "\n", + "!eland_import_hub_model \\\n", + " --cloud-id $ELASTIC_CLOUD_ID \\\n", + " --es-username elastic \\\n", + " --es-api-key $ELASTIC_API_KEY \\\n", + " --hub-model-id \"sentence-transformers/all-MiniLM-L6-v2\" \\\n", + " --task-type text_embedding \\\n", + " --clear-previous \\\n", + " --start" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -143,7 +177,7 @@ }, { "cell_type": "code", - "execution_count": 43, + "execution_count": 9, "metadata": {}, "outputs": [ { @@ -152,14 +186,13 @@ "ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'nb_parent_retriever_index'})" ] }, - "execution_count": 43, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "PIPELINE_ID = \"chunk_text_to_passages\"\n", - "MODEL_ID = \"sentence-transformers__all-minilm-l6-v2\"\n", "MODEL_DIMS = 384\n", "INDEX_NAME = \"nb_parent_retriever_index\"\n", "\n", @@ -241,20 +274,15 @@ }, { "cell_type": "code", - "execution_count": 44, + "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "from langchain.text_splitter import RecursiveCharacterTextSplitter\n", "\n", - "def parent_child_splitter(data, parent_chunk_size: int | None = None, child_chunk_size: int = 200):\n", - " if parent_chunk_size:\n", - " parent_splitter = RecursiveCharacterTextSplitter(chunk_size=parent_chunk_size)\n", - " documents = parent_splitter.split_documents(data)\n", - " else:\n", - " documents = data\n", + "def parent_child_splitter(documents, chunk_size: int = 200):\n", "\n", - " child_splitter = RecursiveCharacterTextSplitter(chunk_size=child_chunk_size)\n", + " child_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size)\n", "\n", " docs = []\n", " for i, doc in enumerate(documents):\n", @@ -285,7 +313,7 @@ }, { "cell_type": "code", - 
"execution_count": 58, + "execution_count": 11, "metadata": {}, "outputs": [], "source": [ @@ -316,7 +344,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Example 1: Full Document, nested passages\n", + "## Full Document, nested passages\n", "In this example we will split a document into passages, and store the full document as a parent document. We will then store the passages as nested documents, with a link back to the parent document.\n", "\n", "Below we are using the parent child splitter to split the full documents into passages. The `parent_child_splitter` fn returns a list of documents, with an array of nested passages. \n", @@ -328,7 +356,7 @@ }, { "cell_type": "code", - "execution_count": 71, + "execution_count": 12, "metadata": {}, "outputs": [ { @@ -342,7 +370,7 @@ "source": [ "from elasticsearch import helpers\n", "\n", - "chunked_docs = parent_child_splitter(loader.load(), parent_chunk_size=None, child_chunk_size=600)\n", + "chunked_docs = parent_child_splitter(loader.load(), chunk_size=600)\n", "\n", "count, errors = helpers.bulk(\n", " client, \n", @@ -363,7 +391,7 @@ }, { "cell_type": "code", - "execution_count": 72, + "execution_count": 13, "metadata": {}, "outputs": [ { @@ -371,7 +399,7 @@ "output_type": "stream", "text": [ "\n", - "ID: 2vn8P4wBeCQuLJUsMR_I\n", + "ID: HP6WfIwBeCQuLJUs19ov\n", "Doc Title: Work From Home Policy\n", "parent text:\n", "\n", @@ -390,7 +418,7 @@ "\n", "---\n", "\n", - "ID: 4fn8P4wBeCQuLJUsMR_I\n", + "ID: I_6WfIwBeCQuLJUs19ov\n", "Doc Title: Intellectual Property Policy\n", "parent text:\n", "\n", @@ -406,7 +434,7 @@ "\n", "---\n", "\n", - "ID: 3vn8P4wBeCQuLJUsMR_I\n", + "ID: IP6WfIwBeCQuLJUs19ov\n", "Doc Title: Company Vacation Policy\n", "parent text:\n", "\n", @@ -424,7 +452,7 @@ "\n", "---\n", "\n", - "ID: 3Pn8P4wBeCQuLJUsMR_I\n", + "ID: Hv6WfIwBeCQuLJUs19ov\n", "Doc Title: Wfh Policy Update May 2023\n", "parent text:\n", "\n", @@ -436,7 +464,7 @@ "\n", "---\n", "\n", - "ID: 6Pn8P4wBeCQuLJUsMR_I\n", + 
"ID: Kv6WfIwBeCQuLJUs19ov\n", "Doc Title: New Employee Onboarding Guide\n", "parent text:\n", "\n", @@ -491,19 +519,19 @@ }, { "cell_type": "code", - "execution_count": 73, + "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "[Document(page_content='Effective: March 2020\\nPurpose\\n\\nThe purpose of this full-time work-from-home policy is to provide guidelines and support for employees to conduct their work remotely, ensuring the continuity and productivity of business operations during the COVID-19 pandemic and beyond.\\nScope\\n\\nThis policy applies to all employees who are eligible for remote work as determined by their role and responsibilities. It is designed to allow employees to work from home full time while maintaining the same level of performance and collaboration as they would in the office.\\nEligibility\\n\\n', metadata={'summary': 'This policy outlines the guidelines for full-time remote work, including eligibility, equipment and resources, workspace requirements, communication expectations, performance expectations, time tracking and overtime, confidentiality and data security, health and well-being, and policy reviews and updates. Employees are encouraged to direct any questions or concerns', 'updated_at': '2020-03-01', 'name': 'Work From Home Policy', 'source': '/Users/joe/projects/elastic/elasticsearch-labs/notebooks/langchain/temp.json', 'category': 'teams', 'seq_num': 1, 'url': './sharepoint/Work from home policy.txt'}),\n", - " Document(page_content='Purpose\\nThe purpose of this Intellectual Property Policy is to establish guidelines and procedures for the ownership, protection, and utilization of intellectual property generated by employees during their employment. 
This policy aims to encourage creativity and innovation while ensuring that the interests of both the company and its employees are protected.\\n\\nScope\\nThis policy applies to all employees, including full-time, part-time, temporary, and contract employees.\\n\\n', metadata={'summary': \"This Intellectual Property Policy outlines guidelines and procedures for the ownership, protection, and utilization of intellectual property generated by employees during their employment. It establishes the company's ownership of work generated on company time, while recognizing employee ownership of work generated outside of company time without the use of company resources. The policy\", 'updated_at': None, 'name': 'Intellectual Property Policy', 'source': '/Users/joe/projects/elastic/elasticsearch-labs/notebooks/langchain/temp.json', 'category': 'sharepoint', 'seq_num': 8, 'url': 'https://enterprisesearch.sharepoint.com/:t:/s/MSBuilddemo/EWz3cYEVdzBNsiHsYbKhms4BVYGhravyrUw3T3lzxL4pTg?e=mPIgbO'}),\n", - " Document(page_content='Purpose\\n\\nThe purpose of this vacation policy is to outline the guidelines and procedures for requesting and taking time off from work for personal and leisure purposes. This policy aims to promote a healthy work-life balance and encourage employees to take time to rest and recharge.\\nScope\\n\\nThis policy applies to all full-time and part-time employees who have completed their probationary period.\\nVacation Accrual\\n\\n', metadata={'summary': ': This policy outlines the guidelines and procedures for requesting and taking time off from work for personal and leisure purposes. Full-time employees accrue vacation time at a rate of [X hours] per month, equivalent to [Y days] per year. 
Vacation requests must be submitted to supervisors at least', 'updated_at': '2018-04-16', 'name': 'Company Vacation Policy', 'source': '/Users/joe/projects/elastic/elasticsearch-labs/notebooks/langchain/temp.json', 'category': 'sharepoint', 'seq_num': 5, 'url': 'https://enterprisesearch.sharepoint.com/:t:/s/MSBuilddemo/ES6rw9bKZxVBobG1WUoJpikBF9Bhx1pw_GvJWbsg-Z_HNA?e=faSHVt'}),\n", - " Document(page_content='As we continue to prioritize the well-being of our employees, we are making a slight adjustment to our hybrid work policy. Starting May 1, 2023, employees will be required to work from the office three days a week, with two days designated for remote work. Please communicate with your supervisor and HR department to establish your updated in-office workdays.\\n\\n', metadata={'summary': 'Starting May 1, 2023, our hybrid work policy will require employees to work from the office three days a week and two days remotely.', 'updated_at': '2023-05-01', 'name': 'Wfh Policy Update May 2023', 'source': '/Users/joe/projects/elastic/elasticsearch-labs/notebooks/langchain/temp.json', 'category': 'teams', 'seq_num': 3, 'url': './sharepoint/WFH policy update May 2023.txt'})]" + "[Document(page_content='Effective: March 2020\\nPurpose\\n\\nThe purpose of this full-time work-from-home policy is to provide guidelines and support for employees to conduct their work remotely, ensuring the continuity and productivity of business operations during the COVID-19 pandemic and beyond.\\nScope\\n\\nThis policy applies to all employees who are eligible for remote work as determined by their role and responsibilities. 
It is designed to allow employees to work from home full time while maintaining the same level of performance and collaboration as they would in the office.\\nEligibility\\n\\n', metadata={'summary': 'This policy outlines the guidelines for full-time remote work, including eligibility, equipment and resources, workspace requirements, communication expectations, performance expectations, time tracking and overtime, confidentiality and data security, health and well-being, and policy reviews and updates. Employees are encouraged to direct any questions or concerns', 'updated_at': '2020-03-01', 'name': 'Work From Home Policy', 'source': '/Users/joe/projects/elastic/elasticsearch-labs/notebooks/document-chunking/temp.json', 'category': 'teams', 'seq_num': 1, 'url': './sharepoint/Work from home policy.txt'}),\n", + " Document(page_content='Purpose\\nThe purpose of this Intellectual Property Policy is to establish guidelines and procedures for the ownership, protection, and utilization of intellectual property generated by employees during their employment. This policy aims to encourage creativity and innovation while ensuring that the interests of both the company and its employees are protected.\\n\\nScope\\nThis policy applies to all employees, including full-time, part-time, temporary, and contract employees.\\n\\n', metadata={'summary': \"This Intellectual Property Policy outlines guidelines and procedures for the ownership, protection, and utilization of intellectual property generated by employees during their employment. It establishes the company's ownership of work generated on company time, while recognizing employee ownership of work generated outside of company time without the use of company resources. 
The policy\", 'updated_at': None, 'name': 'Intellectual Property Policy', 'source': '/Users/joe/projects/elastic/elasticsearch-labs/notebooks/document-chunking/temp.json', 'category': 'sharepoint', 'seq_num': 8, 'url': 'https://enterprisesearch.sharepoint.com/:t:/s/MSBuilddemo/EWz3cYEVdzBNsiHsYbKhms4BVYGhravyrUw3T3lzxL4pTg?e=mPIgbO'}),\n", + " Document(page_content='Purpose\\n\\nThe purpose of this vacation policy is to outline the guidelines and procedures for requesting and taking time off from work for personal and leisure purposes. This policy aims to promote a healthy work-life balance and encourage employees to take time to rest and recharge.\\nScope\\n\\nThis policy applies to all full-time and part-time employees who have completed their probationary period.\\nVacation Accrual\\n\\n', metadata={'summary': ': This policy outlines the guidelines and procedures for requesting and taking time off from work for personal and leisure purposes. Full-time employees accrue vacation time at a rate of [X hours] per month, equivalent to [Y days] per year. Vacation requests must be submitted to supervisors at least', 'updated_at': '2018-04-16', 'name': 'Company Vacation Policy', 'source': '/Users/joe/projects/elastic/elasticsearch-labs/notebooks/document-chunking/temp.json', 'category': 'sharepoint', 'seq_num': 5, 'url': 'https://enterprisesearch.sharepoint.com/:t:/s/MSBuilddemo/ES6rw9bKZxVBobG1WUoJpikBF9Bhx1pw_GvJWbsg-Z_HNA?e=faSHVt'}),\n", + " Document(page_content='As we continue to prioritize the well-being of our employees, we are making a slight adjustment to our hybrid work policy. Starting May 1, 2023, employees will be required to work from the office three days a week, with two days designated for remote work. 
Please communicate with your supervisor and HR department to establish your updated in-office workdays.\\n\\n', metadata={'summary': 'Starting May 1, 2023, our hybrid work policy will require employees to work from the office three days a week and two days remotely.', 'updated_at': '2023-05-01', 'name': 'Wfh Policy Update May 2023', 'source': '/Users/joe/projects/elastic/elasticsearch-labs/notebooks/document-chunking/temp.json', 'category': 'teams', 'seq_num': 3, 'url': './sharepoint/WFH policy update May 2023.txt'})]" ] }, - "execution_count": 73, + "execution_count": 14, "metadata": {}, "output_type": "execute_result" } @@ -568,254 +596,9 @@ "vector_store.similarity_search(query=\"Whats the work from home policy?\", doc_builder=doc_builder)" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Example 2: Parent Child Retriever\n", - "In the above example, we are storing the full document in the parent document. You can also still chunk the document into chunks that are large enough to retain context, but split the chunk into many small passages and store them in the parent chunk. This allows you to retrieve the parent chunk, but the passage embeddings can be very precise which link back to the parent chunk.\n", - "\n", - "Below we are using the same parent_child_splitter, but we are specifying the `parent_chunk_size` to be 2000 characters. This means that the parent chunk will be 2000 characters long, and the passages will be 200 characters long.\n", - "\n", - "You can see from the response we have now stored 32 documents in our index, representing the 15 documents from our dataset." 
- ] - }, - { - "cell_type": "code", - "execution_count": 74, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Indexed 32 documents with [] errors\n" - ] - } - ], - "source": [ - "# delete documents in the index\n", - "client.delete_by_query(index=INDEX_NAME, query={\"match_all\": {}})\n", - "\n", - "# Index the documents, this time with parent-child splitting\n", - "chunked_docs = parent_child_splitter(loader.load(), parent_chunk_size=2000, child_chunk_size=200)\n", - "\n", - "count, errors = helpers.bulk(\n", - " client, \n", - " chunked_docs,\n", - " index=INDEX_NAME\n", - ")\n", - "\n", - "print(f\"Indexed {count} documents with {errors} errors\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Retrieving the parent Chunks\n", - "We can perform a normal nested dense vector query to retrieve the parent chunks. We can see that the parent chunks are returned, but the passages are not.\n", - "\n", - "In this example we can see the content is much larger, giving us more context and the embeddings remain as precise as the previous example." - ] - }, - { - "cell_type": "code", - "execution_count": 75, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "ID: 7_n8P4wBeCQuLJUsox-g\n", - "Doc Title: Work From Home Policy\n", - "parent text:\n", - "Effective: March 2020\n", - "Purpose\n", - "\n", - "The purpose of this full-time work-from-home policy is to provide guidelines and support for employees to conduct their work remotely, ensuring the continuity and productivity of business operations during the COVID-19 pandemic and beyond.\n", - "Scope\n", - "\n", - "This policy applies to all employees who are eligible for remote work as determined by their role and responsibilities. 
It is designed to allow employees to work from home full time while maintaining the same level of performance and collaboration as they would in the office.\n", - "Eligibility\n", - "\n", - "Employees who can perform their work duties remotely and have received approval from their direct supervisor and the HR department are eligible for this work-from-home arrangement.\n", - "Equipment and Resources\n", - "\n", - "The necessary equipment and resources will be provided to employees for remote work, including a company-issued laptop, software licenses, and access to secure communication tools. Employees are responsible for maintaining and protecting the company's equipment and data.\n", - "Workspace\n", - "\n", - "Employees working from home are responsible for creating a comfortable and safe workspace that is conducive to productivity. This includes ensuring that their home office is ergonomically designed, well-lit, and free from distractions.\n", - "Communication\n", - "\n", - "Effective communication is vital for successful remote work. Employees are expected to maintain regular communication with their supervisors, colleagues, and team members through email, phone calls, video conferences, and other approved communication tools.\n", - "Work Hours and Availability\n", - "\n", - "Employees are expected to maintain their regular work hours and be available during normal business hours, unless otherwise agreed upon with their supervisor. 
Any changes to work hours or availability must be communicated to the employee's supervisor and the HR department.\n", - "Performance Expectations\n", - "Passage Text:\n", - "The purpose of this full-time work-from-home policy is to provide guidelines and support for employees to conduct their work remotely, ensuring the continuity and productivity of business operations\n", - "\n", - "\n", - "Score: 0.8779937\n", - "\n", - "---\n", - "\n", - "ID: 8Pn8P4wBeCQuLJUsox-g\n", - "Doc Title: Work From Home Policy\n", - "parent text:\n", - "Employees working from home are expected to maintain the same level of performance and productivity as if they were working in the office. Supervisors and team members will collaborate to establish clear expectations and goals for remote work.\n", - "Time Tracking and Overtime\n", - "\n", - "Employees are required to accurately track their work hours using the company's time tracking system. Non-exempt employees must obtain approval from their supervisor before working overtime.\n", - "Confidentiality and Data Security\n", - "\n", - "Employees must adhere to the company's confidentiality and data security policies while working from home. This includes safeguarding sensitive information, securing personal devices and internet connections, and reporting any security breaches to the IT department.\n", - "Health and Well-being\n", - "\n", - "The company encourages employees to prioritize their health and well-being while working from home. 
This includes taking regular breaks, maintaining a work-life balance, and seeking support from supervisors and colleagues when needed.\n", - "Policy Review and Updates\n", - "\n", - "This work-from-home policy will be reviewed periodically and updated as necessary, taking into account changes in public health guidance, business needs, and employee feedback.\n", - "Questions and Concerns\n", - "\n", - "Employees are encouraged to direct any questions or concerns about this policy to their supervisor or the HR department.\n", - "Passage Text:\n", - "policies while working from home. This includes safeguarding sensitive information, securing personal devices and internet connections, and reporting any security breaches to the IT department.\n", - "\n", - "\n", - "Score: 0.8585499\n", - "\n", - "---\n", - "\n", - "ID: 8vn8P4wBeCQuLJUsox-g\n", - "Doc Title: Wfh Policy Update May 2023\n", - "parent text:\n", - "As we continue to prioritize the well-being of our employees, we are making a slight adjustment to our hybrid work policy. Starting May 1, 2023, employees will be required to work from the office three days a week, with two days designated for remote work. Please communicate with your supervisor and HR department to establish your updated in-office workdays.\n", - "Passage Text:\n", - "adjustment to our hybrid work policy. Starting May 1, 2023, employees will be required to work from the office three days a week, with two days designated for remote work. Please communicate with\n", - "\n", - "\n", - "Score: 0.7466323\n", - "\n", - "---\n", - "\n", - "ID: _Pn8P4wBeCQuLJUsox-g\n", - "Doc Title: Intellectual Property Policy\n", - "parent text:\n", - "Purpose\n", - "The purpose of this Intellectual Property Policy is to establish guidelines and procedures for the ownership, protection, and utilization of intellectual property generated by employees during their employment. 
This policy aims to encourage creativity and innovation while ensuring that the interests of both the company and its employees are protected.\n", - "\n", - "Scope\n", - "This policy applies to all employees, including full-time, part-time, temporary, and contract employees.\n", - "\n", - "Definitions\n", - "a. Intellectual Property (IP): Refers to creations of the mind, such as inventions, literary and artistic works, designs, symbols, and images, that are protected by copyright, trademark, patent, or other forms of legal protection.\n", - "b. Company Time: Refers to the time during which an employee is actively engaged in performing their job duties.\n", - "c. Outside Company Time: Refers to the time during which an employee is not engaged in performing their job duties.\n", - "Passage Text:\n", - "property generated by employees during their employment. This policy aims to encourage creativity and innovation while ensuring that the interests of both the company and its employees are protected.\n", - "\n", - "\n", - "Score: 0.74160063\n", - "\n", - "---\n", - "\n", - "ID: 9fn8P4wBeCQuLJUsox-g\n", - "Doc Title: Company Vacation Policy\n", - "parent text:\n", - "Purpose\n", - "\n", - "The purpose of this vacation policy is to outline the guidelines and procedures for requesting and taking time off from work for personal and leisure purposes. This policy aims to promote a healthy work-life balance and encourage employees to take time to rest and recharge.\n", - "Scope\n", - "\n", - "This policy applies to all full-time and part-time employees who have completed their probationary period.\n", - "Vacation Accrual\n", - "\n", - "Full-time employees accrue vacation time at a rate of [X hours] per month, equivalent to [Y days] per year. 
Part-time employees accrue vacation time on a pro-rata basis, calculated according to their scheduled work hours.\n", - "\n", - "Vacation time will begin to accrue from the first day of employment, but employees are eligible to take vacation time only after completing their probationary period. Unused vacation time will be carried over to the next year, up to a maximum of [Z days]. Any additional unused vacation time will be forfeited.\n", - "Vacation Scheduling\n", - "\n", - "Employees are required to submit vacation requests to their supervisor at least [A weeks] in advance, specifying the start and end dates of their vacation. Supervisors will review and approve vacation requests based on business needs, ensuring adequate coverage during the employee's absence.\n", - "\n", - "Employees are encouraged to plan their vacations around the company's peak and non-peak periods to minimize disruptions. Vacation requests during peak periods may be subject to limitations and require additional advance notice.\n", - "Vacation Pay\n", - "\n", - "Employees will receive their regular pay during their approved vacation time. Vacation pay will be calculated based on the employee's average earnings over the [B weeks] preceding their vacation.\n", - "Unplanned Absences and Vacation Time\n", - "Passage Text:\n", - "of this vacation policy is to outline the guidelines and procedures for requesting and taking time off from work for personal and leisure purposes. 
This policy aims to promote a healthy work-life\n", - "\n", - "\n", - "Score: 0.7381881\n", - "\n", - "---\n" - ] - } - ], - "source": [ - "response = client.search(\n", - " index=INDEX_NAME,\n", - " source_includes=[\"content\", \"metadata\"],\n", - " knn={\n", - " \"inner_hits\": {\n", - " \"_source\": False,\n", - " \"fields\": [\n", - " \"passages.text\"\n", - " ]\n", - " },\n", - " \"field\": \"passages.vector.predicted_value\",\n", - " \"k\": 5,\n", - " \"num_candidates\": 100,\n", - " \"query_vector_builder\": {\n", - " \"text_embedding\": {\n", - " \"model_id\": \"sentence-transformers__all-minilm-l6-v2\",\n", - " \"model_text\": \"Whats the work from home policy?\"\n", - " }\n", - " }\n", - " }\n", - ")\n", - "\n", - "pretty_response(response, show_parent_text=True)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### With Langchain\n", - "You can also use Langchain to retrieve the passages from the parent chunks. In combination with the nested query search configured in the ElasticsearchStore strategy, we retrieve the parent chunks that are relevant to one or more chunked passages. " - ] - }, - { - "cell_type": "code", - "execution_count": 76, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[Document(page_content=\"Effective: March 2020\\nPurpose\\n\\nThe purpose of this full-time work-from-home policy is to provide guidelines and support for employees to conduct their work remotely, ensuring the continuity and productivity of business operations during the COVID-19 pandemic and beyond.\\nScope\\n\\nThis policy applies to all employees who are eligible for remote work as determined by their role and responsibilities. 
It is designed to allow employees to work from home full time while maintaining the same level of performance and collaboration as they would in the office.\\nEligibility\\n\\nEmployees who can perform their work duties remotely and have received approval from their direct supervisor and the HR department are eligible for this work-from-home arrangement.\\nEquipment and Resources\\n\\nThe necessary equipment and resources will be provided to employees for remote work, including a company-issued laptop, software licenses, and access to secure communication tools. Employees are responsible for maintaining and protecting the company's equipment and data.\\nWorkspace\\n\\nEmployees working from home are responsible for creating a comfortable and safe workspace that is conducive to productivity. This includes ensuring that their home office is ergonomically designed, well-lit, and free from distractions.\\nCommunication\\n\\nEffective communication is vital for successful remote work. Employees are expected to maintain regular communication with their supervisors, colleagues, and team members through email, phone calls, video conferences, and other approved communication tools.\\nWork Hours and Availability\\n\\nEmployees are expected to maintain their regular work hours and be available during normal business hours, unless otherwise agreed upon with their supervisor. Any changes to work hours or availability must be communicated to the employee's supervisor and the HR department.\\nPerformance Expectations\", metadata={'summary': 'This policy outlines the guidelines for full-time remote work, including eligibility, equipment and resources, workspace requirements, communication expectations, performance expectations, time tracking and overtime, confidentiality and data security, health and well-being, and policy reviews and updates. 
Employees are encouraged to direct any questions or concerns', 'updated_at': '2020-03-01', 'name': 'Work From Home Policy', 'source': '/Users/joe/projects/elastic/elasticsearch-labs/notebooks/langchain/temp.json', 'category': 'teams', 'seq_num': 1, 'url': './sharepoint/Work from home policy.txt'}),\n", - " Document(page_content=\"Employees working from home are expected to maintain the same level of performance and productivity as if they were working in the office. Supervisors and team members will collaborate to establish clear expectations and goals for remote work.\\nTime Tracking and Overtime\\n\\nEmployees are required to accurately track their work hours using the company's time tracking system. Non-exempt employees must obtain approval from their supervisor before working overtime.\\nConfidentiality and Data Security\\n\\nEmployees must adhere to the company's confidentiality and data security policies while working from home. This includes safeguarding sensitive information, securing personal devices and internet connections, and reporting any security breaches to the IT department.\\nHealth and Well-being\\n\\nThe company encourages employees to prioritize their health and well-being while working from home. 
This includes taking regular breaks, maintaining a work-life balance, and seeking support from supervisors and colleagues when needed.\\nPolicy Review and Updates\\n\\nThis work-from-home policy will be reviewed periodically and updated as necessary, taking into account changes in public health guidance, business needs, and employee feedback.\\nQuestions and Concerns\\n\\nEmployees are encouraged to direct any questions or concerns about this policy to their supervisor or the HR department.\", metadata={'summary': 'This policy outlines the guidelines for full-time remote work, including eligibility, equipment and resources, workspace requirements, communication expectations, performance expectations, time tracking and overtime, confidentiality and data security, health and well-being, and policy reviews and updates. Employees are encouraged to direct any questions or concerns', 'updated_at': '2020-03-01', 'name': 'Work From Home Policy', 'source': '/Users/joe/projects/elastic/elasticsearch-labs/notebooks/langchain/temp.json', 'category': 'teams', 'seq_num': 1, 'url': './sharepoint/Work from home policy.txt'}),\n", - " Document(page_content='As we continue to prioritize the well-being of our employees, we are making a slight adjustment to our hybrid work policy. Starting May 1, 2023, employees will be required to work from the office three days a week, with two days designated for remote work. 
Please communicate with your supervisor and HR department to establish your updated in-office workdays.', metadata={'summary': 'Starting May 1, 2023, our hybrid work policy will require employees to work from the office three days a week and two days remotely.', 'updated_at': '2023-05-01', 'name': 'Wfh Policy Update May 2023', 'source': '/Users/joe/projects/elastic/elasticsearch-labs/notebooks/langchain/temp.json', 'category': 'teams', 'seq_num': 3, 'url': './sharepoint/WFH policy update May 2023.txt'}),\n", - " Document(page_content='Purpose\\nThe purpose of this Intellectual Property Policy is to establish guidelines and procedures for the ownership, protection, and utilization of intellectual property generated by employees during their employment. This policy aims to encourage creativity and innovation while ensuring that the interests of both the company and its employees are protected.\\n\\nScope\\nThis policy applies to all employees, including full-time, part-time, temporary, and contract employees.\\n\\nDefinitions\\na. Intellectual Property (IP): Refers to creations of the mind, such as inventions, literary and artistic works, designs, symbols, and images, that are protected by copyright, trademark, patent, or other forms of legal protection.\\nb. Company Time: Refers to the time during which an employee is actively engaged in performing their job duties.\\nc. Outside Company Time: Refers to the time during which an employee is not engaged in performing their job duties.', metadata={'summary': \"This Intellectual Property Policy outlines guidelines and procedures for the ownership, protection, and utilization of intellectual property generated by employees during their employment. It establishes the company's ownership of work generated on company time, while recognizing employee ownership of work generated outside of company time without the use of company resources. 
The policy\", 'updated_at': None, 'name': 'Intellectual Property Policy', 'source': '/Users/joe/projects/elastic/elasticsearch-labs/notebooks/langchain/temp.json', 'category': 'sharepoint', 'seq_num': 8, 'url': 'https://enterprisesearch.sharepoint.com/:t:/s/MSBuilddemo/EWz3cYEVdzBNsiHsYbKhms4BVYGhravyrUw3T3lzxL4pTg?e=mPIgbO'})]" - ] - }, - "execution_count": 76, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "vector_store.similarity_search(query=\"Whats the work from home policy?\")" - ] - }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 8, "metadata": {}, "outputs": [ { @@ -824,7 +607,7 @@ "ObjectApiResponse({'acknowledged': True})" ] }, - "execution_count": 22, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" }