Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add custom port to ElasticsearchDocumentStore #129

Merged
merged 1 commit into from
Jun 2, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
1 change: 1 addition & 0 deletions haystack/api/config.py
Expand Up @@ -9,6 +9,7 @@

# DB
DB_HOST = os.getenv("DB_HOST", "localhost")
DB_PORT = int(os.getenv("DB_PORT", 9200))
DB_USER = os.getenv("DB_USER", "")
DB_PW = os.getenv("DB_PW", "")
DB_INDEX = os.getenv("DB_INDEX", "document")
Expand Down
4 changes: 2 additions & 2 deletions haystack/api/elasticsearch_client.py
@@ -1,7 +1,7 @@
from elasticsearch import Elasticsearch

from haystack.api.config import DB_HOST, DB_USER, DB_PW
from haystack.api.config import DB_HOST, DB_USER, DB_PW, DB_PORT, ES_CONN_SCHEME

elasticsearch_client = Elasticsearch(
hosts=[{"host": DB_HOST}], http_auth=(DB_USER, DB_PW), scheme="http", ca_certs=False, verify_certs=False
hosts=[{"host": DB_HOST, "port": DB_PORT}], http_auth=(DB_USER, DB_PW), scheme=ES_CONN_SCHEME, ca_certs=False, verify_certs=False
)
66 changes: 46 additions & 20 deletions haystack/database/elasticsearch.py
@@ -1,7 +1,7 @@
import json
import logging
from string import Template

from typing import Union
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk, scan

Expand All @@ -13,25 +13,52 @@
class ElasticsearchDocumentStore(BaseDocumentStore):
def __init__(
self,
host="localhost",
username="",
password="",
index="document",
search_fields="text",
text_field="text",
name_field="name",
external_source_id_field="external_source_id",
tag_fields=None,
embedding_field=None,
embedding_dim=None,
custom_mapping=None,
excluded_meta_data=None,
scheme="http",
ca_certs=False,
verify_certs=True,
create_index=True
host: str = "localhost",
port: int = 9200,
username: str = "",
password: str = "",
index: str = "document",
search_fields: Union[str,list] = "text",
text_field: str = "text",
name_field: str = "name",
external_source_id_field: str = "external_source_id",
embedding_field: str = None,
embedding_dim: str = None,
custom_mapping: dict = None,
excluded_meta_data: list = None,
scheme: str = "http",
ca_certs: bool = False,
verify_certs: bool = True,
create_index: bool = True
):
self.client = Elasticsearch(hosts=[{"host": host}], http_auth=(username, password),
"""
A DocumentStore using Elasticsearch to store and query the documents for our search.

* Keeps all the logic to store and query documents from Elastic, incl. mapping of fields, adding filters or boosts to your queries, and storing embeddings
* You can either use an existing Elasticsearch index or create a new one via haystack
* Retrievers operate on top of this DocumentStore to find the relevant documents for a query

:param host: Hostname or IP address of the Elasticsearch instance, without scheme or port (e.g. "localhost")
:param port: port of elasticsearch
:param username: username
:param password: password
:param index: Name of the index in Elasticsearch to use. If it does not exist yet, we will create one.
:param search_fields: Name of fields used by ElasticsearchRetriever to find matches in the docs to our incoming query (using elastic's multi_match query), e.g. ["title", "full_text"]
:param text_field: Name of field that might contain the answer and will therefore be passed to the Reader Model (e.g. "full_text").
If no Reader is used (e.g. in FAQ-Style QA) the plain content of this field will just be returned.
:param name_field: Name of field that contains the title of the doc
:param external_source_id_field: If you have an external id (= non-elasticsearch) that identifies your documents, you can specify it here.
:param embedding_field: Name of field containing an embedding vector (Only needed when using the EmbeddingRetriever on top)
:param embedding_dim: Dimensionality of embedding vector (Only needed when using the EmbeddingRetriever on top)
:param custom_mapping: If you want to use your own custom mapping for creating a new index in Elasticsearch, you can supply it here as a dictionary.
:param excluded_meta_data: Name of fields in Elasticsearch that should not be returned (e.g. [field_one, field_two]).
Helpful if you have fields with long, irrelevant content that you don't want to display in results (e.g. embedding vectors).
:param scheme: 'https' or 'http', protocol used to connect to your elasticsearch instance
:param ca_certs: Root certificates for SSL
:param verify_certs: Whether to be strict about ca certificates
:param create_index: Whether to try creating a new index (If an index of that name already exists, we will just continue in any case)
"""
self.client = Elasticsearch(hosts=[{"host": host, "port": port}], http_auth=(username, password),
scheme=scheme, ca_certs=ca_certs, verify_certs=verify_certs)

# if no custom_mapping is supplied, use the default mapping
Expand Down Expand Up @@ -62,7 +89,6 @@ def __init__(
self.search_fields = search_fields
self.text_field = text_field
self.name_field = name_field
self.tag_fields = tag_fields
self.external_source_id_field = external_source_id_field
self.embedding_field = embedding_field
self.excluded_meta_data = excluded_meta_data
Expand Down