Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

SQuAD to DPR dataset converter #765

Merged
merged 5 commits into from
Feb 1, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 8 additions & 6 deletions haystack/document_store/base.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
import logging
from abc import abstractmethod, ABC
from pathlib import Path
from typing import Any, Optional, Dict, List, Union
from typing import Optional, Dict, List, Union

import numpy as np

from haystack import Document, Label, MultiLabel
from haystack.preprocessor.utils import eval_data_from_json, eval_data_from_jsonl, squad_json_to_jsonl
from haystack.preprocessor.preprocessor import PreProcessor

from haystack.preprocessor.utils import eval_data_from_json, eval_data_from_jsonl, squad_json_to_jsonl

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -64,7 +66,7 @@ def get_all_labels_aggregated(self,
all_labels = self.get_all_labels(index=index, filters=filters)

# Collect all answers to a question in a dict
question_ans_dict = {} # type: ignore
question_ans_dict: dict = {}
for l in all_labels:
# only aggregate labels with correct answers, as only those can be currently used in evaluation
if not l.is_correct_answer:
Expand Down Expand Up @@ -125,7 +127,7 @@ def get_document_count(self, filters: Optional[Dict[str, List[str]]] = None, ind

@abstractmethod
def query_by_embedding(self,
query_emb: List[float],
query_emb: np.ndarray,
filters: Optional[Optional[Dict[str, List[str]]]] = None,
top_k: int = 10,
index: Optional[str] = None,
Expand Down Expand Up @@ -198,6 +200,6 @@ def add_eval_data(self, filename: str, doc_index: str = "eval_document", label_i
logger.error("File needs to be in json or jsonl format.")

@abstractmethod
def delete_all_documents(self, index: str, filters: Optional[Dict[str, List[str]]] = None):
def delete_all_documents(self, index: Optional[str] = None, filters: Optional[Dict[str, List[str]]] = None):
pass

11 changes: 6 additions & 5 deletions haystack/document_store/elasticsearch.py
Original file line number Diff line number Diff line change
Expand Up @@ -522,7 +522,7 @@ def query(
elif custom_query: # substitute placeholder for query and filters for the custom_query template string
template = Template(custom_query)
# replace all "${query}" placeholder(s) with query
substitutions = {"query": query}
substitutions = {"query": f'"{query}"'}
# For each filter we got passed, we'll try to find & replace the corresponding placeholder in the template
# Example: filters={"years":[2018]} => replaces {$years} in custom_query with '[2018]'
if filters:
Expand Down Expand Up @@ -568,7 +568,7 @@ def query(
return documents

def query_by_embedding(self,
query_emb: np.array,
query_emb: np.ndarray,
filters: Optional[Dict[str, List[str]]] = None,
top_k: int = 10,
index: Optional[str] = None,
Expand Down Expand Up @@ -631,7 +631,7 @@ def query_by_embedding(self,
]
return documents

def _get_vector_similarity_query(self, query_emb: np.array, top_k: int):
def _get_vector_similarity_query(self, query_emb: np.ndarray, top_k: int):
"""
Generate Elasticsearch query for vector similarity.
"""
Expand Down Expand Up @@ -757,14 +757,15 @@ def update_embeddings(self, retriever: BaseRetriever, index: Optional[str] = Non

bulk(self.client, doc_updates, request_timeout=300, refresh=self.refresh_type)

def delete_all_documents(self, index: str, filters: Optional[Dict[str, List[str]]] = None):
def delete_all_documents(self, index: Optional[str] = None, filters: Optional[Dict[str, List[str]]] = None):
"""
Delete documents in an index. All documents are deleted if no filters are passed.

:param index: Index name to delete the document from.
:param filters: Optional filters to narrow down the documents to be deleted.
:return: None
"""
index = index or self.index
query: Dict[str, Any] = {"query": {}}
if filters:
filter_clause = []
Expand Down Expand Up @@ -848,7 +849,7 @@ def _create_document_index(self, index_name: str):
if not self.client.indices.exists(index=index_name):
raise e

def _get_vector_similarity_query(self, query_emb: np.array, top_k: int):
def _get_vector_similarity_query(self, query_emb: np.ndarray, top_k: int):
"""
Generate Elasticsearch query for vector similarity.
"""
Expand Down
27 changes: 17 additions & 10 deletions haystack/document_store/faiss.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ def __init__(
update_existing_documents: bool = False,
index: str = "document",
similarity: str = "dot_product",
embedding_field: str = "embedding",
**kwargs,
):
"""
Expand Down Expand Up @@ -72,6 +73,7 @@ def __init__(
:param index: Name of index in document store to use.
:param similarity: The similarity function used to compare document vectors. 'dot_product' is the default since it is
more performant with DPR embeddings. 'cosine' is recommended if you are using a Sentence BERT model.
:param embedding_field: Name of field containing an embedding vector.
"""
self.vector_dim = vector_dim

Expand All @@ -83,6 +85,7 @@ def __init__(
self.faiss_index.set_direct_map_type(faiss.DirectMap.Hashtable)

self.return_embedding = return_embedding
self.embedding_field = embedding_field
if similarity == "dot_product":
self.similarity = similarity
else:
Expand Down Expand Up @@ -139,8 +142,8 @@ def write_documents(
for i in range(0, len(document_objects), batch_size):
if add_vectors:
embeddings = [doc.embedding for doc in document_objects[i: i + batch_size]]
embeddings = np.array(embeddings, dtype="float32")
self.faiss_index.add(embeddings)
embeddings_to_index = np.array(embeddings, dtype="float32")
self.faiss_index.add(embeddings_to_index)

docs_to_write_in_sql = []
for doc in document_objects[i: i + batch_size]:
Expand All @@ -154,7 +157,7 @@ def write_documents(

def _create_document_field_map(self) -> Dict:
return {
self.index: "embedding",
self.index: self.embedding_field,
}

def update_embeddings(self, retriever: BaseRetriever, index: Optional[str] = None, batch_size: int = 10_000):
Expand Down Expand Up @@ -256,7 +259,9 @@ def get_documents_by_id(
doc.embedding = self.faiss_index.reconstruct(int(doc.meta["vector_id"]))
return documents

def train_index(self, documents: Optional[Union[List[dict], List[Document]]], embeddings: Optional[np.array] = None):
def train_index(
self, documents: Optional[Union[List[dict], List[Document]]], embeddings: Optional[np.ndarray] = None
):
"""
Some FAISS indices (e.g. IVF) require initial "training" on a sample of vectors before you can add your final vectors.
The train vectors should come from the same distribution as your final ones.
Expand All @@ -271,20 +276,22 @@ def train_index(self, documents: Optional[Union[List[dict], List[Document]]], em
raise ValueError("Either pass `documents` or `embeddings`. You passed both.")
if documents:
document_objects = [Document.from_dict(d) if isinstance(d, dict) else d for d in documents]
embeddings = [doc.embedding for doc in document_objects]
embeddings = np.array(embeddings, dtype="float32")
self.faiss_index.train(embeddings)
doc_embeddings = [doc.embedding for doc in document_objects]
embeddings_for_train = np.array(doc_embeddings, dtype="float32")
self.faiss_index.train(embeddings_for_train)
if embeddings:
self.faiss_index.train(embeddings)

def delete_all_documents(self, index=None):
def delete_all_documents(self, index: Optional[str] = None, filters: Optional[Dict[str, List[str]]] = None):
"""
Delete all documents from the document store.
"""
index = index or self.index
self.faiss_index.reset()
super().delete_all_documents(index=index)
super().delete_all_documents(index=index, filters=filters)

def query_by_embedding(self,
query_emb: np.array,
query_emb: np.ndarray,
filters: Optional[dict] = None,
top_k: int = 10,
index: Optional[str] = None,
Expand Down
14 changes: 7 additions & 7 deletions haystack/document_store/memory.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,15 @@
import logging
from collections import defaultdict
from copy import deepcopy
from typing import Dict, List, Optional, Union, Generator
from uuid import uuid4
from collections import defaultdict

from haystack.document_store.base import BaseDocumentStore
from haystack import Document, Label
from haystack.retriever.base import BaseRetriever

import numpy as np
from scipy.spatial.distance import cosine

import logging
from haystack import Document, Label
from haystack.document_store.base import BaseDocumentStore
from haystack.retriever.base import BaseRetriever

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -94,7 +94,7 @@ def get_documents_by_id(self, ids: List[str], index: Optional[str] = None) -> Li
return documents

def query_by_embedding(self,
query_emb: List[float],
query_emb: np.ndarray,
filters: Optional[Dict[str, List[str]]] = None,
top_k: int = 10,
index: Optional[str] = None,
Expand Down
Loading