deepset-ai · tanaysoni · Jul 14, 2020 · Jul 10, 2020 · Jul 10, 2020 · Jul 10, 2020
diff --git a/haystack/database/base.py b/haystack/database/base.py
@@ -12,10 +12,9 @@ class Document(BaseModel):
         description="id for the source file the document was created from. In the case when a large file is divided "
         "across multiple Elasticsearch documents, this id can be used to reference original source file.",
     )
-    # name: Optional[str] = Field(None, description="Title of the document")
     question: Optional[str] = Field(None, description="Question text for FAQs.")
     query_score: Optional[float] = Field(None, description="Elasticsearch query score for a retrieved document")
-    meta: Dict[str, Any] = Field({}, description="")
+    meta: Dict[str, Any] = Field({}, description="Meta fields for a document like name, url, or author.")
     tags: Optional[Dict[str, Any]] = Field(None, description="Tags that allow filtering of the data")
 
 
@@ -30,8 +29,11 @@ def write_documents(self, documents: List[dict]):
         """
         Indexes documents for later queries.
 
-        :param documents: List of dictionaries in the format {"name": "<some-document-name>, "text": "<the-actual-text>"}.
-                          Optionally, further fields can be supplied depending on the child class.
+        :param documents: List of dictionaries.
+                          Default format: {"text": "<the-actual-text>"}
+                          Optionally: Include meta data via {"text": "<the-actual-text>",
+                          "meta":{"name": "<some-document-name>", "author": "somebody", ...}}
+                          It can be used for filtering and is accessible in the responses of the Finder.
 
         :return: None
         """

diff --git a/haystack/database/elasticsearch.py b/haystack/database/elasticsearch.py
@@ -117,9 +117,9 @@ def write_documents(self, documents: List[dict]):
         Indexes documents for later queries in Elasticsearch.
 
         :param documents: List of dictionaries.
-                          Default format: {"name": "<some-document-name>, "text": "<the-actual-text>"}
-                          Optionally: Include meta data via {"name": "<some-document-name>,
-                          "text": "<the-actual-text>", "meta":{"author": "somebody", ...}}
+                          Default format: {"text": "<the-actual-text>"}
+                          Optionally: Include meta data via {"text": "<the-actual-text>",
+                          "meta":{"name": "<some-document-name>", "author": "somebody", ...}}
                           It can be used for filtering and is accessible in the responses of the Finder.
                           Advanced: If you are using your own Elasticsearch mapping, the key names in the dictionary
                           should be changed to what you have set for self.text_field and self.name_field .

diff --git a/haystack/database/memory.py b/haystack/database/memory.py
@@ -18,9 +18,9 @@ def write_documents(self, documents: List[dict]):
         """
         Indexes documents for later queries.
 
-        :param documents: List of dictionaries in the format {"name": "<some-document-name>, "text": "<the-actual-text>"}.
+        :param documents: List of dictionaries in the format {"text": "<the-actual-text>"}.
                           Optionally, you can also supply "tags": ["one-tag", "another-one"]
-                          or additional meta data via "meta": {"author": "someone", "url":"some-url" ...}
+                          or additional meta data via "meta": {"name": "<some-document-name>", "author": "someone", "url":"some-url" ...}
 
         :return: None
         """
@@ -30,19 +30,21 @@ def write_documents(self, documents: List[dict]):
             return
 
         for document in documents:
-            name = document.get("name", None)
-            text = document.get("text", None)
+            text = document["text"]
+            if "meta" not in document.keys():
+                document["meta"] = {}
+            for k, v in document.items():  # put additional fields other than text in meta
+                if k not in ["text", "meta", "tags"]:
+                    document["meta"][k] = v
 
-            if name is None or text is None:
-                continue
+            if not text:
+                raise Exception("A document cannot have empty text field.")
 
-            signature = name + text
-
-            hash = hashlib.md5(signature.encode("utf-8")).hexdigest()
+            hash = hashlib.md5(text.encode("utf-8")).hexdigest()
 
             self.docs[hash] = document
 
-            tags = document.get('tags', [])
+            tags = document.get("tags", [])
 
             self._map_tags_to_ids(hash, tags)
 
@@ -65,12 +67,12 @@ def get_document_by_id(self, id: str) -> Document:
         document = self._convert_memory_hit_to_document(self.docs[id], doc_id=id)
         return document
 
-    def _convert_memory_hit_to_document(self, hit: Tuple[Any, Any], doc_id: Optional[str] = None) -> Document:
+    def _convert_memory_hit_to_document(self, hit: Dict[str, Any], doc_id: Optional[str] = None) -> Document:
         document = Document(
             id=doc_id,
-            text=hit[0].get('text', None),
-            meta=hit[0].get('meta', {}),
-            query_score=hit[1],
+            text=hit.get("text", None),
+            meta=hit.get("meta", {}),
+            query_score=hit.get("query_score", None),
         )
         return document
 
@@ -89,14 +91,21 @@ def query_by_embedding(self,
                                       "use a different DocumentStore (e.g. ElasticsearchDocumentStore).")
 
         if self.embedding_field is None:
-            return []
+            raise Exception(
+                "To use query_by_embedding() 'embedding field' must "
+                "be specified when initializing the document store."
+            )
 
         if query_emb is None:
             return []
 
-        candidate_docs = [self._convert_memory_hit_to_document(
-            (doc, dot(query_emb, doc[self.embedding_field]) / (norm(query_emb) * norm(doc[self.embedding_field]))), doc_id=idx) for idx, doc in self.docs.items()
-        ]
+        candidate_docs = []
+        for idx, hit in self.docs.items():
+            hit["query_score"] = dot(query_emb, hit[self.embedding_field]) / (
+                norm(query_emb) * norm(hit[self.embedding_field])
+            )
+            _doc = self._convert_memory_hit_to_document(hit=hit, doc_id=idx)
+            candidate_docs.append(_doc)
 
         return sorted(candidate_docs, key=lambda x: x.query_score, reverse=True)[0:top_k]
 
@@ -139,4 +148,7 @@ def get_document_count(self) -> int:
         return len(self.docs.items())
 
     def get_all_documents(self) -> List[Document]:
-        return [Document(id=item[0], text=item[1]['text'], name=item[1]['name'], meta=item[1].get('meta', {})) for item in self.docs.items()]
+        return [
+            Document(id=item[0], text=item[1]["text"], meta=item[1].get("meta", {}))
+            for item in self.docs.items()
+        ]
diff --git a/haystack/database/sql.py b/haystack/database/sql.py
@@ -20,7 +20,6 @@ class ORMBase(Base):
 class Document(ORMBase):
     __tablename__ = "document"
 
-    name = Column(String)
     text = Column(String)
     meta_data = Column(PickleType)
 
@@ -96,14 +95,19 @@ def write_documents(self, documents: List[dict]):
         """
         Indexes documents for later queries.
 
-        :param documents: List of dictionaries in the format {"name": "<some-document-name>, "text": "<the-actual-text>"}.
+        :param documents: List of dictionaries in the format {"text": "<the-actual-text>"}.
                           Optionally, you can also supply meta data via "meta": {"author": "someone", "url":"some-url" ...}
 
         :return: None
         """
 
         for doc in documents:
-            row = Document(name=doc["name"], text=doc["text"], meta_data=doc.get("meta", {}))
+            if "meta" not in doc.keys():
+                doc["meta"] = {}
+            for k, v in doc.items():  # put additional fields other than text in meta
+                if k not in ["text", "meta", "tags"]:
+                    doc["meta"][k] = v
+            row = Document(text=doc["text"], meta_data=doc.get("meta", {}))
             self.session.add(row)
         self.session.commit()
 

diff --git a/haystack/indexing/utils.py b/haystack/indexing/utils.py
@@ -48,9 +48,9 @@ def convert_files_to_dicts(dir_path: str, clean_func: Optional[Callable] = None,
             for para in text.split("\n\n"):
                 if not para.strip():  # skip empty paragraphs
                     continue
-                documents.append({"name": path.name, "text": para})
+                documents.append({"text": para, "meta": {"name": path.name}})
         else:
-            documents.append({"name": path.name, "text": text})
+            documents.append({"text": text, "meta": {"name": path.name}})
 
     return documents
 

diff --git a/test/conftest.py b/test/conftest.py
@@ -54,9 +54,10 @@ def xpdf_fixture():
 @pytest.fixture()
 def test_docs_xs():
     return [
-        {"name": "filename1", "text": "My name is Carla and I live in Berlin", "meta": {"meta_field": "test1"}},
-        {"name": "filename2", "text": "My name is Paul and I live in New York", "meta": {"meta_field": "test2"}},
-        {"name": "filename3", "text": "My name is Christelle and I live in Paris", "meta": {"meta_field": "test3"}}
+        {"text": "My name is Carla and I live in Berlin", "meta": {"meta_field": "test1", "name": "filename1"}},
+        {"text": "My name is Paul and I live in New York", "meta": {"meta_field": "test2", "name": "filename2"}},
+        {"text": "My name is Christelle and I live in Paris", "meta_field": "test3", "meta": {"name": "filename3"}}
+        # last doc has meta_field at the top level for backward compatibility
     ]
 
 

diff --git a/test/test_db.py b/test/test_db.py
@@ -1,27 +1,12 @@
-from time import sleep
-
-from haystack.database.elasticsearch import ElasticsearchDocumentStore
-from haystack.database.sql import SQLDocumentStore
-from haystack.indexing.utils import convert_files_to_dicts
-
-
-def test_sql_write_read():
-    sql_document_store = SQLDocumentStore()
-    documents = convert_files_to_dicts(dir_path="samples/docs")
-    sql_document_store.write_documents(documents)
-    documents = sql_document_store.get_all_documents()
-    assert len(documents) == 2
-    doc = sql_document_store.get_document_by_id("1")
-    assert doc.id
-    assert doc.text
-
-
-def test_elasticsearch_write_read(elasticsearch_fixture):
-    document_store = ElasticsearchDocumentStore()
-    documents = convert_files_to_dicts(dir_path="samples/docs")
-    document_store.write_documents(documents)
-    sleep(2)  # wait for documents to be available for query
-    documents = document_store.get_all_documents()
-    assert len(documents) == 2
-    assert documents[0].id
-    assert documents[0].text
+from haystack.database.base import Document
+
+
+def test_get_all_documents(document_store_with_docs):
+    documents = document_store_with_docs.get_all_documents()
+    assert all(isinstance(d, Document) for d in documents)
+    assert len(documents) == 3
+    assert {d.meta["name"] for d in documents} == {"filename1", "filename2", "filename3"}
+    assert {d.meta["meta_field"] for d in documents} == {"test1", "test2", "test3"}
+    doc = document_store_with_docs.get_document_by_id(documents[0].id)
+    assert doc.id == documents[0].id
+    assert doc.text == documents[0].text
diff --git a/test/test_dpr_retriever.py b/test/test_dpr_retriever.py
@@ -1,20 +1,8 @@
-from haystack.retriever.dpr_utils import download_dpr
-
-def test_dpr_passage_encoder():
-    from haystack.retriever.dense import DensePassageRetriever
-
-    passage = ["Let's encode this one"]
-    retriever = DensePassageRetriever(document_store=None, embedding_model="dpr-bert-base-nq", gpu=False)
-    emb = retriever.embed_passages(passage)[0]
-    assert(emb.shape[0] == 768)
-    assert(emb[0]-0.52872 < 0.001)
+from haystack.database.memory import InMemoryDocumentStore
+from haystack.retriever.dense import DensePassageRetriever
 
 
 def test_dpr_inmemory_retrieval():
-
-    from haystack.database.memory import InMemoryDocumentStore
-    from haystack.retriever.dense import DensePassageRetriever
-
     document_store = InMemoryDocumentStore(embedding_field="embedding")
 
     documents = [
@@ -27,8 +15,13 @@ def test_dpr_inmemory_retrieval():
 
     embedded = []
     for doc in documents:
-        doc['embedding'] = retriever.embed_passages([doc['text']])[0]
+        embedding = retriever.embed_passages([doc['text']])[0]
+        doc['embedding'] = embedding
         embedded.append(doc)
+
+        assert (embedding.shape[0] == 768)
+        assert (embedding[0] - 0.52872 < 0.001)
+
     document_store.write_documents(embedded)
 
     res = retriever.retrieve(query="Which philosopher attacked Schopenhauer?")

diff --git a/test/test_faq_retriever.py b/test/test_faq_retriever.py
@@ -9,17 +9,17 @@ def test_faq_retriever_in_memory_store():
     document_store = InMemoryDocumentStore(embedding_field="embedding")
 
     documents = [
-        {'name': 'How to test this library?', 'text': 'By running tox in the command line!', 'meta': {'question': 'How to test this library?'}},
-        {'name': 'blah blah blah', 'text': 'By running tox in the command line!', 'meta': {'question': 'blah blah blah'}},
-        {'name': 'blah blah blah', 'text': 'By running tox in the command line!', 'meta': {'question': 'blah blah blah'}},
-        {'name': 'blah blah blah', 'text': 'By running tox in the command line!', 'meta': {'question': 'blah blah blah'}},
-        {'name': 'blah blah blah', 'text': 'By running tox in the command line!', 'meta': {'question': 'blah blah blah'}},
-        {'name': 'blah blah blah', 'text': 'By running tox in the command line!', 'meta': {'question': 'blah blah blah'}},
-        {'name': 'blah blah blah', 'text': 'By running tox in the command line!', 'meta': {'question': 'blah blah blah'}},
-        {'name': 'blah blah blah', 'text': 'By running tox in the command line!', 'meta': {'question': 'blah blah blah'}},
-        {'name': 'blah blah blah', 'text': 'By running tox in the command line!', 'meta': {'question': 'blah blah blah'}},
-        {'name': 'blah blah blah', 'text': 'By running tox in the command line!', 'meta': {'question': 'blah blah blah'}},
-        {'name': 'blah blah blah', 'text': 'By running tox in the command line!', 'meta': {'question': 'blah blah blah'}},
+        {'text': 'By running tox in the command line!', 'meta': {'name': 'How to test this library?', 'question': 'How to test this library?'}},
+        {'text': 'By running tox in the command line!', 'meta': {'name': 'blah blah blah', 'question': 'blah blah blah'}},
+        {'text': 'By running tox in the command line!', 'meta': {'name': 'blah blah blah', 'question': 'blah blah blah'}},
+        {'text': 'By running tox in the command line!', 'meta': {'name': 'blah blah blah', 'question': 'blah blah blah'}},
+        {'text': 'By running tox in the command line!', 'meta': {'name': 'blah blah blah', 'question': 'blah blah blah'}},
+        {'text': 'By running tox in the command line!', 'meta': {'name': 'blah blah blah', 'question': 'blah blah blah'}},
+        {'text': 'By running tox in the command line!', 'meta': {'name': 'blah blah blah', 'question': 'blah blah blah'}},
+        {'text': 'By running tox in the command line!', 'meta': {'name': 'blah blah blah', 'question': 'blah blah blah'}},
+        {'text': 'By running tox in the command line!', 'meta': {'name': 'blah blah blah', 'question': 'blah blah blah'}},
+        {'text': 'By running tox in the command line!', 'meta': {'name': 'blah blah blah', 'question': 'blah blah blah'}},
+        {'text': 'By running tox in the command line!', 'meta': {'name': 'blah blah blah', 'question': 'blah blah blah'}},
     ]
 
     retriever = EmbeddingRetriever(document_store=document_store, embedding_model="deepset/sentence_bert", gpu=False)