Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Move document_name attribute to meta #217

Merged
merged 7 commits into from
Jul 14, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
10 changes: 6 additions & 4 deletions haystack/database/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,9 @@ class Document(BaseModel):
description="id for the source file the document was created from. In the case when a large file is divided "
"across multiple Elasticsearch documents, this id can be used to reference original source file.",
)
# name: Optional[str] = Field(None, description="Title of the document")
question: Optional[str] = Field(None, description="Question text for FAQs.")
query_score: Optional[float] = Field(None, description="Elasticsearch query score for a retrieved document")
meta: Dict[str, Any] = Field({}, description="")
meta: Dict[str, Any] = Field({}, description="Meta fields for a document like name, url, or author.")
tags: Optional[Dict[str, Any]] = Field(None, description="Tags that allow filtering of the data")


Expand All @@ -30,8 +29,11 @@ def write_documents(self, documents: List[dict]):
"""
Indexes documents for later queries.

:param documents: List of dictionaries in the format {"name": "<some-document-name>, "text": "<the-actual-text>"}.
Optionally, further fields can be supplied depending on the child class.
:param documents: List of dictionaries.
Default format: {"text": "<the-actual-text>"}
Optionally: Include meta data via {"text": "<the-actual-text>",
"meta": {"name": "<some-document-name>", "author": "somebody", ...}}
It can be used for filtering and is accessible in the responses of the Finder.

:return: None
"""
Expand Down
6 changes: 3 additions & 3 deletions haystack/database/elasticsearch.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,9 +117,9 @@ def write_documents(self, documents: List[dict]):
Indexes documents for later queries in Elasticsearch.

:param documents: List of dictionaries.
Default format: {"name": "<some-document-name>, "text": "<the-actual-text>"}
Optionally: Include meta data via {"name": "<some-document-name>,
"text": "<the-actual-text>", "meta":{"author": "somebody", ...}}
Default format: {"text": "<the-actual-text>"}
Optionally: Include meta data via {"text": "<the-actual-text>",
"meta": {"name": "<some-document-name>", "author": "somebody", ...}}
It can be used for filtering and is accessible in the responses of the Finder.
Advanced: If you are using your own Elasticsearch mapping, the key names in the dictionary
should be changed to what you have set for self.text_field and self.name_field.
Expand Down
50 changes: 31 additions & 19 deletions haystack/database/memory.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,9 @@ def write_documents(self, documents: List[dict]):
"""
Indexes documents for later queries.

:param documents: List of dictionaries in the format {"name": "<some-document-name>, "text": "<the-actual-text>"}.
:param documents: List of dictionaries in the format {"text": "<the-actual-text>"}.
Optionally, you can also supply "tags": ["one-tag", "another-one"]
or additional meta data via "meta": {"author": "someone", "url":"some-url" ...}
or additional meta data via "meta": {"name": "<some-document-name>", "author": "someone", "url": "some-url", ...}

:return: None
"""
Expand All @@ -30,19 +30,21 @@ def write_documents(self, documents: List[dict]):
return

for document in documents:
name = document.get("name", None)
text = document.get("text", None)
text = document["text"]
if "meta" not in document.keys():
document["meta"] = {}
for k, v in document.items(): # put additional fields other than text in meta
if k not in ["text", "meta", "tags"]:
document["meta"][k] = v

if name is None or text is None:
continue
if not text:
raise Exception("A document cannot have empty text field.")

signature = name + text

hash = hashlib.md5(signature.encode("utf-8")).hexdigest()
hash = hashlib.md5(text.encode("utf-8")).hexdigest()

self.docs[hash] = document

tags = document.get('tags', [])
tags = document.get("tags", [])

self._map_tags_to_ids(hash, tags)

Expand All @@ -65,12 +67,12 @@ def get_document_by_id(self, id: str) -> Document:
document = self._convert_memory_hit_to_document(self.docs[id], doc_id=id)
return document

def _convert_memory_hit_to_document(self, hit: Tuple[Any, Any], doc_id: Optional[str] = None) -> Document:
def _convert_memory_hit_to_document(self, hit: Dict[str, Any], doc_id: Optional[str] = None) -> Document:
document = Document(
id=doc_id,
text=hit[0].get('text', None),
meta=hit[0].get('meta', {}),
query_score=hit[1],
text=hit.get("text", None),
meta=hit.get("meta", {}),
query_score=hit.get("query_score", None),
)
return document

Expand All @@ -89,14 +91,21 @@ def query_by_embedding(self,
"use a different DocumentStore (e.g. ElasticsearchDocumentStore).")

if self.embedding_field is None:
return []
raise Exception(
"To use query_by_embedding() 'embedding field' must "
"be specified when initializing the document store."
)

if query_emb is None:
return []

candidate_docs = [self._convert_memory_hit_to_document(
(doc, dot(query_emb, doc[self.embedding_field]) / (norm(query_emb) * norm(doc[self.embedding_field]))), doc_id=idx) for idx, doc in self.docs.items()
]
candidate_docs = []
for idx, hit in self.docs.items():
hit["query_score"] = dot(query_emb, hit[self.embedding_field]) / (
norm(query_emb) * norm(hit[self.embedding_field])
)
_doc = self._convert_memory_hit_to_document(hit=hit, doc_id=idx)
candidate_docs.append(_doc)

return sorted(candidate_docs, key=lambda x: x.query_score, reverse=True)[0:top_k]

Expand Down Expand Up @@ -139,4 +148,7 @@ def get_document_count(self) -> int:
return len(self.docs.items())

def get_all_documents(self) -> List[Document]:
return [Document(id=item[0], text=item[1]['text'], name=item[1]['name'], meta=item[1].get('meta', {})) for item in self.docs.items()]
return [
Document(id=item[0], text=item[1]["text"], meta=item[1].get("meta", {}))
for item in self.docs.items()
]
10 changes: 7 additions & 3 deletions haystack/database/sql.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@ class ORMBase(Base):
class Document(ORMBase):
__tablename__ = "document"

name = Column(String)
text = Column(String)
meta_data = Column(PickleType)

Expand Down Expand Up @@ -96,14 +95,19 @@ def write_documents(self, documents: List[dict]):
"""
Indexes documents for later queries.

:param documents: List of dictionaries in the format {"name": "<some-document-name>, "text": "<the-actual-text>"}.
:param documents: List of dictionaries in the format {"text": "<the-actual-text>"}.
Optionally, you can also supply meta data via "meta": {"author": "someone", "url":"some-url" ...}

:return: None
"""

for doc in documents:
row = Document(name=doc["name"], text=doc["text"], meta_data=doc.get("meta", {}))
if "meta" not in doc.keys():
doc["meta"] = {}
for k, v in doc.items(): # put additional fields other than text in meta
if k not in ["text", "meta", "tags"]:
doc["meta"][k] = v
row = Document(text=doc["text"], meta_data=doc.get("meta", {}))
self.session.add(row)
self.session.commit()

Expand Down
4 changes: 2 additions & 2 deletions haystack/indexing/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,9 +48,9 @@ def convert_files_to_dicts(dir_path: str, clean_func: Optional[Callable] = None,
for para in text.split("\n\n"):
if not para.strip(): # skip empty paragraphs
continue
documents.append({"name": path.name, "text": para})
documents.append({"text": para, "meta": {"name": path.name}})
else:
documents.append({"name": path.name, "text": text})
documents.append({"text": text, "meta": {"name": path.name}})

return documents

Expand Down
7 changes: 4 additions & 3 deletions test/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,9 +54,10 @@ def xpdf_fixture():
@pytest.fixture()
def test_docs_xs():
return [
{"name": "filename1", "text": "My name is Carla and I live in Berlin", "meta": {"meta_field": "test1"}},
{"name": "filename2", "text": "My name is Paul and I live in New York", "meta": {"meta_field": "test2"}},
{"name": "filename3", "text": "My name is Christelle and I live in Paris", "meta": {"meta_field": "test3"}}
{"text": "My name is Carla and I live in Berlin", "meta": {"meta_field": "test1", "name": "filename1"}},
{"text": "My name is Paul and I live in New York", "meta": {"meta_field": "test2", "name": "filename2"}},
{"text": "My name is Christelle and I live in Paris", "meta_field": "test3", "meta": {"name": "filename3"}}
# last doc has meta_field at the top level for backward compatibility
]


Expand Down
39 changes: 12 additions & 27 deletions test/test_db.py
Original file line number Diff line number Diff line change
@@ -1,27 +1,12 @@
from time import sleep

from haystack.database.elasticsearch import ElasticsearchDocumentStore
from haystack.database.sql import SQLDocumentStore
from haystack.indexing.utils import convert_files_to_dicts


def test_sql_write_read():
sql_document_store = SQLDocumentStore()
documents = convert_files_to_dicts(dir_path="samples/docs")
sql_document_store.write_documents(documents)
documents = sql_document_store.get_all_documents()
assert len(documents) == 2
doc = sql_document_store.get_document_by_id("1")
assert doc.id
assert doc.text


def test_elasticsearch_write_read(elasticsearch_fixture):
document_store = ElasticsearchDocumentStore()
documents = convert_files_to_dicts(dir_path="samples/docs")
document_store.write_documents(documents)
sleep(2) # wait for documents to be available for query
documents = document_store.get_all_documents()
assert len(documents) == 2
assert documents[0].id
assert documents[0].text
from haystack.database.base import Document


def test_get_all_documents(document_store_with_docs):
documents = document_store_with_docs.get_all_documents()
assert all(isinstance(d, Document) for d in documents)
assert len(documents) == 3
assert {d.meta["name"] for d in documents} == {"filename1", "filename2", "filename3"}
assert {d.meta["meta_field"] for d in documents} == {"test1", "test2", "test3"}
doc = document_store_with_docs.get_document_by_id(documents[0].id)
assert doc.id == documents[0].id
assert doc.text == documents[0].text
23 changes: 8 additions & 15 deletions test/test_dpr_retriever.py
Original file line number Diff line number Diff line change
@@ -1,20 +1,8 @@
from haystack.retriever.dpr_utils import download_dpr

def test_dpr_passage_encoder():
from haystack.retriever.dense import DensePassageRetriever

passage = ["Let's encode this one"]
retriever = DensePassageRetriever(document_store=None, embedding_model="dpr-bert-base-nq", gpu=False)
emb = retriever.embed_passages(passage)[0]
assert(emb.shape[0] == 768)
assert(emb[0]-0.52872 < 0.001)
from haystack.database.memory import InMemoryDocumentStore
from haystack.retriever.dense import DensePassageRetriever


def test_dpr_inmemory_retrieval():

from haystack.database.memory import InMemoryDocumentStore
from haystack.retriever.dense import DensePassageRetriever

document_store = InMemoryDocumentStore(embedding_field="embedding")

documents = [
Expand All @@ -27,8 +15,13 @@ def test_dpr_inmemory_retrieval():

embedded = []
for doc in documents:
doc['embedding'] = retriever.embed_passages([doc['text']])[0]
embedding = retriever.embed_passages([doc['text']])[0]
doc['embedding'] = embedding
embedded.append(doc)

assert (embedding.shape[0] == 768)
assert (embedding[0] - 0.52872 < 0.001)

document_store.write_documents(embedded)

res = retriever.retrieve(query="Which philosopher attacked Schopenhauer?")
Expand Down
22 changes: 11 additions & 11 deletions test/test_faq_retriever.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,17 +9,17 @@ def test_faq_retriever_in_memory_store():
document_store = InMemoryDocumentStore(embedding_field="embedding")

documents = [
{'name': 'How to test this library?', 'text': 'By running tox in the command line!', 'meta': {'question': 'How to test this library?'}},
{'name': 'blah blah blah', 'text': 'By running tox in the command line!', 'meta': {'question': 'blah blah blah'}},
{'name': 'blah blah blah', 'text': 'By running tox in the command line!', 'meta': {'question': 'blah blah blah'}},
{'name': 'blah blah blah', 'text': 'By running tox in the command line!', 'meta': {'question': 'blah blah blah'}},
{'name': 'blah blah blah', 'text': 'By running tox in the command line!', 'meta': {'question': 'blah blah blah'}},
{'name': 'blah blah blah', 'text': 'By running tox in the command line!', 'meta': {'question': 'blah blah blah'}},
{'name': 'blah blah blah', 'text': 'By running tox in the command line!', 'meta': {'question': 'blah blah blah'}},
{'name': 'blah blah blah', 'text': 'By running tox in the command line!', 'meta': {'question': 'blah blah blah'}},
{'name': 'blah blah blah', 'text': 'By running tox in the command line!', 'meta': {'question': 'blah blah blah'}},
{'name': 'blah blah blah', 'text': 'By running tox in the command line!', 'meta': {'question': 'blah blah blah'}},
{'name': 'blah blah blah', 'text': 'By running tox in the command line!', 'meta': {'question': 'blah blah blah'}},
{'text': 'By running tox in the command line!', 'meta': {'name': 'How to test this library?', 'question': 'How to test this library?'}},
{'text': 'By running tox in the command line!', 'meta': {'name': 'blah blah blah', 'question': 'blah blah blah'}},
{'text': 'By running tox in the command line!', 'meta': {'name': 'blah blah blah', 'question': 'blah blah blah'}},
{'text': 'By running tox in the command line!', 'meta': {'name': 'blah blah blah', 'question': 'blah blah blah'}},
{'text': 'By running tox in the command line!', 'meta': {'name': 'blah blah blah', 'question': 'blah blah blah'}},
{'text': 'By running tox in the command line!', 'meta': {'name': 'blah blah blah', 'question': 'blah blah blah'}},
{'text': 'By running tox in the command line!', 'meta': {'name': 'blah blah blah', 'question': 'blah blah blah'}},
{'text': 'By running tox in the command line!', 'meta': {'name': 'blah blah blah', 'question': 'blah blah blah'}},
{'text': 'By running tox in the command line!', 'meta': {'name': 'blah blah blah', 'question': 'blah blah blah'}},
{'text': 'By running tox in the command line!', 'meta': {'name': 'blah blah blah', 'question': 'blah blah blah'}},
{'text': 'By running tox in the command line!', 'meta': {'name': 'blah blah blah', 'question': 'blah blah blah'}},
]

retriever = EmbeddingRetriever(document_store=document_store, embedding_model="deepset/sentence_bert", gpu=False)
Expand Down