Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

SQuAD to DPR dataset converter #765

Merged
merged 5 commits into from
Feb 1, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 8 additions & 6 deletions haystack/document_store/base.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
import logging
from abc import abstractmethod, ABC
from pathlib import Path
from typing import Any, Optional, Dict, List, Union
from typing import Optional, Dict, List, Union

import numpy as np

from haystack import Document, Label, MultiLabel
from haystack.preprocessor.utils import eval_data_from_json, eval_data_from_jsonl, squad_json_to_jsonl
from haystack.preprocessor.preprocessor import PreProcessor

from haystack.preprocessor.utils import eval_data_from_json, eval_data_from_jsonl, squad_json_to_jsonl

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -64,7 +66,7 @@ def get_all_labels_aggregated(self,
all_labels = self.get_all_labels(index=index, filters=filters)

# Collect all answers to a question in a dict
question_ans_dict = {} # type: ignore
question_ans_dict: dict = {}
for l in all_labels:
# only aggregate labels with correct answers, as only those can be currently used in evaluation
if not l.is_correct_answer:
Expand Down Expand Up @@ -125,7 +127,7 @@ def get_document_count(self, filters: Optional[Dict[str, List[str]]] = None, ind

@abstractmethod
def query_by_embedding(self,
query_emb: List[float],
query_emb: np.ndarray,
filters: Optional[Optional[Dict[str, List[str]]]] = None,
top_k: int = 10,
index: Optional[str] = None,
Expand Down Expand Up @@ -198,6 +200,6 @@ def add_eval_data(self, filename: str, doc_index: str = "eval_document", label_i
logger.error("File needs to be in json or jsonl format.")

@abstractmethod
def delete_all_documents(self, index: str, filters: Optional[Dict[str, List[str]]] = None):
def delete_all_documents(self, index: Optional[str] = None, filters: Optional[Dict[str, List[str]]] = None):
pass

11 changes: 6 additions & 5 deletions haystack/document_store/elasticsearch.py
Original file line number Diff line number Diff line change
Expand Up @@ -522,7 +522,7 @@ def query(
elif custom_query: # substitute placeholder for query and filters for the custom_query template string
template = Template(custom_query)
# replace all "${query}" placeholder(s) with query
substitutions = {"query": query}
substitutions = {"query": f'"{query}"'}
# For each filter we got passed, we'll try to find & replace the corresponding placeholder in the template
# Example: filters={"years":[2018]} => replaces {$years} in custom_query with '[2018]'
if filters:
Expand Down Expand Up @@ -568,7 +568,7 @@ def query(
return documents

def query_by_embedding(self,
query_emb: np.array,
query_emb: np.ndarray,
filters: Optional[Dict[str, List[str]]] = None,
top_k: int = 10,
index: Optional[str] = None,
Expand Down Expand Up @@ -631,7 +631,7 @@ def query_by_embedding(self,
]
return documents

def _get_vector_similarity_query(self, query_emb: np.array, top_k: int):
def _get_vector_similarity_query(self, query_emb: np.ndarray, top_k: int):
"""
Generate Elasticsearch query for vector similarity.
"""
Expand Down Expand Up @@ -757,14 +757,15 @@ def update_embeddings(self, retriever: BaseRetriever, index: Optional[str] = Non

bulk(self.client, doc_updates, request_timeout=300, refresh=self.refresh_type)

def delete_all_documents(self, index: str, filters: Optional[Dict[str, List[str]]] = None):
def delete_all_documents(self, index: Optional[str] = None, filters: Optional[Dict[str, List[str]]] = None):
"""
Delete documents in an index. All documents are deleted if no filters are passed.

:param index: Index name to delete the document from.
:param filters: Optional filters to narrow down the documents to be deleted.
:return: None
"""
index = index or self.index
query: Dict[str, Any] = {"query": {}}
if filters:
filter_clause = []
Expand Down Expand Up @@ -848,7 +849,7 @@ def _create_document_index(self, index_name: str):
if not self.client.indices.exists(index=index_name):
raise e

def _get_vector_similarity_query(self, query_emb: np.array, top_k: int):
def _get_vector_similarity_query(self, query_emb: np.ndarray, top_k: int):
"""
Generate Elasticsearch query for vector similarity.
"""
Expand Down
27 changes: 17 additions & 10 deletions haystack/document_store/faiss.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ def __init__(
update_existing_documents: bool = False,
index: str = "document",
similarity: str = "dot_product",
embedding_field: str = "embedding",
**kwargs,
):
"""
Expand Down Expand Up @@ -72,6 +73,7 @@ def __init__(
:param index: Name of index in document store to use.
:param similarity: The similarity function used to compare document vectors. 'dot_product' is the default since it is
more performant with DPR embeddings. 'cosine' is recommended if you are using a Sentence BERT model.
:param embedding_field: Name of field containing an embedding vector.
"""
self.vector_dim = vector_dim

Expand All @@ -83,6 +85,7 @@ def __init__(
self.faiss_index.set_direct_map_type(faiss.DirectMap.Hashtable)

self.return_embedding = return_embedding
self.embedding_field = embedding_field
if similarity == "dot_product":
self.similarity = similarity
else:
Expand Down Expand Up @@ -139,8 +142,8 @@ def write_documents(
for i in range(0, len(document_objects), batch_size):
if add_vectors:
embeddings = [doc.embedding for doc in document_objects[i: i + batch_size]]
embeddings = np.array(embeddings, dtype="float32")
self.faiss_index.add(embeddings)
embeddings_to_index = np.array(embeddings, dtype="float32")
self.faiss_index.add(embeddings_to_index)

docs_to_write_in_sql = []
for doc in document_objects[i: i + batch_size]:
Expand All @@ -154,7 +157,7 @@ def write_documents(

def _create_document_field_map(self) -> Dict:
return {
self.index: "embedding",
self.index: self.embedding_field,
}

def update_embeddings(self, retriever: BaseRetriever, index: Optional[str] = None, batch_size: int = 10_000):
Expand Down Expand Up @@ -256,7 +259,9 @@ def get_documents_by_id(
doc.embedding = self.faiss_index.reconstruct(int(doc.meta["vector_id"]))
return documents

def train_index(self, documents: Optional[Union[List[dict], List[Document]]], embeddings: Optional[np.array] = None):
def train_index(
self, documents: Optional[Union[List[dict], List[Document]]], embeddings: Optional[np.ndarray] = None
):
"""
Some FAISS indices (e.g. IVF) require initial "training" on a sample of vectors before you can add your final vectors.
The train vectors should come from the same distribution as your final ones.
Expand All @@ -271,20 +276,22 @@ def train_index(self, documents: Optional[Union[List[dict], List[Document]]], em
raise ValueError("Either pass `documents` or `embeddings`. You passed both.")
if documents:
document_objects = [Document.from_dict(d) if isinstance(d, dict) else d for d in documents]
embeddings = [doc.embedding for doc in document_objects]
embeddings = np.array(embeddings, dtype="float32")
self.faiss_index.train(embeddings)
doc_embeddings = [doc.embedding for doc in document_objects]
embeddings_for_train = np.array(doc_embeddings, dtype="float32")
self.faiss_index.train(embeddings_for_train)
if embeddings:
self.faiss_index.train(embeddings)

def delete_all_documents(self, index=None):
def delete_all_documents(self, index: Optional[str] = None, filters: Optional[Dict[str, List[str]]] = None):
"""
Delete all documents from the document store.
"""
index = index or self.index
self.faiss_index.reset()
super().delete_all_documents(index=index)
super().delete_all_documents(index=index, filters=filters)

def query_by_embedding(self,
query_emb: np.array,
query_emb: np.ndarray,
filters: Optional[dict] = None,
top_k: int = 10,
index: Optional[str] = None,
Expand Down
14 changes: 7 additions & 7 deletions haystack/document_store/memory.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,15 @@
import logging
from collections import defaultdict
from copy import deepcopy
from typing import Dict, List, Optional, Union, Generator
from uuid import uuid4
from collections import defaultdict

from haystack.document_store.base import BaseDocumentStore
from haystack import Document, Label
from haystack.retriever.base import BaseRetriever

import numpy as np
from scipy.spatial.distance import cosine

import logging
from haystack import Document, Label
from haystack.document_store.base import BaseDocumentStore
from haystack.retriever.base import BaseRetriever

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -94,7 +94,7 @@ def get_documents_by_id(self, ids: List[str], index: Optional[str] = None) -> Li
return documents

def query_by_embedding(self,
query_emb: List[float],
query_emb: np.ndarray,
filters: Optional[Dict[str, List[str]]] = None,
top_k: int = 10,
index: Optional[str] = None,
Expand Down
Loading