Cleanup Pytest Fixtures #639

Merged · 9 commits · Dec 14, 2020
10 changes: 5 additions & 5 deletions .github/workflows/ci.yml
@@ -9,7 +9,7 @@ on:
jobs:
  build:

-    runs-on: ubuntu-latest
+    runs-on: ubuntu-20.04

    steps:
    - uses: actions/checkout@v2
@@ -39,14 +39,14 @@ jobs:
        pip install -r requirements.txt
        pip install -e .

-    - name: Run Pytest without pipeline marker
-      run: cd test && pytest -m "not pipeline"
+    - name: Run Pytest without generator/pipeline marker
+      run: cd test && pytest -m "not pipeline and not generator"

    # - name: Stop Containers
    #   run: docker rm -f `docker ps -a -q`

-    - name: Run pytest with pipeline marker
-      run: cd test && pytest -m pipeline
+    - name: Run pytest with generator/pipeline marker
+      run: cd test && pytest -m "pipeline or generator"

    - name: Test with mypy
      run: |
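Note on the marker expressions above: the two CI steps partition the suite, running everything except pipeline/generator tests first and the marked tests second. For pytest -m to select on custom markers without warnings, the markers must be registered somewhere; that registration is not part of this diff. A minimal sketch of one way to do it, via the pytest_configure hook in conftest.py (an assumption for illustration, not the repo's actual setup):

def pytest_configure(config):
    # Register the custom markers used by `pytest -m` in CI.
    config.addinivalue_line("markers", "pipeline: tests that build and run full pipelines")
    config.addinivalue_line("markers", "generator: tests that exercise answer generators")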
119 changes: 43 additions & 76 deletions test/conftest.py
@@ -117,49 +117,6 @@ def rag_generator():
    )


-@pytest.fixture()
-def faiss_document_store():
-    if os.path.exists("haystack_test_faiss.db"):
-        os.remove("haystack_test_faiss.db")
-    document_store = FAISSDocumentStore(
-        sql_url="sqlite:///haystack_test_faiss.db",
-        return_embedding=True
-    )
-    yield document_store
-    document_store.faiss_index.reset()
-
-
-@pytest.fixture()
-def inmemory_document_store():
-    return InMemoryDocumentStore(return_embedding=True)
-
-
-@pytest.fixture()
-def dpr_retriever(faiss_document_store):
-    return DensePassageRetriever(
-        document_store=faiss_document_store,
-        query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
-        passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
-        use_gpu=False,
-        embed_title=True,
-        use_fast_tokenizers=True
-    )
-
-
-@pytest.fixture()
-def embedding_retriever(faiss_document_store):
-    return EmbeddingRetriever(
-        document_store=faiss_document_store,
-        embedding_model="deepset/sentence_bert",
-        use_gpu=False
-    )
-
-
-@pytest.fixture()
-def tfidf_retriever(inmemory_document_store):
-    return TfidfRetriever(document_store=inmemory_document_store)
@pytest.fixture(scope="module")
def test_docs_xs():
    return [
@@ -225,19 +182,6 @@ def no_answer_prediction(no_answer_reader, test_docs_xs):
    return prediction


-@pytest.fixture(params=["elasticsearch", "faiss", "memory", "sql"])
-def document_store_with_docs(
-        request, test_docs_xs, elasticsearch_fixture, faiss_document_store, inmemory_document_store):
-    document_store = get_document_store(request.param, faiss_document_store, inmemory_document_store)
-    document_store.write_documents(test_docs_xs)
-    return document_store
-
-
-@pytest.fixture(params=["elasticsearch", "faiss", "memory", "sql"])
-def document_store(request, test_docs_xs, elasticsearch_fixture, faiss_document_store, inmemory_document_store):
-    return get_document_store(request.param, faiss_document_store, inmemory_document_store)

@pytest.fixture(params=["es_filter_only", "elasticsearch", "dpr", "embedding", "tfidf"])
def retriever(request, document_store):
return get_retriever(request.param, document_store)
Expand All @@ -248,26 +192,6 @@ def retriever_with_docs(request, document_store_with_docs):
return get_retriever(request.param, document_store_with_docs)


-def get_document_store(document_store_type, faiss_document_store, inmemory_document_store):
-    if document_store_type == "sql":
-        if os.path.exists("haystack_test.db"):
-            os.remove("haystack_test.db")
-        document_store = SQLDocumentStore(url="sqlite:///haystack_test.db")
-    elif document_store_type == "memory":
-        document_store = inmemory_document_store
-    elif document_store_type == "elasticsearch":
-        # make sure we start from a fresh index
-        client = Elasticsearch()
-        client.indices.delete(index='haystack_test*', ignore=[404])
-        document_store = ElasticsearchDocumentStore(index="haystack_test", return_embedding=True)
-    elif document_store_type == "faiss":
-        document_store = faiss_document_store
-    else:
-        raise Exception(f"No document store fixture for '{document_store_type}'")
-
-    return document_store


def get_retriever(retriever_type, document_store):

    if retriever_type == "dpr":
@@ -291,3 +215,46 @@ def get_retriever(retriever_type, document_store):
        raise Exception(f"No retriever fixture for '{retriever_type}'")

    return retriever


@pytest.fixture(params=["elasticsearch", "faiss", "memory", "sql"])
def document_store_with_docs(request, test_docs_xs):
document_store = get_document_store(request.param)
document_store.write_documents(test_docs_xs)
yield document_store
if request.param == "faiss":
document_store.faiss_index.reset()


@pytest.fixture(params=["elasticsearch", "faiss", "memory", "sql"])
def document_store(request, test_docs_xs):
document_store = get_document_store(request.param)
yield document_store
if request.param == "faiss":
document_store.faiss_index.reset()
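The fixtures above switch from plain return values to yield so that FAISS state is reset after each test, whichever way the test exits. A self-contained sketch of the yield-fixture pattern (toy resource, illustrative names):

import pytest

@pytest.fixture
def resource():
    handle = {"open": True}   # setup
    yield handle              # the test body runs here
    handle["open"] = False    # teardown runs afterwards, even if the test failed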


+def get_document_store(document_store_type):
+    if document_store_type == "sql":
+        if os.path.exists("haystack_test.db"):
+            os.remove("haystack_test.db")
+        document_store = SQLDocumentStore(url="sqlite:///haystack_test.db")
+    elif document_store_type == "memory":
+        document_store = InMemoryDocumentStore(return_embedding=True)
+    elif document_store_type == "elasticsearch":
+        # make sure we start from a fresh index
+        client = Elasticsearch()
+        client.indices.delete(index='haystack_test*', ignore=[404])
+        document_store = ElasticsearchDocumentStore(index="haystack_test", return_embedding=True)
+    elif document_store_type == "faiss":
+        if os.path.exists("haystack_test_faiss.db"):
+            os.remove("haystack_test_faiss.db")
+        document_store = FAISSDocumentStore(
+            sql_url="sqlite:///haystack_test_faiss.db",
+            return_embedding=True
+        )
+        return document_store
+    else:
+        raise Exception(f"No document store fixture for '{document_store_type}'")
+
+    return document_store
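Because document_store and document_store_with_docs are parametrized fixtures, a test normally runs once per backend; a test that only makes sense for one backend can pin the fixture with indirect parametrization, which routes the value to the fixture's request.param instead of the test function. A minimal runnable sketch of the mechanism (illustrative names, not the Haystack stores):

import pytest

@pytest.fixture(params=["memory", "sql"])
def document_store(request):
    return {"backend": request.param}   # stand-in for get_document_store(request.param)

def test_runs_per_backend(document_store):
    # without narrowing, pytest generates one test per fixture param
    assert document_store["backend"] in ("memory", "sql")

@pytest.mark.parametrize("document_store", ["memory"], indirect=True)
def test_memory_only(document_store):
    # indirect=True overrides the fixture's own params for this test
    assert document_store["backend"] == "memory"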
66 changes: 38 additions & 28 deletions test/test_faiss.py
@@ -32,42 +32,44 @@ def check_data_correctness(documents_indexed, documents_inserted):
    assert len(vector_ids) == len(documents_inserted)


-def test_faiss_index_save_and_load(faiss_document_store):
-    faiss_document_store.write_documents(DOCUMENTS)
+@pytest.mark.parametrize("document_store", ["faiss"], indirect=True)
+def test_faiss_index_save_and_load(document_store):
+    document_store.write_documents(DOCUMENTS)

    # test saving the index
-    faiss_document_store.save("haystack_test_faiss")
+    document_store.save("haystack_test_faiss")

    # clear existing faiss_index
-    faiss_document_store.faiss_index.reset()
+    document_store.faiss_index.reset()

    # test faiss index is cleared
-    assert faiss_document_store.faiss_index.ntotal == 0
+    assert document_store.faiss_index.ntotal == 0

    # test loading the index
-    new_document_store = faiss_document_store.load(sql_url="sqlite:///haystack_test.db",
+    new_document_store = document_store.load(sql_url="sqlite:///haystack_test.db",
                                             faiss_file_path="haystack_test_faiss")

    # check faiss index is restored
    assert new_document_store.faiss_index.ntotal == len(DOCUMENTS)
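The save/load test above round-trips the index through disk. For readers unfamiliar with FAISS: an index serializes with faiss.write_index and restores with faiss.read_index, and stored vectors can be read back by position. A standalone sketch of that round trip (assumes the faiss package is installed; independent of the Haystack wrapper):

import numpy as np
import faiss

dim = 8
index = faiss.IndexFlatL2(dim)                         # exact L2 index
index.add(np.random.rand(10, dim).astype("float32"))   # FAISS expects float32
faiss.write_index(index, "toy_index.faiss")            # persist to disk

restored = faiss.read_index("toy_index.faiss")         # load it back
assert restored.ntotal == 10                           # vectors survived the round trip
assert restored.reconstruct(0).shape == (dim,)         # recover a stored vector by position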


@pytest.mark.parametrize("document_store", ["faiss"], indirect=True)
@pytest.mark.parametrize("index_buffer_size", [10_000, 2])
@pytest.mark.parametrize("batch_size", [2])
def test_faiss_write_docs(faiss_document_store, index_buffer_size, batch_size):
faiss_document_store.index_buffer_size = index_buffer_size
def test_faiss_write_docs(document_store, index_buffer_size, batch_size):
document_store.index_buffer_size = index_buffer_size

# Write in small batches
for i in range(0, len(DOCUMENTS), batch_size):
faiss_document_store.write_documents(DOCUMENTS[i: i + batch_size])
document_store.write_documents(DOCUMENTS[i: i + batch_size])

documents_indexed = faiss_document_store.get_all_documents()
documents_indexed = document_store.get_all_documents()

# test if correct vectors are associated with docs
for i, doc in enumerate(documents_indexed):
# we currently don't get the embeddings back when we call document_store.get_all_documents()
original_doc = [d for d in DOCUMENTS if d["text"] == doc.text][0]
stored_emb = faiss_document_store.faiss_index.reconstruct(int(doc.meta["vector_id"]))
stored_emb = document_store.faiss_index.reconstruct(int(doc.meta["vector_id"]))
# compare original input vec with stored one (ignore extra dim added by hnsw)
assert np.allclose(original_doc["embedding"], stored_emb, rtol=0.01)

@@ -76,37 +78,41 @@ def test_faiss_write_docs(faiss_document_store, index_buffer_size, batch_size):


@pytest.mark.slow
@pytest.mark.parametrize("retriever", ["dpr"], indirect=True)
@pytest.mark.parametrize("document_store", ["faiss"], indirect=True)
@pytest.mark.parametrize("index_buffer_size", [10_000, 2])
def test_faiss_update_docs(faiss_document_store, index_buffer_size, dpr_retriever):
def test_faiss_update_docs(document_store, index_buffer_size, retriever):
# adjust buffer size
faiss_document_store.index_buffer_size = index_buffer_size
document_store.index_buffer_size = index_buffer_size

# initial write
faiss_document_store.write_documents(DOCUMENTS)
document_store.write_documents(DOCUMENTS)

faiss_document_store.update_embeddings(retriever=dpr_retriever)
documents_indexed = faiss_document_store.get_all_documents()
document_store.update_embeddings(retriever=retriever)
documents_indexed = document_store.get_all_documents()

# test if correct vectors are associated with docs
for i, doc in enumerate(documents_indexed):
original_doc = [d for d in DOCUMENTS if d["text"] == doc.text][0]
updated_embedding = dpr_retriever.embed_passages([Document.from_dict(original_doc)])
stored_emb = faiss_document_store.faiss_index.reconstruct(int(doc.meta["vector_id"]))
updated_embedding = retriever.embed_passages([Document.from_dict(original_doc)])
stored_emb = document_store.faiss_index.reconstruct(int(doc.meta["vector_id"]))
# compare original input vec with stored one (ignore extra dim added by hnsw)
assert np.allclose(updated_embedding, stored_emb, rtol=0.01)

# test document correctness
check_data_correctness(documents_indexed, DOCUMENTS)


-def test_faiss_update_with_empty_store(faiss_document_store, dpr_retriever):
+@pytest.mark.parametrize("retriever", ["dpr"], indirect=True)
+@pytest.mark.parametrize("document_store", ["faiss"], indirect=True)
+def test_faiss_update_with_empty_store(document_store, retriever):
    # Call update with empty doc store
-    faiss_document_store.update_embeddings(retriever=dpr_retriever)
+    document_store.update_embeddings(retriever=retriever)

    # initial write
-    faiss_document_store.write_documents(DOCUMENTS)
+    document_store.write_documents(DOCUMENTS)

-    documents_indexed = faiss_document_store.get_all_documents()
+    documents_indexed = document_store.get_all_documents()

    # test document correctness
    check_data_correctness(documents_indexed, DOCUMENTS)
@@ -125,25 +131,29 @@ def test_faiss_retrieving(index_factory):
    assert type(result[0]) == Document


-def test_faiss_finding(faiss_document_store, embedding_retriever):
-    faiss_document_store.write_documents(DOCUMENTS)
-    finder = Finder(reader=None, retriever=embedding_retriever)
+@pytest.mark.parametrize("retriever", ["embedding"], indirect=True)
+@pytest.mark.parametrize("document_store", ["faiss"], indirect=True)
+def test_faiss_finding(document_store, retriever):
+    document_store.write_documents(DOCUMENTS)
+    finder = Finder(reader=None, retriever=retriever)

    prediction = finder.get_answers_via_similar_questions(question="How to test this?", top_k_retriever=1)

    assert len(prediction.get('answers', [])) == 1


-def test_faiss_pipeline(faiss_document_store, embedding_retriever):
+@pytest.mark.parametrize("retriever", ["embedding"], indirect=True)
+@pytest.mark.parametrize("document_store", ["faiss"], indirect=True)
+def test_faiss_pipeline(document_store, retriever):
    documents = [
        {"name": "name_1", "text": "text_1", "embedding": np.random.rand(768).astype(np.float32)},
        {"name": "name_2", "text": "text_2", "embedding": np.random.rand(768).astype(np.float32)},
        {"name": "name_3", "text": "text_3", "embedding": np.random.rand(768).astype(np.float64)},
        {"name": "name_4", "text": "text_4", "embedding": np.random.rand(768).astype(np.float32)},
    ]
-    faiss_document_store.write_documents(documents)
+    document_store.write_documents(documents)
    pipeline = Pipeline()
-    pipeline.add_node(component=embedding_retriever, name="FAISS", inputs=["Query"])
+    pipeline.add_node(component=retriever, name="FAISS", inputs=["Query"])
    output = pipeline.run(query="How to test this?", top_k_retriever=3)
    assert len(output["documents"]) == 3

6 changes: 3 additions & 3 deletions test/test_pipeline.py
@@ -24,7 +24,7 @@ def test_graph_creation(reader, retriever_with_docs, document_store_with_docs):
@pytest.mark.slow
@pytest.mark.elasticsearch
@pytest.mark.parametrize("retriever_with_docs", ["tfidf"], indirect=True)
-def test_extractive_qa_answers(reader, retriever_with_docs, document_store_with_docs):
+def test_extractive_qa_answers(reader, retriever_with_docs):
    pipeline = ExtractiveQAPipeline(reader=reader, retriever=retriever_with_docs)
    prediction = pipeline.run(query="Who lives in Berlin?", top_k_retriever=10, top_k_reader=3)
    assert prediction is not None
@@ -40,7 +40,7 @@ def test_extractive_qa_answers(reader, retriever_with_docs, document_store_with_docs):

@pytest.mark.elasticsearch
@pytest.mark.parametrize("retriever_with_docs", ["tfidf"], indirect=True)
-def test_extractive_qa_offsets(reader, retriever_with_docs, document_store_with_docs):
+def test_extractive_qa_offsets(reader, retriever_with_docs):
    pipeline = ExtractiveQAPipeline(reader=reader, retriever=retriever_with_docs)
    prediction = pipeline.run(query="Who lives in Berlin?", top_k_retriever=10, top_k_reader=5)

@@ -54,7 +54,7 @@ def test_extractive_qa_offsets(reader, retriever_with_docs, document_store_with_docs):
@pytest.mark.slow
@pytest.mark.elasticsearch
@pytest.mark.parametrize("retriever_with_docs", ["tfidf"], indirect=True)
-def test_extractive_qa_answers_single_result(reader, retriever_with_docs, document_store_with_docs):
+def test_extractive_qa_answers_single_result(reader, retriever_with_docs):
    pipeline = ExtractiveQAPipeline(reader=reader, retriever=retriever_with_docs)
    query = "testing finder"
    prediction = pipeline.run(query=query, top_k_retriever=1, top_k_reader=1)
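Dropping the unused document_store_with_docs argument in the three tests above is more than cosmetic: merely naming a fixture in a test signature makes pytest build it, and for a parametrized fixture it also multiplies the generated test cases. A tiny sketch of that behavior (illustrative names):

import pytest

setup_calls = []

@pytest.fixture
def expensive_store():
    setup_calls.append("built")   # setup runs because the fixture is requested
    return object()

def test_never_touches_the_store(expensive_store):
    assert setup_calls == ["built"]   # the value is unused, but its setup already ran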
11 changes: 7 additions & 4 deletions test/test_tfidf_retriever.py
@@ -1,17 +1,20 @@
+import pytest


-def test_tfidf_retriever(inmemory_document_store, tfidf_retriever):
+@pytest.mark.parametrize("retriever", ["tfidf"], indirect=True)
+@pytest.mark.parametrize("document_store", ["memory"], indirect=True)
+def test_tfidf_retriever(document_store, retriever):

    test_docs = [
        {"id": "26f84672c6d7aaeb8e2cd53e9c62d62d", "name": "testing the finder 1", "text": "godzilla says hello"},
        {"name": "testing the finder 2", "text": "optimus prime says bye"},
        {"name": "testing the finder 3", "text": "alien says arghh"}
    ]

-    inmemory_document_store.write_documents(test_docs)
+    document_store.write_documents(test_docs)

-    tfidf_retriever.fit()
-    doc = tfidf_retriever.retrieve("godzilla", top_k=1)[0]
+    retriever.fit()
+    doc = retriever.retrieve("godzilla", top_k=1)[0]
    assert doc.id == "26f84672c6d7aaeb8e2cd53e9c62d62d"
    assert doc.text == 'godzilla says hello'
    assert doc.meta == {"name": "testing the finder 1"}
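For context on what retriever.fit() and retriever.retrieve() do here: a TF-IDF retriever ranks documents by term-frequency/inverse-document-frequency similarity to the query. A toy illustration of the same fit/retrieve shape using scikit-learn (an assumption for illustration, not Haystack's implementation):

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

docs = ["godzilla says hello", "optimus prime says bye", "alien says arghh"]
vectorizer = TfidfVectorizer().fit(docs)    # fit() builds the vocabulary and idf table
doc_matrix = vectorizer.transform(docs)

def retrieve(query: str, top_k: int = 1):
    scores = cosine_similarity(vectorizer.transform([query]), doc_matrix)[0]
    return [docs[i] for i in scores.argsort()[::-1][:top_k]]   # best matches first

print(retrieve("godzilla"))   # -> ['godzilla says hello']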