Cleanup Pytest Fixtures #639

Merged · 9 commits · Dec 14, 2020
10 changes: 5 additions & 5 deletions .github/workflows/ci.yml
@@ -9,7 +9,7 @@ on:
jobs:
  build:

-    runs-on: ubuntu-latest
+    runs-on: ubuntu-20.04

    steps:
    - uses: actions/checkout@v2
@@ -39,14 +39,14 @@ jobs:
        pip install -r requirements.txt
        pip install -e .

-    - name: Run Pytest without pipeline marker
-      run: cd test && pytest -m "not pipeline"
+    - name: Run Pytest without generator/pipeline marker
+      run: cd test && pytest -m "not pipeline and not generator"

    # - name: Stop Containers
    #   run: docker rm -f `docker ps -a -q`

-    - name: Run pytest with pipeline marker
-      run: cd test && pytest -m pipeline
+    - name: Run pytest with generator/pipeline marker
+      run: cd test && pytest -m "pipeline or generator"

    - name: Test with mypy
      run: |
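Note on the marker expressions above: the two CI steps partition the suite, running everything except pipeline/generator tests first and the marked tests second. For pytest -m to select on custom markers without warnings, the markers must be registered somewhere; that registration is not part of this diff. A minimal sketch of one way to do it, via the pytest_configure hook in conftest.py (an assumption for illustration, not the repo's actual setup):

def pytest_configure(config):
    # Register the custom markers used by `pytest -m` in CI.
    config.addinivalue_line("markers", "pipeline: tests that build and run full pipelines")
    config.addinivalue_line("markers", "generator: tests that exercise answer generators")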
119 changes: 43 additions & 76 deletions test/conftest.py
@@ -117,49 +117,6 @@ def rag_generator():
    )


-@pytest.fixture()
-def faiss_document_store():
-    if os.path.exists("haystack_test_faiss.db"):
-        os.remove("haystack_test_faiss.db")
-    document_store = FAISSDocumentStore(
-        sql_url="sqlite:///haystack_test_faiss.db",
-        return_embedding=True
-    )
-    yield document_store
-    document_store.faiss_index.reset()
-
-
-@pytest.fixture()
-def inmemory_document_store():
-    return InMemoryDocumentStore(return_embedding=True)
-
-
-@pytest.fixture()
-def dpr_retriever(faiss_document_store):
-    return DensePassageRetriever(
-        document_store=faiss_document_store,
-        query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
-        passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
-        use_gpu=False,
-        embed_title=True,
-        use_fast_tokenizers=True
-    )
-
-
-@pytest.fixture()
-def embedding_retriever(faiss_document_store):
-    return EmbeddingRetriever(
-        document_store=faiss_document_store,
-        embedding_model="deepset/sentence_bert",
-        use_gpu=False
-    )
-
-
-@pytest.fixture()
-def tfidf_retriever(inmemory_document_store):
-    return TfidfRetriever(document_store=inmemory_document_store)
@pytest.fixture(scope="module")
def test_docs_xs():
    return [
@@ -225,19 +182,6 @@ def no_answer_prediction(no_answer_reader, test_docs_xs):
    return prediction


-@pytest.fixture(params=["elasticsearch", "faiss", "memory", "sql"])
-def document_store_with_docs(
-        request, test_docs_xs, elasticsearch_fixture, faiss_document_store, inmemory_document_store):
-    document_store = get_document_store(request.param, faiss_document_store, inmemory_document_store)
-    document_store.write_documents(test_docs_xs)
-    return document_store
-
-
-@pytest.fixture(params=["elasticsearch", "faiss", "memory", "sql"])
-def document_store(request, test_docs_xs, elasticsearch_fixture, faiss_document_store, inmemory_document_store):
-    return get_document_store(request.param, faiss_document_store, inmemory_document_store)

@pytest.fixture(params=["es_filter_only", "elasticsearch", "dpr", "embedding", "tfidf"])
def retriever(request, document_store):
return get_retriever(request.param, document_store)
Expand All @@ -248,26 +192,6 @@ def retriever_with_docs(request, document_store_with_docs):
return get_retriever(request.param, document_store_with_docs)


-def get_document_store(document_store_type, faiss_document_store, inmemory_document_store):
-    if document_store_type == "sql":
-        if os.path.exists("haystack_test.db"):
-            os.remove("haystack_test.db")
-        document_store = SQLDocumentStore(url="sqlite:///haystack_test.db")
-    elif document_store_type == "memory":
-        document_store = inmemory_document_store
-    elif document_store_type == "elasticsearch":
-        # make sure we start from a fresh index
-        client = Elasticsearch()
-        client.indices.delete(index='haystack_test*', ignore=[404])
-        document_store = ElasticsearchDocumentStore(index="haystack_test", return_embedding=True)
-    elif document_store_type == "faiss":
-        document_store = faiss_document_store
-    else:
-        raise Exception(f"No document store fixture for '{document_store_type}'")
-
-    return document_store


def get_retriever(retriever_type, document_store):

    if retriever_type == "dpr":
@@ -291,3 +215,46 @@ def get_retriever(retriever_type, document_store):
        raise Exception(f"No retriever fixture for '{retriever_type}'")

    return retriever


@pytest.fixture(params=["elasticsearch", "faiss", "memory", "sql"])
def document_store_with_docs(request, test_docs_xs):
document_store = get_document_store(request.param)
document_store.write_documents(test_docs_xs)
yield document_store
if request.param == "faiss":
document_store.faiss_index.reset()


@pytest.fixture(params=["elasticsearch", "faiss", "memory", "sql"])
def document_store(request, test_docs_xs):
document_store = get_document_store(request.param)
yield document_store
if request.param == "faiss":
document_store.faiss_index.reset()
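The fixtures above switch from plain return values to yield so that FAISS state is reset after each test, whichever way the test exits. A self-contained sketch of the yield-fixture pattern (toy resource, illustrative names):

import pytest

@pytest.fixture
def resource():
    handle = {"open": True}   # setup
    yield handle              # the test body runs here
    handle["open"] = False    # teardown runs afterwards, even if the test failed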


+def get_document_store(document_store_type):
+    if document_store_type == "sql":
+        if os.path.exists("haystack_test.db"):
+            os.remove("haystack_test.db")
+        document_store = SQLDocumentStore(url="sqlite:///haystack_test.db")
+    elif document_store_type == "memory":
+        document_store = InMemoryDocumentStore(return_embedding=True)
+    elif document_store_type == "elasticsearch":
+        # make sure we start from a fresh index
+        client = Elasticsearch()
+        client.indices.delete(index='haystack_test*', ignore=[404])
+        document_store = ElasticsearchDocumentStore(index="haystack_test", return_embedding=True)
+    elif document_store_type == "faiss":
+        if os.path.exists("haystack_test_faiss.db"):
+            os.remove("haystack_test_faiss.db")
+        document_store = FAISSDocumentStore(
+            sql_url="sqlite:///haystack_test_faiss.db",
+            return_embedding=True
+        )
+        return document_store
+    else:
+        raise Exception(f"No document store fixture for '{document_store_type}'")
+
+    return document_store
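Because document_store and document_store_with_docs are parametrized fixtures, a test normally runs once per backend; a test that only makes sense for one backend can pin the fixture with indirect parametrization, which routes the value to the fixture's request.param instead of the test function. A minimal runnable sketch of the mechanism (illustrative names, not the Haystack stores):

import pytest

@pytest.fixture(params=["memory", "sql"])
def document_store(request):
    return {"backend": request.param}   # stand-in for get_document_store(request.param)

def test_runs_per_backend(document_store):
    # without narrowing, pytest generates one test per fixture param
    assert document_store["backend"] in ("memory", "sql")

@pytest.mark.parametrize("document_store", ["memory"], indirect=True)
def test_memory_only(document_store):
    # indirect=True overrides the fixture's own params for this test
    assert document_store["backend"] == "memory"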
66 changes: 38 additions & 28 deletions test/test_faiss.py
@@ -32,42 +32,44 @@ def check_data_correctness(documents_indexed, documents_inserted):
    assert len(vector_ids) == len(documents_inserted)


-def test_faiss_index_save_and_load(faiss_document_store):
-    faiss_document_store.write_documents(DOCUMENTS)
+@pytest.mark.parametrize("document_store", ["faiss"], indirect=True)
+def test_faiss_index_save_and_load(document_store):
+    document_store.write_documents(DOCUMENTS)

    # test saving the index
-    faiss_document_store.save("haystack_test_faiss")
+    document_store.save("haystack_test_faiss")

    # clear existing faiss_index
-    faiss_document_store.faiss_index.reset()
+    document_store.faiss_index.reset()

    # test faiss index is cleared
-    assert faiss_document_store.faiss_index.ntotal == 0
+    assert document_store.faiss_index.ntotal == 0

    # test loading the index
-    new_document_store = faiss_document_store.load(sql_url="sqlite:///haystack_test.db",
+    new_document_store = document_store.load(sql_url="sqlite:///haystack_test.db",
                                             faiss_file_path="haystack_test_faiss")

    # check faiss index is restored
    assert new_document_store.faiss_index.ntotal == len(DOCUMENTS)
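The save/load test above round-trips the index through disk. For readers unfamiliar with FAISS: an index serializes with faiss.write_index and restores with faiss.read_index, and stored vectors can be read back by position. A standalone sketch of that round trip (assumes the faiss package is installed; independent of the Haystack wrapper):

import numpy as np
import faiss

dim = 8
index = faiss.IndexFlatL2(dim)                         # exact L2 index
index.add(np.random.rand(10, dim).astype("float32"))   # FAISS expects float32
faiss.write_index(index, "toy_index.faiss")            # persist to disk

restored = faiss.read_index("toy_index.faiss")         # load it back
assert restored.ntotal == 10                           # vectors survived the round trip
assert restored.reconstruct(0).shape == (dim,)         # recover a stored vector by position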


@pytest.mark.parametrize("document_store", ["faiss"], indirect=True)
@pytest.mark.parametrize("index_buffer_size", [10_000, 2])
@pytest.mark.parametrize("batch_size", [2])
def test_faiss_write_docs(faiss_document_store, index_buffer_size, batch_size):
faiss_document_store.index_buffer_size = index_buffer_size
def test_faiss_write_docs(document_store, index_buffer_size, batch_size):
document_store.index_buffer_size = index_buffer_size

# Write in small batches
for i in range(0, len(DOCUMENTS), batch_size):
faiss_document_store.write_documents(DOCUMENTS[i: i + batch_size])
document_store.write_documents(DOCUMENTS[i: i + batch_size])

documents_indexed = faiss_document_store.get_all_documents()
documents_indexed = document_store.get_all_documents()

# test if correct vectors are associated with docs
for i, doc in enumerate(documents_indexed):
# we currently don't get the embeddings back when we call document_store.get_all_documents()
original_doc = [d for d in DOCUMENTS if d["text"] == doc.text][0]
stored_emb = faiss_document_store.faiss_index.reconstruct(int(doc.meta["vector_id"]))
stored_emb = document_store.faiss_index.reconstruct(int(doc.meta["vector_id"]))
# compare original input vec with stored one (ignore extra dim added by hnsw)
assert np.allclose(original_doc["embedding"], stored_emb, rtol=0.01)

@@ -76,37 +78,41 @@ def test_faiss_write_docs(faiss_document_store, index_buffer_size, batch_size):


@pytest.mark.slow
@pytest.mark.parametrize("retriever", ["dpr"], indirect=True)
@pytest.mark.parametrize("document_store", ["faiss"], indirect=True)
@pytest.mark.parametrize("index_buffer_size", [10_000, 2])
def test_faiss_update_docs(faiss_document_store, index_buffer_size, dpr_retriever):
def test_faiss_update_docs(document_store, index_buffer_size, retriever):
# adjust buffer size
faiss_document_store.index_buffer_size = index_buffer_size
document_store.index_buffer_size = index_buffer_size

# initial write
faiss_document_store.write_documents(DOCUMENTS)
document_store.write_documents(DOCUMENTS)

faiss_document_store.update_embeddings(retriever=dpr_retriever)
documents_indexed = faiss_document_store.get_all_documents()
document_store.update_embeddings(retriever=retriever)
documents_indexed = document_store.get_all_documents()

# test if correct vectors are associated with docs
for i, doc in enumerate(documents_indexed):
original_doc = [d for d in DOCUMENTS if d["text"] == doc.text][0]
updated_embedding = dpr_retriever.embed_passages([Document.from_dict(original_doc)])
stored_emb = faiss_document_store.faiss_index.reconstruct(int(doc.meta["vector_id"]))
updated_embedding = retriever.embed_passages([Document.from_dict(original_doc)])
stored_emb = document_store.faiss_index.reconstruct(int(doc.meta["vector_id"]))
# compare original input vec with stored one (ignore extra dim added by hnsw)
assert np.allclose(updated_embedding, stored_emb, rtol=0.01)

# test document correctness
check_data_correctness(documents_indexed, DOCUMENTS)


-def test_faiss_update_with_empty_store(faiss_document_store, dpr_retriever):
+@pytest.mark.parametrize("retriever", ["dpr"], indirect=True)
+@pytest.mark.parametrize("document_store", ["faiss"], indirect=True)
+def test_faiss_update_with_empty_store(document_store, retriever):
    # Call update with empty doc store
-    faiss_document_store.update_embeddings(retriever=dpr_retriever)
+    document_store.update_embeddings(retriever=retriever)

    # initial write
-    faiss_document_store.write_documents(DOCUMENTS)
+    document_store.write_documents(DOCUMENTS)

-    documents_indexed = faiss_document_store.get_all_documents()
+    documents_indexed = document_store.get_all_documents()

    # test document correctness
    check_data_correctness(documents_indexed, DOCUMENTS)
@@ -125,25 +131,29 @@ def test_faiss_retrieving(index_factory):
    assert type(result[0]) == Document


-def test_faiss_finding(faiss_document_store, embedding_retriever):
-    faiss_document_store.write_documents(DOCUMENTS)
-    finder = Finder(reader=None, retriever=embedding_retriever)
+@pytest.mark.parametrize("retriever", ["embedding"], indirect=True)
+@pytest.mark.parametrize("document_store", ["faiss"], indirect=True)
+def test_faiss_finding(document_store, retriever):
+    document_store.write_documents(DOCUMENTS)
+    finder = Finder(reader=None, retriever=retriever)

    prediction = finder.get_answers_via_similar_questions(question="How to test this?", top_k_retriever=1)

    assert len(prediction.get('answers', [])) == 1


-def test_faiss_pipeline(faiss_document_store, embedding_retriever):
+@pytest.mark.parametrize("retriever", ["embedding"], indirect=True)
+@pytest.mark.parametrize("document_store", ["faiss"], indirect=True)
+def test_faiss_pipeline(document_store, retriever):
    documents = [
        {"name": "name_1", "text": "text_1", "embedding": np.random.rand(768).astype(np.float32)},
        {"name": "name_2", "text": "text_2", "embedding": np.random.rand(768).astype(np.float32)},
        {"name": "name_3", "text": "text_3", "embedding": np.random.rand(768).astype(np.float64)},
        {"name": "name_4", "text": "text_4", "embedding": np.random.rand(768).astype(np.float32)},
    ]
-    faiss_document_store.write_documents(documents)
+    document_store.write_documents(documents)
    pipeline = Pipeline()
-    pipeline.add_node(component=embedding_retriever, name="FAISS", inputs=["Query"])
+    pipeline.add_node(component=retriever, name="FAISS", inputs=["Query"])
    output = pipeline.run(query="How to test this?", top_k_retriever=3)
    assert len(output["documents"]) == 3

6 changes: 3 additions & 3 deletions test/test_pipeline.py
@@ -24,7 +24,7 @@ def test_graph_creation(reader, retriever_with_docs, document_store_with_docs):
@pytest.mark.slow
@pytest.mark.elasticsearch
@pytest.mark.parametrize("retriever_with_docs", ["tfidf"], indirect=True)
-def test_extractive_qa_answers(reader, retriever_with_docs, document_store_with_docs):
+def test_extractive_qa_answers(reader, retriever_with_docs):
    pipeline = ExtractiveQAPipeline(reader=reader, retriever=retriever_with_docs)
    prediction = pipeline.run(query="Who lives in Berlin?", top_k_retriever=10, top_k_reader=3)
    assert prediction is not None
@@ -40,7 +40,7 @@ def test_extractive_qa_answers(reader, retriever_with_docs, document_store_with_docs):

@pytest.mark.elasticsearch
@pytest.mark.parametrize("retriever_with_docs", ["tfidf"], indirect=True)
-def test_extractive_qa_offsets(reader, retriever_with_docs, document_store_with_docs):
+def test_extractive_qa_offsets(reader, retriever_with_docs):
    pipeline = ExtractiveQAPipeline(reader=reader, retriever=retriever_with_docs)
    prediction = pipeline.run(query="Who lives in Berlin?", top_k_retriever=10, top_k_reader=5)

@@ -54,7 +54,7 @@ def test_extractive_qa_offsets(reader, retriever_with_docs, document_store_with_docs):
@pytest.mark.slow
@pytest.mark.elasticsearch
@pytest.mark.parametrize("retriever_with_docs", ["tfidf"], indirect=True)
-def test_extractive_qa_answers_single_result(reader, retriever_with_docs, document_store_with_docs):
+def test_extractive_qa_answers_single_result(reader, retriever_with_docs):
    pipeline = ExtractiveQAPipeline(reader=reader, retriever=retriever_with_docs)
    query = "testing finder"
    prediction = pipeline.run(query=query, top_k_retriever=1, top_k_reader=1)
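Dropping the unused document_store_with_docs argument in the three tests above is more than cosmetic: merely naming a fixture in a test signature makes pytest build it, and for a parametrized fixture it also multiplies the generated test cases. A tiny sketch of that behavior (illustrative names):

import pytest

setup_calls = []

@pytest.fixture
def expensive_store():
    setup_calls.append("built")   # setup runs because the fixture is requested
    return object()

def test_never_touches_the_store(expensive_store):
    assert setup_calls == ["built"]   # the value is unused, but its setup already ran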
11 changes: 7 additions & 4 deletions test/test_tfidf_retriever.py
@@ -1,17 +1,20 @@
+import pytest


-def test_tfidf_retriever(inmemory_document_store, tfidf_retriever):
+@pytest.mark.parametrize("retriever", ["tfidf"], indirect=True)
+@pytest.mark.parametrize("document_store", ["memory"], indirect=True)
+def test_tfidf_retriever(document_store, retriever):

    test_docs = [
        {"id": "26f84672c6d7aaeb8e2cd53e9c62d62d", "name": "testing the finder 1", "text": "godzilla says hello"},
        {"name": "testing the finder 2", "text": "optimus prime says bye"},
        {"name": "testing the finder 3", "text": "alien says arghh"}
    ]

-    inmemory_document_store.write_documents(test_docs)
+    document_store.write_documents(test_docs)

-    tfidf_retriever.fit()
-    doc = tfidf_retriever.retrieve("godzilla", top_k=1)[0]
+    retriever.fit()
+    doc = retriever.retrieve("godzilla", top_k=1)[0]
    assert doc.id == "26f84672c6d7aaeb8e2cd53e9c62d62d"
    assert doc.text == 'godzilla says hello'
    assert doc.meta == {"name": "testing the finder 1"}
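For context on what retriever.fit() and retriever.retrieve() do here: a TF-IDF retriever ranks documents by term-frequency/inverse-document-frequency similarity to the query. A toy illustration of the same fit/retrieve shape using scikit-learn (an assumption for illustration, not Haystack's implementation):

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

docs = ["godzilla says hello", "optimus prime says bye", "alien says arghh"]
vectorizer = TfidfVectorizer().fit(docs)    # fit() builds the vocabulary and idf table
doc_matrix = vectorizer.transform(docs)

def retrieve(query: str, top_k: int = 1):
    scores = cosine_similarity(vectorizer.transform([query]), doc_matrix)[0]
    return [docs[i] for i in scores.argsort()[::-1][:top_k]]   # best matches first

print(retrieve("godzilla"))   # -> ['godzilla says hello']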