In [1]:
import elasticsearch
import json

Haystack indexes all documents in a `DocumentStore`. The current implementations of `DocumentStore` include `ElasticsearchDocumentStore`, `SQLDocumentStore`, and `InMemoryDocumentStore`.

We recommend Elasticsearch as it comes preloaded with features like full-text queries, BM25 retrieval, and vector storage for text embeddings.

<b>Alternatives:</b> If you are unable to set up an Elasticsearch instance, follow Tutorial 3 for using the SQL/InMemory document stores.

<b>Hint:</b> This tutorial creates a new document store instance with documents from the DPR trivia dataset (converted to SQuAD format). However, you can configure Haystack to work with your existing document stores.

In [2]:
# Initialize the document store that will hold the training data.
from haystack.database.elasticsearch import ElasticsearchDocumentStore

document_store = ElasticsearchDocumentStore(
    host="localhost",
    username="",
    password="",
    index="document",
    embedding_dim=768,
    embedding_field="embedding",
)

08/04/2020 14:54:44 - INFO - elasticsearch -   PUT http://localhost:9200/document?timeout=100 [status:400 request:0.014s]


In [3]:
# Documents must be in SQuAD format before they are indexed in the document store.
# Convert the DPR-format JSON (here: the official DPR dataset) to SQuAD format.
# The call is kept as the cell's last expression so that its return value is displayed.
from haystack.utils import convert_dpr_to_squad
input_path = "DPR/data/data/retriever/trivia-dev.json"
output_path = "DPR/data/data/retriever/trivia2Squad-dev.json"
convert_dpr_to_squad(input_file=input_path, output_file=output_path)



{'version': None,
 'data': [{'title': "'Oh, Whistle, and I'll Come to You, My Lad'",
   'paragraphs': [{'qas': [{'question': 'What 1904 story by M. R. James in which the protagonist finds an antique artifact with undesirable consequences gets its ominously inviting title from a 1793 Robert Burns poem/song?',
       'answers': [{'text': "Oh, Whistle, and I'll Come to You, My Lad",
         'answer_start': 1},
        {'text': "Oh, Whistle, and I'll Come to You, My Lad",
         'answer_start': 46},
        {'text': "'Oh, Whistle, and I'll Come to You, My Lad'",
         'answer_start': 0},
        {'text': "'Oh, Whistle, and I'll Come to You, My Lad'",
         'answer_start': 45}],
       'is_impossible': False,
       'id': None}],
     'context': '\'Oh, Whistle, and I\'ll Come to You, My Lad\' "\'Oh, Whistle, and I\'ll Come to You, My Lad\'" is a ghost story by British writer M. R. James, included in his collection "Ghost Stories of an Antiquary" (1904). It is named after the poem b

In [5]:
# The question-answering training dataset must be in SQuAD format.
# Read the SQuAD-format file at `output_path` and extract the Documents
# and Labels from it.
from haystack.indexing.utils import eval_data_from_file as get_data_from_squad
squad_datapath = output_path
documents, labels = get_data_from_squad(squad_datapath)

filename /media/vaishali/75f51685-53cb-4317-bd6e-dacf384b9259/vaishali/Documents/deepset/DPR/data/data/retriever/trivia2Squad-dev.json


In [9]:
# Index the extracted documents and their labels in the document store.
document_store.write_documents(documents=documents)
document_store.write_labels(labels=labels)

08/04/2020 15:04:28 - INFO - elasticsearch -   POST http://localhost:9200/_bulk?refresh=wait_for [status:200 request:1.151s]
08/04/2020 15:04:29 - INFO - elasticsearch -   POST http://localhost:9200/_bulk?refresh=wait_for [status:200 request:1.037s]
08/04/2020 15:04:30 - INFO - elasticsearch -   POST http://localhost:9200/_bulk?refresh=wait_for [status:200 request:1.003s]
08/04/2020 15:04:31 - INFO - elasticsearch -   POST http://localhost:9200/_bulk?refresh=wait_for [status:200 request:0.871s]
08/04/2020 15:04:32 - INFO - elasticsearch -   POST http://localhost:9200/_bulk?refresh=wait_for [status:200 request:1.003s]
08/04/2020 15:04:33 - INFO - elasticsearch -   POST http://localhost:9200/_bulk?refresh=wait_for [status:200 request:1.032s]
08/04/2020 15:04:34 - INFO - elasticsearch -   POST http://localhost:9200/_bulk?refresh=wait_for [status:200 request:1.033s]
08/04/2020 15:04:35 - INFO - elasticsearch -   POST http://localhost:9200/_bulk?refresh=wait_for [status:200 request:1.030s]


<b>`Unlabelled Documents`:</b> Unlabelled Documents are passages or articles which do not have associated labels (questions and answers). They are indexed into the document store to aid the negative retriever, giving it a wider choice of documents from which to extract negatives and hard negatives.

In [15]:
import csv
from typing import List

from haystack.database.base import Document


def get_unlabelled_docs_from_csv(path: str, labelled_docs: List[Document]) -> List[Document]:
    """
    Extract all unlabelled documents from a larger corpus such as Wikipedia.

    A row is kept only if its passage id is not the passage id of one of the
    already-labelled documents. Rows with fewer than three columns (for
    example blank lines) are skipped instead of raising ``IndexError``.

    :param path: path to a tab-separated file whose columns are, in order:
                 passage id, passage text (context), title
    :param labelled_docs: Documents that already have labels; each one must
                          carry its passage id in ``meta["passage_id"]``
    :return: List of unlabelled Documents: [Document]
    """
    labelled_passage_ids = {doc.meta["passage_id"] for doc in labelled_docs}
    # newline='' is what the csv module expects of the file object it reads from.
    # The explicit encoding keeps parsing independent of the platform's locale;
    # assumes the file is UTF-8 encoded -- TODO confirm.
    with open(path, newline='', encoding='utf-8') as csv_file:
        csv_reader = csv.reader(csv_file, delimiter='\t', quotechar='|')
        # NOTE(review): no row is treated as a header; if the input file has a
        # header line it will be turned into a Document -- confirm the input.
        docs = [
            Document(text=row[1], meta={'name': row[2], 'passage_id': row[0]})
            for row in csv_reader
            # csv.reader yields [] for blank lines; such rows cannot be indexed.
            if len(row) >= 3 and row[0] not in labelled_passage_ids
        ]
    return docs

In [16]:
# Additional unlabelled passages can be added to the document store as well.
unlabelled_output_path = "DPR/data/data/wikipedia_split/unlabelled_psgs_w100.tsv"
unlabelled_documents = get_unlabelled_docs_from_csv(
    path=unlabelled_output_path,
    labelled_docs=documents,
)
# Only the first 1000 unlabelled documents are written to the store.
unlabelled_documents_subset = unlabelled_documents[:1000]
document_store.write_documents(documents=unlabelled_documents_subset)

08/04/2020 15:27:17 - INFO - elasticsearch -   POST http://localhost:9200/_bulk?refresh=wait_for [status:200 request:4.359s]
08/04/2020 15:27:18 - INFO - elasticsearch -   POST http://localhost:9200/_bulk?refresh=wait_for [status:200 request:1.180s]


In [17]:
# Initialize the Dense Passage Retriever that is going to be trained.
from haystack.retriever.dense import DensePassageRetriever

retriever = DensePassageRetriever(
    document_store=document_store,
    embedding_model="dpr-bert-base-nq",
    do_lower_case=True,
    use_gpu=True,
)

08/04/2020 15:28:27 - INFO - haystack.retriever.dpr_utils -   Loading saved model from models/dpr/checkpoint/retriever/single/nq/bert-base-encoder.cp
08/04/2020 15:28:33 - INFO - haystack.retriever.dense -   Loaded encoder params:  {'do_lower_case': True, 'pretrained_model_cfg': 'bert-base-uncased', 'encoder_model_type': 'hf_bert', 'pretrained_file': None, 'projection_dim': 0, 'sequence_length': 256}
08/04/2020 15:28:49 - INFO - haystack.retriever.dense -   Loading saved model state ...
08/04/2020 15:28:49 - INFO - haystack.retriever.dense -   Loading saved model state ...


In [20]:
# Initialize the retriever used for negative sampling, i.e. for finding
# hard negatives for the training data.
from haystack.retriever.sparse import ElasticsearchRetriever
negative_retriever = ElasticsearchRetriever(document_store=document_store)

In [21]:
# Build the retriever training data in JSON format.
output_json_path = "DPR/data/data/retriever/train_retriever.json"
train_json = retriever.prepare_training_data(
    negative_retriever=negative_retriever,
    output_path=output_json_path,
    top_k=10,
)

08/04/2020 15:30:38 - INFO - elasticsearch -   POST http://localhost:9200/document/_search?scroll=5m&size=1000 [status:200 request:0.159s]
08/04/2020 15:30:38 - INFO - elasticsearch -   POST http://localhost:9200/_search/scroll [status:200 request:0.113s]
08/04/2020 15:30:38 - INFO - elasticsearch -   POST http://localhost:9200/_search/scroll [status:200 request:0.092s]
08/04/2020 15:30:38 - INFO - elasticsearch -   POST http://localhost:9200/_search/scroll [status:200 request:0.110s]
08/04/2020 15:30:38 - INFO - elasticsearch -   POST http://localhost:9200/_search/scroll [status:200 request:0.239s]
08/04/2020 15:30:39 - INFO - elasticsearch -   POST http://localhost:9200/_search/scroll [status:200 request:0.316s]
08/04/2020 15:30:39 - INFO - elasticsearch -   POST http://localhost:9200/_search/scroll [status:200 request:0.289s]
08/04/2020 15:30:39 - INFO - elasticsearch -   POST http://localhost:9200/_search/scroll [status:200 request:0.138s]
08/04/2020 15:30:39 - INFO - elasticsearch

In [22]:
# Display the first entry of the generated training data.
train_json[0]

{'question': 'What 1904 story by M. R. James in which the protagonist finds an antique artifact with undesirable consequences gets its ominously inviting title from a 1793 Robert Burns poem/song?',
 'answers': ["Oh, Whistle, and I'll Come to You, My Lad"],
 'positive_ctxs': [{'title': "'Oh, Whistle, and I'll Come to You, My Lad'",
   'text': '\'Oh, Whistle, and I\'ll Come to You, My Lad\' "\'Oh, Whistle, and I\'ll Come to You, My Lad\'" is a ghost story by British writer M. R. James, included in his collection "Ghost Stories of an Antiquary" (1904). It is named after the poem by Robert Burns. Parkins, the protagonist, a skeptical Cambridge professor, is on holiday in the town of "Burnstow" (a fictionalized version of Felixstowe, Suffolk), on the southeast coast of England. While investigating a Templar ruin for a colleague, he finds a whistle with two Latin inscriptions. On one side it says "Quis est iste, qui venit?".',
   'passage_id': '18600354',
   'score': 0,
   'title_score': 0}]