Start a Docker container running Elasticsearch:
docker pull docker.elastic.co/elasticsearch/elasticsearch:7.12.1
docker run -p 9200:9200 -p 9300:9300 -e "discovery.type=single-node" docker.elastic.co/elasticsearch/elasticsearch:7.12.1

In [None]:
from TEI_Handling import TEIFile
import glob
from pathlib import Path
import multiprocessing
from multiprocessing.pool import Pool
from haystack.document_store.elasticsearch import ElasticsearchDocumentStore
from haystack.preprocessor import PreProcessor

In [None]:
# Connect to the Elasticsearch instance on localhost (the container started
# with the docker command at the top of this notebook), using empty
# credentials and the "document" index.
document_store = ElasticsearchDocumentStore(
    host="localhost",
    username="",
    password="",
    index="document",
)

In [None]:
def to_dict(paper_path):
    """Parse one TEI paper and split it into preprocessed chunks.

    This function is mapped over the paper paths by the multiprocessing
    pool, so it is kept as a plain top-level function.

    Args:
        paper_path: Path to a single ``*.tei.xml`` file.

    Returns:
        The result of ``PreProcessor.process`` on the paper's dict
        representation. The caller treats it as an iterable of documents
        (one entry per chunk) -- presumably a list of dicts; verify against
        the haystack version in use.
    """
    # Cleaning plus ~250-word chunks that overlap by 20 words and do not
    # cut sentences in half.
    chunking_options = {
        "clean_empty_lines": True,
        "clean_whitespace": True,
        "clean_header_footer": True,
        "split_by": "word",
        "split_length": 250,
        "split_respect_sentence_boundary": True,
        "split_overlap": 20,
    }
    parsed_paper = TEIFile(paper_path)
    chunker = PreProcessor(**chunking_options)
    return chunker.process(parsed_paper.to_dict())

In [None]:
# Collect the TEI files of the sample corpus in a deterministic (sorted) order.
papers = sorted(Path("../data/unpaywall-grobid-sample").glob("*.tei.xml"))
print(f"Processing {len(papers)} papers on {multiprocessing.cpu_count()} cores.")
# Pool() without arguments sizes itself to the machine's CPU count; the pool
# is consumed and shut down by the mapping step that follows.
pool = Pool()

In [None]:
# Fan the papers out to the worker pool. imap yields results in input order,
# and chunksize=5 hands papers to the workers in batches of five.
try:
    dicts = list(pool.imap(to_dict, papers, chunksize=5))
except BaseException:
    # A paper failed (or the run was interrupted): stop the remaining work
    # right away instead of leaving worker processes running.
    pool.terminate()
    raise
else:
    # Normal completion: no more tasks will be submitted.
    pool.close()
finally:
    # Wait for the worker processes to exit so none are left behind.
    pool.join()

In [None]:
# Flatten the per-paper results into one flat list of documents.
documents = [chunk for paper_chunks in dicts for chunk in paper_chunks]

In [None]:
# Sanity check: show the first two flattened documents before they are
# written to the document store.
print(documents[:2])

In [None]:
# Write every preprocessed chunk into the Elasticsearch document store.
document_store.write_documents(documents)

In [None]:
from haystack.retriever.sparse import ElasticsearchRetriever
# Sparse retriever bound to the Elasticsearch document store populated above.
retriever = ElasticsearchRetriever(document_store=document_store)

In [None]:
from haystack.reader.farm import FARMReader

# Reader built on the "deepset/roberta-base-squad2" model; use_gpu=False keeps
# inference on the CPU.
reader = FARMReader(
    model_name_or_path="deepset/roberta-base-squad2",
    use_gpu=False,
)

In [None]:
from haystack.pipeline import ExtractiveQAPipeline
# Combine the reader and retriever into a single extractive QA pipeline.
pipe = ExtractiveQAPipeline(reader, retriever)

In [None]:
# Ask a sample question. top_k_retriever / top_k_reader cap the retriever and
# reader stages at 10 and 5 results respectively (presumably 10 candidate
# documents in, 5 answers out -- confirm against the haystack version in use).
prediction = pipe.run(
    query="What is the Internet Engineering Task Force?",
    top_k_retriever=10,
    top_k_reader=5,
)

In [None]:
from haystack.utils import print_answers

In [None]:
# Print the answers contained in the prediction at the "minimal" detail level.
print_answers(prediction, details="minimal")