In [None]:
%%bash

# Upgrade pip before installing Haystack.
pip install --upgrade pip

# Install Haystack from a pinned 1.x release tag instead of the moving default
# branch. The rest of this notebook relies on the Haystack 1.x API
# (haystack.nodes, haystack.document_stores, haystack.pipelines), so an
# unpinned install is not reproducible and can break every later cell.
# The requirement is quoted so the shell never treats "[colab]" as a glob.
pip install "git+https://github.com/deepset-ai/haystack.git@v1.8.0#egg=farm-haystack[colab]"

In [None]:
# Recommended: start Elasticsearch in a Docker container by calling the
# `launch_es` helper that ships in `haystack.utils`.
from haystack.utils import launch_es

launch_es()



In [None]:
%%bash

# Download the Elasticsearch 7.9.2 Linux tarball (quietly), unpack it into the
# current directory and hand ownership of the unpacked tree to the `daemon` user.
ES_ARCHIVE="elasticsearch-7.9.2-linux-x86_64.tar.gz"

wget "https://artifacts.elastic.co/downloads/elasticsearch/${ES_ARCHIVE}" -q
tar -xzf "${ES_ARCHIVE}"
chown -R daemon:daemon elasticsearch-7.9.2

In [None]:
%%bash --bg

# Start the Elasticsearch server from the unpacked elasticsearch-7.9.2
# directory, running it as the `daemon` user. The cell uses `--bg` so the
# server keeps running in the background while the following cells execute.
sudo -u daemon -- elasticsearch-7.9.2/bin/elasticsearch

In [None]:
import http.client
import os
import time
import urllib.request


def wait_for_elasticsearch(host="localhost", port=9200, max_wait=30.0, poll_interval=1.0):
    """Block until Elasticsearch answers its cluster-health check or time runs out.

    Replaces a fixed 30-second sleep: the function returns as soon as the node
    reports at least "yellow" health, and otherwise gives up after roughly
    ``max_wait`` seconds so the notebook continues exactly as it did before.

    Args:
        host: Hostname of the Elasticsearch node.
        port: HTTP port of the Elasticsearch node.
        max_wait: Upper bound, in seconds, on how long to keep polling.
        poll_interval: Pause, in seconds, between two polling attempts.

    Returns:
        True if the health endpoint answered with HTTP 200 in time, else False.
    """
    # `wait_for_status=yellow` makes the server answer 200 only once the cluster
    # is usable; while it is still starting we get a refused connection or a
    # non-200 status, both of which surface as exceptions below.
    health_url = f"http://{host}:{port}/_cluster/health?wait_for_status=yellow&timeout=1s"
    deadline = time.monotonic() + max_wait
    while True:
        try:
            with urllib.request.urlopen(health_url, timeout=5) as response:
                if response.status == 200:
                    return True
        except (OSError, http.client.HTTPException):
            # Not reachable / not ready yet (URLError and HTTPError are OSErrors).
            pass
        if time.monotonic() >= deadline:
            return False
        time.sleep(poll_interval)


# Best effort only: if the node never reports ready we still move on, just as
# the previous unconditional sleep did, and let the next cell surface the error.
if not wait_for_elasticsearch(host=os.environ.get("ELASTICSEARCH_HOST", "localhost")):
    print("Elasticsearch did not report ready within 30 seconds; continuing anyway.")

In [None]:
import os
from haystack.document_stores import ElasticsearchDocumentStore

# The Elasticsearch host is read from the ELASTICSEARCH_HOST environment
# variable; when it is not set we connect to localhost.
host = os.environ.get("ELASTICSEARCH_HOST", "localhost")

# Connect without credentials and use the "document" index.
document_store = ElasticsearchDocumentStore(
    host=host,
    username="",
    password="",
    index="document",
)

In [None]:
from haystack.utils import clean_wiki_text, convert_files_to_docs, fetch_archive_from_http


# Step 1: download the corpus we want to query.
# Here: 517 Wikipedia articles for Game of Thrones.
doc_dir = "data/tutorial1"
s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt1.zip"
fetch_archive_from_http(url=s3_url, output_dir=doc_dir)

# Step 2: convert the downloaded files into documents.
# `clean_func` is optional; it is applied to each document (e.g. to remove
# footers) and must take a str as input and return a str.
docs = convert_files_to_docs(
    dir_path=doc_dir,
    clean_func=clean_wiki_text,
    split_paragraphs=True,
)

# `docs` is now a list of documents that can be written to the document store.
# If your texts come from a different source (e.g. a DB), you can skip
# convert_files_to_docs() and build the entries yourself.
# The default format here is:
# {
#    'content': "<DOCUMENT_TEXT_HERE>",
#    'meta': {'name': "<DOCUMENT_NAME_HERE>", ...}
# }
# (Optionally you can add more key-value pairs, which will be indexed as fields
# in Elasticsearch and can be used later for filtering or shown in the
# responses of the Pipeline.)

# Step 3: look at the first 3 entries.
print(docs[:3])

# Step 4: write the documents to the document store.
document_store.write_documents(docs)

[<Document: {'content': '\'\'\'Rickon Stark\'\'\' is a fictional character in the \'\'A Song of Ice and Fire\'\' series of fantasy novels by American author George R. R. Martin, and its television adaptation \'\'Game of Thrones\'\'.\nIntroduced in 1996\'s \'\'A Game of Thrones\'\', Rickon is the youngest child of Eddard Stark, the honorable lord of Winterfell, an ancient fortress in the North of the fictional kingdom of Westeros. He subsequently appeared in Martin\'s \'\'A Clash of Kings\'\' (1998). The \'\'Publishers Weekly\'\' review of \'\'A Game of Thrones\'\' noted, "It is fascinating to watch Martin\'s characters mature and grow, particularly Stark\'s children, who stand at the center of the book."\nRickon is portrayed by Irish actor Art Parkinson in the HBO television adaptation.', 'content_type': 'text', 'score': None, 'meta': {'name': '334_Rickon_Stark.txt'}, 'embedding': None, 'id': '85a40281eda602fffad9c5832328055d'}>, <Document: {'content': '\n== Character description ==\nR

In [None]:
from haystack.nodes import BM25Retriever

# Build a BM25Retriever on top of the document store that was filled earlier.
retriever = BM25Retriever(document_store=document_store)

In [None]:
from haystack.nodes import FARMReader

# The reader can be loaded from a local model or from any of the QA models
# on Hugging Face's model hub (https://huggingface.co/models).
reader = FARMReader(
    model_name_or_path="deepset/roberta-base-squad2",
    use_gpu=True,
)

In [None]:
from haystack.pipelines import ExtractiveQAPipeline

# Combine the reader and the retriever into a single extractive QA pipeline.
pipe = ExtractiveQAPipeline(reader=reader, retriever=retriever)

In [None]:
# Ask two questions. Each run passes top_k=5 to both the "Retriever" and the
# "Reader" node of the pipeline.
pred1 = pipe.run(
    query="Who is the father of Arya Stark?",
    params={
        "Retriever": {"top_k": 5},
        "Reader": {"top_k": 5},
    },
)

pred2 = pipe.run(
    query="Who plays Daenerys?",
    params={
        "Retriever": {"top_k": 5},
        "Reader": {"top_k": 5},
    },
)

Inferencing Samples: 100%|██████████| 1/1 [00:17<00:00, 17.53s/ Batches]
Inferencing Samples: 100%|██████████| 1/1 [00:26<00:00, 26.68s/ Batches]


In [None]:
from pprint import pprint

# Pretty-print both prediction results, in the order the questions were asked.
for prediction in (pred1, pred2):
    pprint(prediction)

{'answers': [<Answer {'answer': 'Ned', 'type': 'extractive', 'score': 0.9767239689826965, 'context': "\n====Season 1====\nArya accompanies her father Ned and her sister Sansa to King's Landing. Before their departure, Arya's half-brother Jon Snow gifts A", 'offsets_in_document': [{'start': 46, 'end': 49}], 'offsets_in_context': [{'start': 46, 'end': 49}], 'document_id': '180c2a6b36369712b361a80842e79356', 'meta': {'name': '43_Arya_Stark.txt'}}>,
             <Answer {'answer': 'Lord Eddard Stark', 'type': 'extractive', 'score': 0.8930399417877197, 'context': 'ark daughters.\nDuring the Tourney of the Hand to honour her father Lord Eddard Stark, Sansa Stark is enchanted by the knights performing in the event.', 'offsets_in_document': [{'start': 659, 'end': 676}], 'offsets_in_context': [{'start': 67, 'end': 84}], 'document_id': 'd1f36ec7170e4c46cde65787fe125dfe', 'meta': {'name': '332_Sansa_Stark.txt'}}>,
             <Answer {'answer': 'Joffrey', 'type': 'extractive', 'score': 0.6753826