In [11]:
import numpy as np
import pandas as pd

from haystack import Finder
from haystack.preprocessor.cleaning import clean_wiki_text
from haystack.preprocessor.utils import convert_files_to_dicts, fetch_archive_from_http
from haystack.reader.farm import FARMReader
from haystack.reader.transformers import TransformersReader
from haystack.utils import print_answers
from haystack.document_store.elasticsearch import ElasticsearchDocumentStore
from haystack.retriever.sparse import ElasticsearchRetriever

In [3]:
# Connect to the Elasticsearch instance on localhost (no credentials supplied);
# documents are stored in the "document" index.
document_store = ElasticsearchDocumentStore(
    host="localhost",
    username="",
    password="",
    index="document",
)

10/11/2020 16:19:34 - INFO - elasticsearch -   PUT http://localhost:9200/document [status:200 request:0.427s]
10/11/2020 16:19:34 - INFO - elasticsearch -   PUT http://localhost:9200/label [status:200 request:0.148s]


In [4]:
# Source archive with the Game of Thrones wiki articles, and the local
# directory it is fetched into.
s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt.zip"
doc_dir = "../data/article_txt_got"

# Kept as the last expression so the cell still displays the call's return value.
fetch_archive_from_http(url=s3_url, output_dir=doc_dir)

10/11/2020 16:20:26 - INFO - haystack.preprocessor.utils -   Fetching from https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt.zip to `../data/article_txt_got`
100%|██████████| 1095120/1095120 [00:02<00:00, 543083.90B/s]


True

In [7]:
# Turn every file under doc_dir into document dicts, cleaning the text with
# clean_wiki_text. Each resulting dict carries the text under 'text' and the
# originating file name under 'meta' -> 'name'.
dicts = convert_files_to_dicts(
    dir_path=doc_dir,
    clean_func=clean_wiki_text,
    split_paragraphs=True,
)

10/11/2020 16:26:41 - INFO - haystack.preprocessor.utils -   Converting ../data/article_txt_got/145_Elio_M._García_Jr._and_Linda_Antonsson.txt
10/11/2020 16:26:41 - INFO - haystack.preprocessor.utils -   Converting ../data/article_txt_got/368_Jaime_Lannister.txt
10/11/2020 16:26:41 - INFO - haystack.preprocessor.utils -   Converting ../data/article_txt_got/133_Game_of_Thrones__Season_5__soundtrack_.txt
10/11/2020 16:26:41 - INFO - haystack.preprocessor.utils -   Converting ../data/article_txt_got/515_The_Door__Game_of_Thrones_.txt
10/11/2020 16:26:41 - INFO - haystack.preprocessor.utils -   Converting ../data/article_txt_got/119_Walk_of_Punishment.txt
10/11/2020 16:26:41 - INFO - haystack.preprocessor.utils -   Converting ../data/article_txt_got/369_Samwell_Tarly.txt
10/11/2020 16:26:41 - INFO - haystack.preprocessor.utils -   Converting ../data/article_txt_got/356_Tales_of_Dunk_and_Egg.txt
10/11/2020 16:26:41 - INFO - haystack.preprocessor.utils -   Converting ../data/article_txt_got/

In [8]:
# Sanity check: how many document dicts the conversion step produced.
len(dicts)

2497

In [20]:
# Inspect one converted document to see its structure ('text' plus 'meta').
print(dicts[0])

{'text': "Linda Antonsson and Elio García at Archipelacon on June 28, 2015.\n'''Elio Miguel García Jr.''' (born May 6, 1978) and '''Linda Maria Antonsson''' (born November 18, 1974) are authors known for their contributions and expertise in the ''A Song of Ice and Fire'' series by George R. R. Martin, co-writing in 2014 with Martin ''The World of Ice & Fire'', a companion book for the series. They are also the founders of the fansite Westeros.org, one of the earliest fan websites for ''A Song of Ice and Fire''.", 'meta': {'name': '145_Elio_M._García_Jr._and_Linda_Antonsson.txt'}}


In [10]:
# Write the converted document dicts into the Elasticsearch document store.
# NOTE(review): re-running this cell presumably indexes the same documents a
# second time — confirm whether the index should be cleared before a re-run.
document_store.write_documents(dicts)

10/11/2020 16:27:22 - INFO - elasticsearch -   POST http://localhost:9200/_bulk?refresh=wait_for [status:200 request:1.057s]
10/11/2020 16:27:23 - INFO - elasticsearch -   POST http://localhost:9200/_bulk?refresh=wait_for [status:200 request:1.083s]
10/11/2020 16:27:25 - INFO - elasticsearch -   POST http://localhost:9200/_bulk?refresh=wait_for [status:200 request:1.037s]
10/11/2020 16:27:26 - INFO - elasticsearch -   POST http://localhost:9200/_bulk?refresh=wait_for [status:200 request:1.019s]
10/11/2020 16:27:27 - INFO - elasticsearch -   POST http://localhost:9200/_bulk?refresh=wait_for [status:200 request:1.037s]


In [12]:
# Retriever that queries the Elasticsearch document store for candidate
# documents to hand to the reader.
retriever = ElasticsearchRetriever(document_store=document_store)

In [13]:
# Reader built on the "deepset/roberta-base-squad2" model, running on CPU
# (use_gpu=False). The model is downloaded from the model hub when it is not
# available locally.
# NOTE(review): the saved output of this cell shows ForkPoolWorker tracebacks
# from the inference worker pool; the cause is not visible here — investigate
# before relying on parallel inference.
reader = FARMReader(
    model_name_or_path="deepset/roberta-base-squad2",
    use_gpu=False,
)

10/11/2020 16:28:33 - INFO - farm.utils -   device: cpu n_gpu: 0, distributed training: False, automatic mixed precision training: None
10/11/2020 16:28:33 - INFO - farm.infer -   Could not find `deepset/roberta-base-squad2` locally. Try to download from model hub ...
10/11/2020 16:28:33 - INFO - filelock -   Lock 140402708652496 acquired on /Users/j.zhang/.cache/torch/transformers/f7d4b9379a9c487fa03ccf3d8e00058faa9d664cf01fc03409138246f48760da.c6288e0f84ec797ba5c525c923a5bbc479b47c761aded9734a5f6a473b044c8d.lock


HBox(children=(IntProgress(value=0, description='Downloading', max=559, style=ProgressStyle(description_width=…

10/11/2020 16:28:33 - INFO - filelock -   Lock 140402708652496 released on /Users/j.zhang/.cache/torch/transformers/f7d4b9379a9c487fa03ccf3d8e00058faa9d664cf01fc03409138246f48760da.c6288e0f84ec797ba5c525c923a5bbc479b47c761aded9734a5f6a473b044c8d.lock





10/11/2020 16:28:34 - INFO - filelock -   Lock 140402708607056 acquired on /Users/j.zhang/.cache/torch/transformers/8c0c8b6371111ac5fbc176aefcf9dbe129db7be654c569b8375dd3712fc4dc67.d045adc91e17ecdf7dc3eeff4c875df94bdf2eb749d72b3ae47ae93f8e85213c.lock


HBox(children=(IntProgress(value=0, description='Downloading', max=498637366, style=ProgressStyle(description_…

10/11/2020 16:29:33 - INFO - filelock -   Lock 140402708607056 released on /Users/j.zhang/.cache/torch/transformers/8c0c8b6371111ac5fbc176aefcf9dbe129db7be654c569b8375dd3712fc4dc67.d045adc91e17ecdf7dc3eeff4c875df94bdf2eb749d72b3ae47ae93f8e85213c.lock





	 We guess it's an *ENGLISH* model ... 
	 If not: Init the language model by supplying the 'language' param.
10/11/2020 16:29:43 - INFO - filelock -   Lock 140402719683024 acquired on /Users/j.zhang/.cache/torch/transformers/1e3af82648d7190d959a9d76d727ef629b1ca51b3da6ad04039122453cb56307.6a4061e8fc00057d21d80413635a86fdcf55b6e7594ad9e25257d2f99a02f4be.lock


HBox(children=(IntProgress(value=0, description='Downloading', max=898822, style=ProgressStyle(description_wid…

10/11/2020 16:29:44 - INFO - filelock -   Lock 140402719683024 released on /Users/j.zhang/.cache/torch/transformers/1e3af82648d7190d959a9d76d727ef629b1ca51b3da6ad04039122453cb56307.6a4061e8fc00057d21d80413635a86fdcf55b6e7594ad9e25257d2f99a02f4be.lock





10/11/2020 16:29:44 - INFO - filelock -   Lock 140402719908816 acquired on /Users/j.zhang/.cache/torch/transformers/b901c69e8e7da4a24c635ad81d016d274f174261f4f5c144e43f4b00e242c3b0.70bec105b4158ed9a1747fea67a43f5dee97855c64d62b6ec3742f4cfdb5feda.lock


HBox(children=(IntProgress(value=0, description='Downloading', max=456318, style=ProgressStyle(description_wid…

10/11/2020 16:29:45 - INFO - filelock -   Lock 140402719908816 released on /Users/j.zhang/.cache/torch/transformers/b901c69e8e7da4a24c635ad81d016d274f174261f4f5c144e43f4b00e242c3b0.70bec105b4158ed9a1747fea67a43f5dee97855c64d62b6ec3742f4cfdb5feda.lock





10/11/2020 16:29:46 - INFO - filelock -   Lock 140402720059856 acquired on /Users/j.zhang/.cache/torch/transformers/2d9b03b59a8af464bf4238025a3cf0e5a340b9d0ba77400011e23c130b452510.16f949018cf247a2ea7465a74ca9a292212875e5fd72f969e0807011e7f192e4.lock


HBox(children=(IntProgress(value=0, description='Downloading', max=150, style=ProgressStyle(description_width=…

10/11/2020 16:29:47 - INFO - filelock -   Lock 140402720059856 released on /Users/j.zhang/.cache/torch/transformers/2d9b03b59a8af464bf4238025a3cf0e5a340b9d0ba77400011e23c130b452510.16f949018cf247a2ea7465a74ca9a292212875e5fd72f969e0807011e7f192e4.lock





10/11/2020 16:29:47 - INFO - filelock -   Lock 140402720675344 acquired on /Users/j.zhang/.cache/torch/transformers/507984f2e28c7dfed5db9a20acd68beb969c7f2833abc9e582e967fa0291f3dc.100c88dbe27dbd73822c575274ade4eb2427596ac56e96769249b7512341654d.lock


HBox(children=(IntProgress(value=0, description='Downloading', max=189, style=ProgressStyle(description_width=…

10/11/2020 16:29:47 - INFO - filelock -   Lock 140402720675344 released on /Users/j.zhang/.cache/torch/transformers/507984f2e28c7dfed5db9a20acd68beb969c7f2833abc9e582e967fa0291f3dc.100c88dbe27dbd73822c575274ade4eb2427596ac56e96769249b7512341654d.lock





10/11/2020 16:29:48 - INFO - farm.utils -   device: cpu n_gpu: 0, distributed training: False, automatic mixed precision training: None
10/11/2020 16:29:48 - INFO - farm.infer -   Got ya 7 parallel workers to do inference ...
10/11/2020 16:29:48 - INFO - farm.infer -    0    0    0    0    0    0    0 
10/11/2020 16:29:48 - INFO - farm.infer -   /w\  /w\  /w\  /w\  /w\  /w\  /w\
10/11/2020 16:29:48 - INFO - farm.infer -   /'\  / \  /'\  /'\  / \  / \  /'\
10/11/2020 16:29:48 - INFO - farm.infer -               
Process ForkPoolWorker-4:
Process ForkPoolWorker-6:
Process ForkPoolWorker-1:
Process ForkPoolWorker-5:
Process ForkPoolWorker-2:
Process ForkPoolWorker-3:
Process ForkPoolWorker-7:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
 

In [14]:
# Combine the reader and the retriever into one object that answers questions.
finder = Finder(reader, retriever)

In [15]:
# Run one question through the pipeline. With top_k_retriever=10 the retriever
# supplies 10 candidate documents to the reader; top_k_reader=5 presumably
# limits how many answers come back — confirm against the Finder API.
prediction = finder.get_answers(
    question="Who is the father of Arya Stark?",
    top_k_retriever=10,
    top_k_reader=5,
)

10/11/2020 16:30:19 - INFO - elasticsearch -   POST http://localhost:9200/document/_search [status:200 request:0.194s]
10/11/2020 16:30:19 - INFO - haystack.retriever.sparse -   Got 10 candidates from retriever
10/11/2020 16:30:19 - INFO - haystack.finder -   Reader is looking for detailed answer in 12544 chars ...
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00,  2.78 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00,  3.97 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00,  1.32 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00,  1.84 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00,  2.02 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00,  2.94 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00,  4.02 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:01<00:00,  1.59s/ Batches]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00,  1.58 Batches/s]
Inferencing Samples: 100%|

In [18]:
# Show the predicted answers; with details="minimal" each entry lists the
# answer string and the context snippet it was taken from.
print_answers(prediction, details="minimal")

[   {   'answer': 'Lord Eddard Stark',
        'context': 'ark daughters.\n'
                   'During the Tourney of the Hand to honour her father Lord '
                   'Eddard Stark, Sansa Stark is enchanted by the knights '
                   'performing in the event.'},
    {   'answer': 'Eddard',
        'context': 's Nymeria after a legendary warrior queen. She travels '
                   "with her father, Eddard, to King's Landing when he is made "
                   'Hand of the King. Before she leaves,'},
    {   'answer': 'Tywin',
        'context': 'Stark marrying two of his children.\n'
                   'Tyrion Lannister suspects his father Tywin, who decides '
                   'Tyrion and his barbarians will fight in the vanguard, '
                   'want'},
    {   'answer': 'Yoren',
        'context': " Baelor the Blessed. Ned notices Arya and alerts Night's "
                   'Watch recruiter Yoren. Before Sansa, Cersei Lannister, '
                   'Jof