In [20]:
from haystack.reader import FARMReader
from haystack.utils import launch_es, print_answers, launch_milvus
from pprint import pprint


#TODO DRAW AND SHOW PIPELINES

In [5]:
from haystack.preprocessor.utils import fetch_archive_from_http, convert_files_to_dicts
from haystack.preprocessor.cleaning import clean_wiki_text

#Download and prepare data - 517 Wikipedia articles for Game of Thrones
doc_dir = "data/article_txt_got"
s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt.zip"
fetch_archive_from_http(url=s3_url, output_dir=doc_dir)

# convert files to dicts containing documents that can be indexed to our datastore
got_dicts = convert_files_to_dicts(dir_path=doc_dir, clean_func=clean_wiki_text, split_paragraphs=True)

04/19/2021 16:06:01 - INFO - haystack.preprocessor.utils -   Found data stored in `data/article_txt_got`. Delete this first if you really want to fetch new data.
04/19/2021 16:06:01 - INFO - haystack.preprocessor.utils -   Converting data/article_txt_got/145_Elio_M._García_Jr._and_Linda_Antonsson.txt
04/19/2021 16:06:01 - INFO - haystack.preprocessor.utils -   Converting data/article_txt_got/401_Power_Is_Power.txt
04/19/2021 16:06:01 - INFO - haystack.preprocessor.utils -   Converting data/article_txt_got/368_Jaime_Lannister.txt
04/19/2021 16:06:01 - INFO - haystack.preprocessor.utils -   Converting data/article_txt_got/349_List_of_Game_of_Thrones_characters.txt
04/19/2021 16:06:01 - INFO - haystack.preprocessor.utils -   Converting data/article_txt_got/39_Renly_Baratheon.txt
04/19/2021 16:06:01 - INFO - haystack.preprocessor.utils -   Converting data/article_txt_got/330_Oberyn_Martell.txt
04/19/2021 16:06:01 - INFO - haystack.preprocessor.utils -   Converting data/article_txt_got/79_T

In [27]:
from haystack import Pipeline
from haystack.document_store import ElasticsearchDocumentStore
from haystack.retriever.sparse import ElasticsearchRetriever

#Init core components

launch_es()
document_store = ElasticsearchDocumentStore()
document_store.delete_all_documents()
document_store.write_documents(got_dicts)

es_retriever = ElasticsearchRetriever(document_store=document_store)

reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2")

04/19/2021 18:02:24 - INFO - haystack.utils -   Starting Elasticsearch ...
04/19/2021 18:02:24 - INFO - elasticsearch -   HEAD http://localhost:9200/ [status:200 request:0.004s]
04/19/2021 18:02:24 - INFO - elasticsearch -   HEAD http://localhost:9200/document [status:200 request:0.002s]
04/19/2021 18:02:24 - INFO - elasticsearch -   GET http://localhost:9200/document [status:200 request:0.002s]
04/19/2021 18:02:24 - INFO - elasticsearch -   PUT http://localhost:9200/document/_mapping [status:200 request:0.006s]
04/19/2021 18:02:24 - INFO - elasticsearch -   HEAD http://localhost:9200/label [status:200 request:0.002s]
04/19/2021 18:02:26 - INFO - elasticsearch -   POST http://localhost:9200/document/_delete_by_query [status:200 request:1.347s]
04/19/2021 18:02:28 - INFO - elasticsearch -   POST http://localhost:9200/_bulk?refresh=wait_for [status:200 request:0.313s]
04/19/2021 18:02:29 - INFO - elasticsearch -   POST http://localhost:9200/_bulk?refresh=wait_for [status:200 request:1.01

In [12]:
from haystack.pipeline import ExtractiveQAPipeline

# Premade pipeline

p_extractive_premade = ExtractiveQAPipeline(reader=reader, retriever=es_retriever)
res = p_extractive_premade.run(query="Who is the father of Arya Stark?", top_k_retriever=10, top_k_reader=5)
p_extractive_premade.draw()

print_answers(res, details="minimal")

04/19/2021 16:08:44 - INFO - elasticsearch -   POST http://localhost:9200/document/_search [status:200 request:0.026s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00, 44.06 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00, 52.38 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00, 46.19 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00, 49.12 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00, 22.93 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00, 22.69 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00, 30.63 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00, 31.57 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00, 32.15 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00, 32.45 Batches/s]

[   {   'answer': 'Lord Eddard Stark',
        'context': 'ark daughters.\n'
                   'During the Tourney of the Hand to honour her father Lord '
                   'Eddard Stark, Sansa Stark is enchanted by the knights '
                   'performing in the event.'},
    {   'answer': 'Lord Eddard Stark',
        'context': 'ark daughters.\n'
                   'During the Tourney of the Hand to honour her father Lord '
                   'Eddard Stark, Sansa Stark is enchanted by the knights '
                   'performing in the event.'},
    {   'answer': 'Ned',
        'context': '\n'
                   '====Season 1====\n'
                   'Arya accompanies her father Ned and her sister Sansa to '
                   "King's Landing. Before their departure, Arya's "
                   'half-brother Jon Snow gifts A'},
    {   'answer': 'Ned',
        'context': '\n'
                   '====Season 1====\n'
                   'Arya accompanies her father Ned and her si




In [18]:
# Construct basic Retriever-Reader pipeline - same as above

p_extractive = Pipeline()

p_extractive.add_node(component=es_retriever, name="Retriever", inputs=["Query"])
p_extractive.add_node(component=reader, name="Reader", inputs=["Retriever"])
res = p_extractive.run(query="Who is the father of Arya Stark??", top_k_retriever=10, top_k_reader=5)

print_answers(res, details="minimal")


# Can Milvus run on GCP?
# Indexing vs querying pipeline???
# Yaml???

04/19/2021 16:12:50 - INFO - elasticsearch -   POST http://localhost:9200/document/_search [status:200 request:0.017s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00, 29.35 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00, 48.11 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00, 43.18 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00, 55.98 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00, 23.31 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00, 22.96 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00, 33.32 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00, 31.94 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00, 32.01 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00, 32.50 Batches/s]

[   {   'answer': 'Lord Eddard Stark',
        'context': 'ark daughters.\n'
                   'During the Tourney of the Hand to honour her father Lord '
                   'Eddard Stark, Sansa Stark is enchanted by the knights '
                   'performing in the event.'},
    {   'answer': 'Lord Eddard Stark',
        'context': 'ark daughters.\n'
                   'During the Tourney of the Hand to honour her father Lord '
                   'Eddard Stark, Sansa Stark is enchanted by the knights '
                   'performing in the event.'},
    {   'answer': 'Ned',
        'context': '\n'
                   '====Season 1====\n'
                   'Arya accompanies her father Ned and her sister Sansa to '
                   "King's Landing. Before their departure, Arya's "
                   'half-brother Jon Snow gifts A'},
    {   'answer': 'Ned',
        'context': '\n'
                   '====Season 1====\n'
                   'Arya accompanies her father Ned and her si




In [24]:
# Retriever only

p_retrieval = Pipeline()

p_retrieval.add_node(component=es_retriever, name="Retriever", inputs=["Query"])
res = p_retrieval.run(query="Who is the father of Arya Stark?", top_k_retriever=10)
pprint(res)

print(dir(res["documents"][0]))
# Output are Document objects which contain text, id, probability, score

04/19/2021 16:15:56 - INFO - elasticsearch -   POST http://localhost:9200/document/_search [status:200 request:0.012s]


{'documents': [{'text': "\n===In the Riverlands===\nThe Stark army reaches the Twins, a bridge stronghold controlled by Walder Frey, who agrees to allow the army to cross the river and to commit his troops in return for Robb and Arya Stark marrying two of his children.\nTyrion Lannister suspects his father Tywin, who decides Tyrion and his barbarians will fight in the vanguard, wants him killed. As Tyrion, Bronn, and the prostitute Shae swap stories, Tyrion reveals he was married to a woman his father revealed was a prostitute, and made Tyrion watch as his guardsmen raped her.\nAs a Stark force approaches, Tyrion is trampled in the rush and regains consciousness to find the battle over. Tywin discovers the Stark host was only 2,000 men, not the 20,000 he was led to expect.\nRobb, having divided his forces, defeats Jaime Lannister's army with his remaining 18,000 men and captures Jaime.", 'id': '57a07931-60e2-4ee6-870a-dbbab36af336', 'score': 11.65971, 'probability': 0.8111444554282315,

In [29]:
from haystack.retriever.dense import DensePassageRetriever
from haystack.pipeline import JoinDocuments

# Ensemble
# Use JoinDocuments to merge the outputs of the two retrievers. Makes sure it can be used by reader?

dpr_retriever = DensePassageRetriever(document_store)
document_store.update_embeddings(dpr_retriever, update_existing_embeddings=False)

p_ensemble = Pipeline()
p_ensemble.add_node(component=es_retriever, name="ESRetriever", inputs=["Query"])
p_ensemble.add_node(component=dpr_retriever, name="DPRRetriever", inputs=["Query"])
p_ensemble.add_node(component=JoinDocuments(join_mode="concatenate"), name="JoinResults", inputs=["ESRetriever", "DPRRetriever"])
# p_ensemble.add_node(component=reader, name="QAReader", inputs=["JoinResults"])
res = p_ensemble.run(query="Who is the father of Arya Stark?", top_k_retriever=5) #This is top_k per retriever

pprint(res)

04/19/2021 18:04:40 - INFO - elasticsearch -   POST http://localhost:9200/document/_count [status:200 request:0.127s]
04/19/2021 18:04:40 - INFO - haystack.document_store.elasticsearch -   Updating embeddings for all 2497 docs ...
04/19/2021 18:04:41 - INFO - elasticsearch -   POST http://localhost:9200/document/_search?scroll=1d&size=10000 [status:200 request:0.597s]
04/19/2021 18:04:42 - INFO - elasticsearch -   POST http://localhost:9200/_search/scroll [status:200 request:0.013s]
04/19/2021 18:04:42 - INFO - elasticsearch -   DELETE http://localhost:9200/_search/scroll [status:200 request:0.003s]
Creating Embeddings: 100%|██████████| 157/157 [00:28<00:00,  5.44 Batches/s]
04/19/2021 18:05:15 - INFO - elasticsearch -   POST http://localhost:9200/_bulk?refresh=wait_for [status:200 request:0.857s]
04/19/2021 18:05:16 - INFO - elasticsearch -   POST http://localhost:9200/_bulk?refresh=wait_for [status:200 request:0.693s]
04/19/2021 18:05:17 - INFO - elasticsearch -   POST http://localho

In [35]:
del reader
del p_extractive
del p_extractive_premade
del p_retrieval
del p_ensemble


NameError: name 'p_ensemble' is not defined

In [34]:
from haystack.summarizer import TransformersSummarizer

#Summarize
summarizer = TransformersSummarizer(model_name_or_path="google/pegasus-xsum")

p_summarizer = Pipeline()
p_summarizer.add_node(component=es_retriever, name="Retriever", inputs=["Query"])
p_summarizer.add_node(component=summarizer, name="Summarizer", inputs=["Retriever"])
res = p_summarizer.run(query="Who is the father of Arya Stark??", top_k_retriever=10)

pprint(res)

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-xsum and are newly initialized: ['lm_head.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RuntimeError: CUDA out of memory. Tried to allocate 376.00 MiB (GPU 0; 7.93 GiB total capacity; 6.73 GiB already allocated; 84.38 MiB free; 6.92 GiB reserved in total by PyTorch)

In [None]:
# Generator?

In [None]:
#Translate?

In [None]:
# Multi end point?
