In [35]:
from haystack.utils import clean_wiki_text, convert_files_to_dicts, fetch_archive_from_http, print_answers
from haystack.nodes import FARMReader, TransformersReader
from haystack.document_stores import FAISSDocumentStore
from haystack.nodes import DensePassageRetriever
from haystack.pipelines import ExtractiveQAPipeline

In [24]:
# Vector store that holds the documents and their embeddings.
# The "Flat" factory string picks the FAISS index type
# (presumably an exact, uncompressed index — confirm in the FAISS docs).
document_store = FAISSDocumentStore(
    faiss_index_factory_str="Flat",
)

In [None]:
# Local directory holding the Game of Thrones wiki articles. The conversion
# cell below reads `doc_dir`, so it has to be defined on a fresh kernel
# (leaving this cell commented out made Restart & Run All fail with NameError).
doc_dir = "Data/article_txt_got"
s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt.zip"

# Download and unpack the archive into doc_dir.
# NOTE(review): fetch_archive_from_http is expected to skip the download when
# doc_dir already contains files — confirm against the installed Haystack version.
fetch_archive_from_http(url=s3_url, output_dir=doc_dir)

In [None]:
# Convert the files under doc_dir into document dicts, applying
# clean_wiki_text to each one and splitting the text into paragraphs.
dicts = convert_files_to_dicts(
    dir_path=doc_dir,
    clean_func=clean_wiki_text,
    split_paragraphs=True,
)

# Persist the converted documents in the document store.
document_store.write_documents(dicts)

In [None]:
# Dense Passage Retriever with separate pretrained encoders for the
# question side and the passage side, bound to the document store above.
retriever = DensePassageRetriever(
    document_store=document_store,
    query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
    passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
    max_seq_len_query=64,
    max_seq_len_passage=256,
    batch_size=16,
    use_gpu=True,
    embed_title=True,
    use_fast_tokenizers=True,
)

# Recompute the stored documents' embeddings using this retriever.
document_store.update_embeddings(retriever)

In [None]:
# Reader component loaded from the "deepset/roberta-base-squad2" checkpoint.
reader = FARMReader(
    model_name_or_path="deepset/roberta-base-squad2",
    use_gpu=True,
)

In [36]:
# Combine the reader and the retriever into a single extractive QA pipeline.
pipeline = ExtractiveQAPipeline(reader=reader, retriever=retriever)

In [37]:
# Run one question through the pipeline, passing top_k=10 to the
# "Retriever" node and top_k=2 to the "Reader" node.
prediction = pipeline.run(
    query="Who created the Dothraki vocabulary?",
    params={
        "Retriever": {"top_k": 10},
        "Reader": {"top_k": 2},
    },
)

  start_indices = flat_sorted_indices // max_seq_len
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00,  1.58 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00,  4.86 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00,  4.83 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00,  4.99 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00,  4.84 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00,  4.77 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00,  4.89 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00,  5.01 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00,  4.95 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00,  4.97 Batches/s]


In [39]:
# Print the prediction's answers using the "minimum" detail level.
print_answers(prediction, details="minimum")


Query: Who created the Dothraki vocabulary?
Answers:
[   {   'answer': 'David J. Peterson',
        'context': 'orld. The language was developed for the TV series by the '
                   'linguist David J. Peterson, working off the Dothraki words '
                   "and phrases in Martin's novels.\n"
                   ','},
    {   'answer': 'David J. Peterson',
        'context': '\n'
                   '===Valyrian===\n'
                   'David J. Peterson, who created the Dothraki language for '
                   'the first season of the show, was entrusted by the '
                   'producers to design a new '}]
