### Environment setup

In [0]:
! pip install git+git://github.com/deepset-ai/haystack.git@92429a40e6176a3b0c822081f6511fc1c555fabf

In [0]:
import pandas as pd
import json
from google.colab import files
from timeit import default_timer as timer


from haystack import Finder
from haystack.indexing.cleaning import clean_wiki_text
from haystack.indexing.io import write_documents_to_db, fetch_archive_from_http
from haystack.reader.farm import FARMReader
from haystack.reader.transformers import TransformersReader
from haystack.utils import print_answers

Get Elasticsearch running

In [0]:
# In Colab / No Docker environments: Start Elasticsearch from source
! wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.6.2-linux-x86_64.tar.gz -q
! tar -xzf elasticsearch-7.6.2-linux-x86_64.tar.gz
! chown -R daemon:daemon elasticsearch-7.6.2

import os
from subprocess import Popen, PIPE, STDOUT
es_server = Popen(['elasticsearch-7.6.2/bin/elasticsearch'],
                   stdout=PIPE, stderr=STDOUT,
                   preexec_fn=lambda: os.setuid(1)  # as daemon
                  )
# wait until ES has started
! sleep 30

In [0]:
# Connect to Elasticsearch
from haystack.database.elasticsearch import ElasticsearchDocumentStore
document_store = ElasticsearchDocumentStore(host="localhost", username="", password="", index="document")

06/09/2020 00:06:33 - INFO - elasticsearch -   PUT http://localhost:9200/document [status:400 request:0.112s]


### Corpus upload

In [0]:
# Let's first get some documents that we want to query
# Must be a .zip of .txt files
files.upload()

Saving corpus.zip to corpus (1).zip


{'corpus.zip': b'PK\x03\x04\x14\x00\x08\x00\x08\x00-{\xc8P\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x10\x00\x10\x00Ex_Machina_3.txtUX\x0c\x00\xd5\xba\xde^\xd5\xba\xde^\xf5\x01\x14\x00eW\xcb\x8e\xdbF\x10\xbc\xfb+\xfa\xb8\x06\xb4:\xe4\x14\xf8\xb6~\xc4\xefu\x10\x1b\x0er\x1c\x92Mr\xa2\xe1\x0c3\x0fi\xe9\x93\x7f#@\x02\xe8[\xf4)\xfe\x92T\xf7P\xbbZ\xe4`\xefJ\xe4LwWWU\xf7\xde\xf4\x99#\x19\x8a\xdc\xb2\xcf\x14\x8b\xa7\xd0\x9f\x8ea\xcf\xd1-\xd7\xe32s\xb7\xa1\xa6\xe4\xd3\xb1\xb8l\'\x93\xd9-\xa7cg\x93\x99\xe7`}\xb6~8\x1dSk\xaf{K\xdc\xf7!\xe6\xb4\xa1\xde\xf8\x84k(\x8fL\x03\xfb\xc8\xd4[o\x9c[h4{\xa6\x14&\xce#N\xd2\x01\xef\x8fx%\xcb=\xc4w\xad\xcd\xdc\x91iB\xc9[\xba\r\xfe\xfe\xb4\x9b\x86\xc01Q\x1aCq\x1d\xcd14\xa6\xc1}8z\x7f,\x87\xb0\xa5_B\xd4\xb0\xbd\x8d)\x132f\xb2\x1e\xf5\xb9\x80\x00\xf2qC\x07\xaei\x18|\x1f\xb3\xedmk\x8d\xc3[(\xcdY\x04lk@:\xd8<\xca\xdb\xcem\x1e=\xdd>y\xf2\xea\x8e>\x9a\x16%\x98\ry`\xe5,\x82\x19\xdfQj#\xb3?D$\x14O\xc7\x1b\xc7w\xf4\xdaD\x87G?\xbe\xff\x9d\x048\x00\x9dC\x94\x88\x1d\x

In [0]:
!mkdir movie_reviews
!unzip corpus.zip -d ./movie_reviews

Get the corpus into the document store

In [0]:
# Now, let's write the docs to our DB.
doc_dir = "./movie_reviews"
write_documents_to_db(document_store=document_store, document_dir=doc_dir, clean_func=clean_wiki_text, only_empty_db=True, split_paragraphs=True)

06/09/2020 00:07:06 - INFO - elasticsearch -   POST http://localhost:9200/_count [status:200 request:0.152s]
06/09/2020 00:07:06 - INFO - elasticsearch -   POST http://localhost:9200/_bulk [status:200 request:0.321s]
06/09/2020 00:07:06 - INFO - haystack.indexing.io -   Wrote 60 docs to DB


### Build the Finder

In [0]:
from haystack.retriever.elasticsearch import ElasticsearchRetriever
retriever = ElasticsearchRetriever(document_store=document_store)

Choose a pre-trained model to use from https://huggingface.co/models

It must be a transformer-type model (one of the *BERTs), and ideally one trained with SQUAD

In [0]:
reader = FARMReader(model_name_or_path="distilbert-base-uncased-distilled-squad", use_gpu=False)

06/09/2020 02:07:43 - INFO - farm.utils -   device: cpu n_gpu: 0, distributed training: False, automatic mixed precision training: None
06/09/2020 02:07:43 - INFO - farm.infer -   Could not find `distilbert-base-uncased-distilled-squad` locally. Try to download from model hub ...
06/09/2020 02:07:43 - INFO - filelock -   Lock 140624249510376 acquired on /root/.cache/torch/transformers/e88f38f2c8bc669ef7873de68f36bf764d4f64b9833ca8401efe271aab476745.0f15800a5b4c30725c555e054e3d0262e9916635f0de9d397c30acd86c21dc73.lock


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=451.0, style=ProgressStyle(description_…

06/09/2020 02:07:43 - INFO - filelock -   Lock 140624249510376 released on /root/.cache/torch/transformers/e88f38f2c8bc669ef7873de68f36bf764d4f64b9833ca8401efe271aab476745.0f15800a5b4c30725c555e054e3d0262e9916635f0de9d397c30acd86c21dc73.lock
06/09/2020 02:07:44 - INFO - filelock -   Lock 140624249508136 acquired on /root/.cache/torch/transformers/1fb4b3980f6966dcb2c2e8a04794b70423fc470b65efcb692b8d796f3cae9e9e.f4565e3948d4331d7e0460adbcbdcac536e9886f24a2fad1190d6b53c231a3a3.lock





HBox(children=(FloatProgress(value=0.0, description='Downloading', max=265481570.0, style=ProgressStyle(descri…

06/09/2020 02:07:51 - INFO - filelock -   Lock 140624249508136 released on /root/.cache/torch/transformers/1fb4b3980f6966dcb2c2e8a04794b70423fc470b65efcb692b8d796f3cae9e9e.f4565e3948d4331d7e0460adbcbdcac536e9886f24a2fad1190d6b53c231a3a3.lock





	 We guess it's an *ENGLISH* model ... 
	 If not: Init the language model by supplying the 'language' param.
06/09/2020 02:07:56 - INFO - filelock -   Lock 140624323279840 acquired on /root/.cache/torch/transformers/9b3c03a36e83b13d5ba95ac965c9f9074a99e14340c523ab405703179e79fc46.542ce4285a40d23a559526243235df47c5f75c197f04f37d1a0c124c32c9a084.lock


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…

06/09/2020 02:07:56 - INFO - filelock -   Lock 140624323279840 released on /root/.cache/torch/transformers/9b3c03a36e83b13d5ba95ac965c9f9074a99e14340c523ab405703179e79fc46.542ce4285a40d23a559526243235df47c5f75c197f04f37d1a0c124c32c9a084.lock
06/09/2020 02:07:57 - INFO - farm.utils -   device: cpu n_gpu: 0, distributed training: False, automatic mixed precision training: None





In [0]:
finder = Finder(reader, retriever)

### Finished product!
Type your question and run the cell to get answers

In [0]:
#@title Ask a Question
Question = "Who falls in love with Marty McFly?" #@param {type: "string"}
#Detail = "High" #@param ["High", "Medium", "Minimal"]

Detail = "High"
if Detail == "High":
  desired_details = "all"
elif Detail == "Medium":
  desired_details = "medium"
elif Detail == "Minimal":
  desired_details = "minimal"


#search the text and print out the time it took
start = timer() 
prediction = finder.get_answers(question=Question, top_k_retriever=10, top_k_reader=5);
end = timer()

#print("\n\nTime to find answer: {:.4f}\n".format(end-start))
#print_answers(prediction, details=desired_details)


print("\nTime to find answer: {:.4f} seconds".format(end-start))

df = pd.DataFrame(columns = ['Answer', 'Doc', 'Context', 'Probability', 'Score'])
for answer in prediction['answers']:
  df = df.append({
                  'Answer': answer['answer'],
                  'Doc': answer['meta']['name'],
                  'Context': answer['context'],
                  'Probability': answer['probability'],
                  'Score': answer['score']
                }, 
                 sort=False,
                 ignore_index=True)
display(df)

06/09/2020 02:13:28 - INFO - elasticsearch -   POST http://localhost:9200/document/_search [status:200 request:0.014s]
06/09/2020 02:13:28 - INFO - haystack.retriever.elasticsearch -   Got 10 candidates from retriever
06/09/2020 02:13:28 - INFO - haystack.finder -   Reader is looking for detailed answer in 32305 chars ...



Time to find answer: 22.6067 seconds


Unnamed: 0,Answer,Doc,Context,Probability,Score
0,Deckard,Blade_Runner_2.txt,"an blond beast, Roy Batty (the wonderful Rutger Hauer). Along the way, Decka...",0.906715,18.193323
1,Roy,Blade_Runner_1.txt,Deckard and between the pleasure model Priss (Daryl Hannah) and her lover Ro...,0.855041,14.197556
2,Michael J. Fox,Back_Future_3.txt,"rty McFly’s son, Marty Jr. (conveniently played by McFly “himself,” Michael ...",0.843301,13.463987
3,Victoria,Cats_1.txt,tening inasmuch as the cast is remarkable. A shy and refined cat named Victo...,0.827076,12.52037
4,Harrison Ford,Blade_Runner_1.txt,"When ""Blade Runner"" premiered in 1982, Harrison Ford disparagingly quipped, ...",0.822657,12.275634
