On Windows:

    pip install farm-haystack==0.7.0 -f https://download.pytorch.org/whl/torch_stable.html
On any other platform:

    pip install farm-haystack==0.7.0

# To install Elasticsearch on Docker

In [None]:
#docker network create es-stack-network
#docker run -d --name elasticsearchdb --net es-stack-network -p 9200:9200 -p 9300:9300 -e "discovery.type=single-node" elasticsearch:7.11.2
#docker run -d --name kibana-es-ui --net es-stack-network -e "ELASTICSEARCH_URL=http://elasticsearchdb:9200"  -p 5601:5601 kibana:7.11.2

In [None]:
import json

# Load the SQuAD dev split; later cells iterate over `squad` and read each
# sample's 'context' field.
# The encoding is passed explicitly because JSON files are UTF-8, while the
# platform default used by open() (e.g. cp1252 on Windows) can fail on or
# mis-decode non-ASCII characters.
with open(r'data/squad/dev.json', 'r', encoding='utf-8') as f:
    squad = json.load(f)

In [None]:
# import os
# os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"


In [None]:
from haystack.document_store.elasticsearch import ElasticsearchDocumentStore

# Connect to the Elasticsearch node on localhost (empty credentials) and use
# the 'squad_docs' index for the SQuAD context passages.
document_store = ElasticsearchDocumentStore(
    host='localhost',
    username='',
    password='',
    index='squad_docs',
)


In [None]:
# url = "https://rajpurkar.github.io/SQuAD-explorer/dataset/"
# files = ["train-v2.0.json", "dev-v2.0.json"]
# squad_dir = "./data/squad"

In [None]:
# import os
# import json
# import requests


In [None]:
# if not os.path.exists(squad_dir):
#     os.makedirs(squad_dir)

In [None]:
# for file in files:
#     res = requests.get(url+file)
#     with open(os.path.join(squad_dir, file) ,"wb") as fp:
#         for chunk in res.iter_content(chunk_size=40):
#             fp.write(chunk)

In [None]:
# Wrap each sample's context passage in the {'text': ...} dict layout that is
# handed to document_store.write_documents() further down.
squad_docs = [{'text': entry['context']} for entry in squad]

In [None]:
import requests


In [None]:
# Query the health endpoint of the local Elasticsearch node.
# The timeout keeps the cell from blocking indefinitely when the node accepts
# the connection but never answers.
res = requests.get('http://localhost:9200/_cluster/health', timeout=10)

res.json()

In [None]:
# Index the context dicts built above into the Elasticsearch document store.
document_store.write_documents(squad_docs)


In [None]:
from haystack.retriever.sparse import TfidfRetriever

# Create a TF-IDF retriever that searches the documents held in document_store.
retriever = TfidfRetriever(document_store)

In [None]:
# Number of samples loaded from the SQuAD file.
len(squad)


In [None]:
# Free-text query reused by the retrieval experiments in the following cells.
query = "Physics is a very abstract subject"

# Ask the current retriever for the documents matching the query.
retriever.retrieve(query)

In [None]:
# Delete every document in the squad_docs index (match_all query).
# refresh=true makes Elasticsearch refresh the affected shards once the request
# completes, so the deletions are already visible to the count and search
# requests issued right afterwards. The timeout bounds how long the cell may
# block on an unresponsive node.
res = requests.post(
    'http://localhost:9200/squad_docs/_delete_by_query',
    params={'refresh': 'true'},
    json={'query': {'match_all': {}}},
    timeout=60,
)

res.json()

In [None]:
# Ask Elasticsearch how many documents the squad_docs index currently holds.
res = requests.get('http://localhost:9200/squad_docs/_count')

res.json()

In [None]:
# Pull the raw context strings out of the samples; de-duplication needs plain
# (hashable) strings rather than the {'text': ...} dicts.
contexts = [sample['context'] for sample in squad]

# Remove duplicates while keeping first-seen order. dict.fromkeys is used
# instead of set() because the iteration order of a set of strings changes
# between interpreter runs (hash randomization), which would make the order
# of the indexed documents non-reproducible.
contexts = list(dict.fromkeys(contexts))

# Convert back to the dictionary format write_documents() is given.
squad_docs = [{'text': context} for context in contexts]

In [None]:
# Index the de-duplicated context dicts.
document_store.write_documents(squad_docs)


In [None]:
# Create a fresh TF-IDF retriever now that the index contents have changed.
# NOTE(review): presumably needed because the retriever reads the documents
# when it is constructed — confirm.
retriever = TfidfRetriever(document_store)


In [None]:
# Repeat the earlier query against the re-indexed documents.
retriever.retrieve(query)


In [None]:
# import BM25 retriever
from haystack.retriever.sparse import ElasticsearchRetriever

# initialize it on top of the Elasticsearch document store
retriever = ElasticsearchRetriever(document_store)

# and run the same query as before
retriever.retrieve(query)

# FAISS

In [None]:
# Show the GPUs visible to this runtime (shell command run through IPython's `!`).
!nvidia-smi

In [None]:
# Install the latest master of Haystack
!pip install farm-haystack==1.6.0 --quiet
!pip install 'farm-haystack[faiss]' --quiet
!pip freeze  | grep farm-haystack

In [None]:
import logging

# Root logger: WARNING and above, formatted as "<level> - <logger> -  <message>".
LOG_FORMAT = "%(levelname)s - %(name)s -  %(message)s"
logging.basicConfig(level=logging.WARNING, format=LOG_FORMAT)

# Let the "haystack" logger additionally emit INFO messages.
haystack_logger = logging.getLogger("haystack")
haystack_logger.setLevel(logging.INFO)

In [None]:
import os

# Directory used for the FAISS index file and the SQLite database.
path = 'models/faiss'

# exist_ok=True creates the directory (and parents) only when it is missing,
# without a separate existence check that could race with another process.
os.makedirs(path, exist_ok=True)

In [None]:
from haystack.document_stores import FAISSDocumentStore

# initialize FAISS
# - faiss_index_factory_str: index factory string, here 'Flat'
# - sql_url: SQLite database file located under `path`
# - return_embedding=True: NOTE(review) presumably makes returned documents
#   carry their embedding vectors — confirm against the Haystack docs
document_store = FAISSDocumentStore(
    faiss_index_factory_str='Flat',
    sql_url=f'sqlite:///{path}/squad_dev.db',
    return_embedding=True
)

In [None]:
import json

# Load the SQuAD dev split for the FAISS section.
# The encoding is passed explicitly because JSON files are UTF-8, while the
# platform default used by open() can differ and fail on non-ASCII text.
# NOTE(review): this reads 'dev.json' from the working directory, whereas the
# Elasticsearch section reads 'data/squad/dev.json' — confirm the intended path.
with open('dev.json', 'r', encoding='utf-8') as f:
    squad = json.load(f)

In [None]:
from haystack.schema import Document
import re

# Create the list of contexts with every '(' and ')' character stripped.
# NOTE(review): the notebook does not state why parentheses are removed.
contexts = [re.sub(r'[()]', '', sample['context']) for sample in squad]

# Remove duplicates while keeping first-seen order. dict.fromkeys is used
# instead of set() because the iteration order of a set of strings changes
# between interpreter runs (hash randomization); a stable order keeps the
# documents written to the store in the same sequence on every run.
contexts = list(dict.fromkeys(contexts))

# Create the list of Document objects, one per unique context.
squad_docs = [Document(content=context) for context in contexts]

In [None]:
# Remove documents already in the store (no arguments are passed) before the
# new set is written in the next cell.
document_store.delete_documents()

In [None]:
# Store the Document objects; their embeddings are computed in a later cell
# through update_embeddings().
document_store.write_documents(squad_docs)


In [None]:
from haystack.nodes import DensePassageRetriever


In [None]:
# Dense Passage Retriever with separate pretrained encoders for queries and
# for passages.
# NOTE(review): embed_title=True is set, but the Documents above are created
# with content only (no title) — confirm whether this flag has any effect here.
retriever = DensePassageRetriever(
    document_store=document_store,
    query_embedding_model='facebook/dpr-question_encoder-single-nq-base',
    passage_embedding_model='facebook/dpr-ctx_encoder-single-nq-base',
    use_gpu=True,
    embed_title=True
)


In [None]:
# Have the document store compute embeddings for its documents using the
# retriever passed in.
document_store.update_embeddings(retriever=retriever)

In [None]:
# Persist the FAISS index to disk.
# NOTE(review): the later load() calls also read f'{path}/squad_dev.json',
# presumably written here alongside the index — confirm.
document_store.save(index_path=f'{path}/squad_dev.faiss')


In [None]:
# Drop the in-memory store and retriever so the following cells start from the
# files saved on disk.
del document_store, retriever


In [None]:
# Restore the document store from the saved FAISS index and its JSON config.
document_store = FAISSDocumentStore.load(index_path=f'{path}/squad_dev.faiss', config_path=f'{path}/squad_dev.json')


In [None]:
# Recreate the DPR retriever on top of the reloaded document store, with the
# same encoder models and flags as before.
retriever = DensePassageRetriever(
    document_store=document_store,
    query_embedding_model='facebook/dpr-question_encoder-single-nq-base',
    passage_embedding_model='facebook/dpr-ctx_encoder-single-nq-base',
    use_gpu=True,
    embed_title=True
)

In [None]:
# Retrieve passages for the question and show the content of the first result.
retriever.retrieve('What subject is most abstract?')[0].content



### Retriever-Reader Stack

In [None]:
# FARMReader is imported from `haystack.nodes`, the same namespace this section
# uses for DensePassageRetriever, instead of the legacy `haystack.reader.farm`
# module path used by the older Haystack release elsewhere in the notebook.
from haystack.nodes import FARMReader

# Extractive reader based on a BERT model fine-tuned on SQuAD 2.0.
reader = FARMReader(model_name_or_path='deepset/bert-base-cased-squad2', use_gpu=True)

In [None]:
from haystack.document_stores import FAISSDocumentStore
from haystack.nodes import DensePassageRetriever


In [None]:
# Directory from which the saved FAISS index and config are loaded below.
path = './models/faiss'


In [None]:
# Load the document store from the saved FAISS index and its JSON config.
document_store = FAISSDocumentStore.load(index_path=f'{path}/squad_dev.faiss', config_path=f'{path}/squad_dev.json')


In [None]:
# initialize DPR model on top of the loaded document store, using separate
# pretrained encoders for queries and for passages
retriever = DensePassageRetriever(
    document_store=document_store,
    query_embedding_model='facebook/dpr-question_encoder-single-nq-base',
    passage_embedding_model='facebook/dpr-ctx_encoder-single-nq-base',
    use_gpu=True,
    embed_title=True
)

In [None]:
from haystack.pipelines import ExtractiveQAPipeline

# Combine the retriever and the reader into one extractive QA pipeline.
pipeline = ExtractiveQAPipeline(reader=reader, retriever=retriever)

In [None]:
# Run the pipeline on a question; the "answers" entry of the result is read in
# the next cell.
extracted_answers = pipeline.run(query='What does theoretical computer science cover?')


In [None]:
# Reduce every returned answer object to a plain dict holding its text, score
# and surrounding context. This rebinds extracted_answers from the raw pipeline
# output to the simplified list, so the cell cannot be re-run without first
# re-running the pipeline cell.
extracted_answers = [
    {"content": answer.answer, "score": answer.score, "context": answer.context}
    for answer in extracted_answers["answers"]
]


In [None]:
# Print the answers as a 1-based ranked list.
for rank, answer in enumerate(extracted_answers, start=1):
    print(rank, "=>", answer["content"])

# Open-Domain Question Answering

In [None]:
#https://raw.githubusercontent.com/jamescalam/transformers/main/data/text/meditations/clean.txt

In [6]:
import requests

In [7]:
# Download the cleaned "Meditations" text.
# raise_for_status() stops the notebook on an HTTP error, so an error page is
# never split into lines and indexed as if it were the book; the timeout keeps
# the cell from hanging on a stalled connection.
data = requests.get(
    "https://raw.githubusercontent.com/jamescalam/transformers/main/data/text/meditations/clean.txt",
    timeout=30,
)
data.raise_for_status()

In [10]:
# Split the downloaded text into one string per line; each line is treated as
# one paragraph when the documents are built further down.
text = data.text.split("\n")

In [12]:
# Number of lines obtained from the split.
len(text)

507

In [13]:
# Peek at the first three entries.
text[:3]


['From my grandfather Verus I learned good morals and the government of my temper.',
 'From the reputation and remembrance of my father, modesty and a manly character.',
 'From my mother, piety and beneficence, and abstinence, not only from evil deeds, but even from evil thoughts; and further, simplicity in my way of living, far removed from the habits of the rich.']

In [16]:
# Query the health endpoint of the local Elasticsearch cluster.
requests.get('http://localhost:9200/_cluster/health').json()


{'cluster_name': 'docker-cluster',
 'status': 'yellow',
 'timed_out': False,
 'number_of_nodes': 1,
 'number_of_data_nodes': 1,
 'active_primary_shards': 2,
 'active_shards': 2,
 'relocating_shards': 0,
 'initializing_shards': 0,
 'unassigned_shards': 2,
 'delayed_unassigned_shards': 0,
 'number_of_pending_tasks': 0,
 'number_of_in_flight_fetch': 0,
 'task_max_waiting_in_queue_millis': 0,
 'active_shards_percent_as_number': 50.0}

In [17]:
# List the indices that currently exist in Elasticsearch.
print(requests.get('http://localhost:9200/_cat/indices').text)


yellow open squad_docs 3CW2Ki6NS9CGGytt2j1Hnw 1 1 1204 0 1.2mb 1.2mb
yellow open label      lbdUMht3RAufz1YZov3KVQ 1 1    0 0  208b  208b



In [18]:
from haystack.document_store.elasticsearch import ElasticsearchDocumentStore

# Connect to the local Elasticsearch node (empty credentials) using the
# 'aurelius' index; the recorded log shows a PUT request for that index.
doc_store = ElasticsearchDocumentStore(
    host='localhost',
    username='', password='',
    index='aurelius'
)

08/10/2022 10:46:33 - INFO - elasticsearch -   PUT http://localhost:9200/aurelius [status:200 request:0.390s]
08/10/2022 10:46:33 - INFO - elasticsearch -   HEAD http://localhost:9200/label [status:200 request:0.017s]


In [19]:
# List the indices again; the recorded output now includes 'aurelius'.
print(requests.get('http://localhost:9200/_cat/indices').text)


yellow open aurelius   -ZmTejLvQeiaujGW0R8HDg 1 1    0 0  208b  208b
yellow open squad_docs 3CW2Ki6NS9CGGytt2j1Hnw 1 1 1204 0 1.2mb 1.2mb
yellow open label      lbdUMht3RAufz1YZov3KVQ 1 1    0 0  208b  208b



In [20]:
# Shape every paragraph into the dict layout passed to write_documents():
# the passage under 'text' plus a 'meta' dict tagging where it came from.
source_name = 'meditations'
data_json = [
    {'text': paragraph, 'meta': {'source': source_name}}
    for paragraph in text
]

In [21]:
# Inspect the first three document dicts.
data_json[:3]


[{'text': 'From my grandfather Verus I learned good morals and the government of my temper.',
  'meta': {'source': 'meditations'}},
 {'text': 'From the reputation and remembrance of my father, modesty and a manly character.',
  'meta': {'source': 'meditations'}},
 {'text': 'From my mother, piety and beneficence, and abstinence, not only from evil deeds, but even from evil thoughts; and further, simplicity in my way of living, far removed from the habits of the rich.',
  'meta': {'source': 'meditations'}}]

In [22]:
# Write the paragraph dicts through doc_store, which is bound to the
# 'aurelius' index.
doc_store.write_documents(data_json)


08/10/2022 10:47:25 - INFO - elasticsearch -   POST http://localhost:9200/_bulk?refresh=wait_for [status:200 request:1.385s]
08/10/2022 10:47:26 - INFO - elasticsearch -   POST http://localhost:9200/_bulk?refresh=wait_for [status:200 request:1.032s]


In [23]:
# Count the documents in the 'aurelius' index.
requests.get('http://localhost:9200/aurelius/_count').json()


{'count': 507,
 '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0}}

# Retriever Reader Pipeline


In [24]:
from haystack.document_store.elasticsearch import ElasticsearchDocumentStore

# Connect again to the 'aurelius' index on the local Elasticsearch node, so
# this section can be run on its own.
doc_store = ElasticsearchDocumentStore(
    host='localhost',
    username='', password='',
    index='aurelius'
)

08/10/2022 10:48:42 - INFO - elasticsearch -   HEAD http://localhost:9200/aurelius [status:200 request:0.016s]
08/10/2022 10:48:42 - INFO - elasticsearch -   GET http://localhost:9200/aurelius [status:200 request:0.007s]
08/10/2022 10:48:42 - INFO - elasticsearch -   PUT http://localhost:9200/aurelius/_mapping [status:200 request:0.024s]
08/10/2022 10:48:42 - INFO - elasticsearch -   HEAD http://localhost:9200/label [status:200 request:0.007s]


In [25]:
from haystack.retriever.sparse import ElasticsearchRetriever
from haystack.reader.farm import FARMReader

# Sparse retriever over the 'aurelius' document store.
retriever = ElasticsearchRetriever(doc_store)  # BM25
# Extractive reader. use_gpu=True is requested, but the recorded log below
# reports "Using device: CPU", i.e. no GPU was available in that run.
# NOTE(review): context_window_size=1500 presumably sets how much surrounding
# text is returned with each answer — confirm against the Haystack docs.
reader = FARMReader(model_name_or_path='deepset/bert-base-cased-squad2',
                    context_window_size=1500,
                    use_gpu=True)

08/10/2022 10:49:09 - INFO - farm.utils -   Using device: CPU 
08/10/2022 10:49:09 - INFO - farm.utils -   Number of GPUs: 0
08/10/2022 10:49:09 - INFO - farm.utils -   Distributed Training: False
08/10/2022 10:49:09 - INFO - farm.utils -   Automatic Mixed Precision: None
08/10/2022 10:49:45 - INFO - farm.utils -   Using device: CPU 
08/10/2022 10:49:45 - INFO - farm.utils -   Number of GPUs: 0
08/10/2022 10:49:45 - INFO - farm.utils -   Distributed Training: False
08/10/2022 10:49:45 - INFO - farm.utils -   Automatic Mixed Precision: None
08/10/2022 10:49:45 - INFO - farm.infer -   Got ya 7 parallel workers to do inference ...
08/10/2022 10:49:45 - INFO - farm.infer -    0    0    0    0    0    0    0 
08/10/2022 10:49:45 - INFO - farm.infer -   /w\  /w\  /w\  /w\  /w\  /w\  /w\
08/10/2022 10:49:45 - INFO - farm.infer -   /'\  / \  /'\  /'\  / \  / \  /'\
08/10/2022 10:49:45 - INFO - farm.infer -               


In [26]:
from haystack.pipeline import ExtractiveQAPipeline

# Combine the BM25 retriever and the reader into one extractive QA pipeline.
qa = ExtractiveQAPipeline(reader=reader, retriever=retriever)

In [27]:
# Ask a question; the result is a dict with the query and a list of answers.
qa.run(query='What did your grandfather teach?')


08/10/2022 10:55:23 - INFO - elasticsearch -   POST http://localhost:9200/aurelius/_search [status:200 request:0.451s]
Inferencing Samples: 100%|█████████████████████████████████████████████████████████| 1/1 [00:01<00:00,  1.32s/ Batches]
Inferencing Samples: 100%|█████████████████████████████████████████████████████████| 1/1 [00:01<00:00,  1.10s/ Batches]
Inferencing Samples: 100%|█████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  1.12 Batches/s]
Inferencing Samples: 100%|█████████████████████████████████████████████████████████| 1/1 [00:01<00:00,  1.27s/ Batches]
Inferencing Samples: 100%|█████████████████████████████████████████████████████████| 1/1 [00:01<00:00,  1.26s/ Batches]
Inferencing Samples: 100%|█████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  1.12 Batches/s]
Inferencing Samples: 100%|█████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  1.28 Batches/s]
Inferencing Samples: 100%|███████████████

{'query': 'What did your grandfather teach?',
 'no_ans_gap': -3.121257781982422,
 'answers': [{'answer': 'good morals and the government of my temper',
   'score': 5.546539306640625,
   'probability': 0.6667044950977762,
   'context': 'From my grandfather Verus I learned good morals and the government of my temper.',
   'offset_start': 36,
   'offset_end': 79,
   'offset_start_in_doc': 36,
   'offset_end_in_doc': 79,
   'document_id': 'bdec3b60-f352-456e-93de-f269515ec3da',
   'meta': {'source': 'meditations'}},
  {'answer': 'self-government',
   'score': 1.488326072692871,
   'probability': 0.5463765048011897,
   'context': 'From Maximus I learned self-government, and not to be led aside by anything; and cheerfulness in all circumstances, as well as in illness; and a just admixture in the moral character of sweetness and dignity, and to do what was set before me without complaining. I observed that everybody believed that he thought as he spoke, and that in all that he did he never ha

In [28]:
# NOTE(review): top_k_reader=3 presumably limits the reader to three answers —
# confirm against the Haystack version in use.
qa.run(query='What is the Universe?',top_k_reader=3)


08/10/2022 10:56:26 - INFO - elasticsearch -   POST http://localhost:9200/aurelius/_search [status:200 request:0.060s]
Inferencing Samples: 100%|█████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  1.00 Batches/s]
Inferencing Samples: 100%|█████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  1.38 Batches/s]
Inferencing Samples: 100%|█████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  1.25 Batches/s]
Inferencing Samples: 100%|█████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  1.33 Batches/s]
Inferencing Samples: 100%|█████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  1.52 Batches/s]
Inferencing Samples: 100%|█████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  1.43 Batches/s]
Inferencing Samples: 100%|█████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  1.11 Batches/s]
Inferencing Samples: 100%|███████████████

{'query': 'What is the Universe?',
 'no_ans_gap': 9.206113815307617,
 'answers': [{'answer': 'a well-arranged universe',
   'score': 12.657489776611328,
   'probability': 0.8295139175329398,
   'context': 'Either it is a well-arranged universe or a chaos huddled together, but still a universe. But can a certain order subsist in thee, and disorder in the All? And this too when all things are so separated and diffused and sympathetic.',
   'offset_start': 13,
   'offset_end': 37,
   'offset_start_in_doc': 13,
   'offset_end_in_doc': 37,
   'document_id': '9c90031a-7b69-4856-8c16-0012ea941ff7',
   'meta': {'source': 'meditations'}},
  {'answer': 'the universe loves to make whatever is about to be',
   'score': 10.453649520874023,
   'probability': 0.7869614603311372,
   'context': '"The earth loves the shower"; and "the solemn aether loves": and the universe loves to make whatever is about to be. I say then to the universe, that I love as thou lovest. And is not this too said, that "this 

In [29]:
# Same call pattern with another question. In the recorded output the top
# answer comes from a passage where "art" is the archaic verb ("thou art").
qa.run(query='What is art?',
       top_k_reader=3)

08/10/2022 10:57:13 - INFO - elasticsearch -   POST http://localhost:9200/aurelius/_search [status:200 request:0.034s]
Inferencing Samples: 100%|█████████████████████████████████████████████████████████| 1/1 [00:01<00:00,  1.05s/ Batches]
Inferencing Samples: 100%|█████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  1.14 Batches/s]
Inferencing Samples: 100%|█████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  1.10 Batches/s]
Inferencing Samples: 100%|█████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  1.10 Batches/s]
Inferencing Samples: 100%|█████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  1.02 Batches/s]
Inferencing Samples: 100%|█████████████████████████████████████████████████████████| 1/1 [00:01<00:00,  1.06s/ Batches]
Inferencing Samples: 100%|█████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  1.08 Batches/s]
Inferencing Samples: 100%|███████████████

{'query': 'What is art?',
 'no_ans_gap': 4.066239833831787,
 'answers': [{'answer': 'the acts of life',
   'score': 6.920151233673096,
   'probability': 0.703708179598843,
   'context': 'Let it make no difference to thee whether thou art cold or warm, if thou art doing thy duty; and whether thou art drowsy or satisfied with sleep; and whether ill-spoken of or praised; and whether dying or doing something else. For it is one of the acts of life, this act by which we die: it is sufficient then in this act also to do well what we have in hand.',
   'offset_start': 244,
   'offset_end': 260,
   'offset_start_in_doc': 244,
   'offset_end_in_doc': 260,
   'document_id': '0873c884-c5a0-4d39-bab0-fff70a8dd6c4',
   'meta': {'source': 'meditations'}},
  {'answer': 'To be good',
   'score': 4.782202243804932,
   'probability': 0.6451471606834001,
   'context': 'What is thy art? To be good. And how is this accomplished well except by general principles, some about the nature of the universe, and o