In [1]:
from haystack.document_stores.elasticsearch import ElasticsearchDocumentStore
from haystack.document_stores import FAISSDocumentStore
from haystack.nodes import TextConverter
from haystack.nodes import BM25Retriever
from haystack.nodes import EmbeddingRetriever
from haystack.nodes import FARMReader
from haystack.nodes import PreProcessor
from haystack.pipelines import ExtractiveQAPipeline
from haystack import Document
from haystack.utils import convert_files_to_docs
from pprint import  pprint
import logging


In [2]:

logging.basicConfig(format="%(levelname)s - %(name)s -  %(message)s", level=logging.WARNING)
logging.getLogger("haystack").setLevel(logging.INFO)


In [3]:
base_dir = '/home/eliran/Projects/wikipedia/data/'
file_name = 'en.wikipedia.org.wiki.U2.txt'
#with open(base_dir + file_name, 'r') as f:
 #   text = f.read()

#paragraphs = text.split('\n')

In [4]:
converter = TextConverter(remove_numeric_tables=True, valid_languages=["en"])
doc_txt = converter.convert(file_path=base_dir + file_name, meta=None)[0]

In [5]:
preprocessor = PreProcessor(
    clean_empty_lines=True,
    clean_whitespace=True, 
    clean_header_footer=False,
    split_by='word',
    split_length=100,
    split_respect_sentence_boundary=True
)

In [6]:
clean_docs = preprocessor.process([doc_txt])
#clean_docs

Preprocessing:   0%|          | 0/1 [00:00<?, ?docs/s]

In [7]:
host = os.environ.get("ELASTICSEARCH_HOST", "localhost")
host

'localhost'

In [9]:
document_store = ElasticsearchDocumentStore(
    host=host, 
    username="elastic", 
    password="giUTMUpLwLV8RrGp0r-o", 
    index="u2wikipedia", 
    scheme='https',
    ca_certs='/usr/local/share/ca-certificates/http_ca.crt')

  indices = self.client.indices.get(index_name, headers=headers)


In [10]:
document_store.write_documents(clean_docs)

In [11]:
retriever = BM25Retriever(document_store=document_store)

In [12]:
reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2", use_gpu=True)
# deepset/bert-large-uncased-whole-word-masking-squad2

INFO:haystack.modeling.utils:Using devices: CUDA:0 - Number of GPUs: 1
INFO:haystack.modeling.utils:Using devices: CUDA:0 - Number of GPUs: 1
INFO:haystack.modeling.model.language_model: * LOADING MODEL: 'deepset/roberta-base-squad2' (Roberta)
INFO:haystack.modeling.model.language_model:Auto-detected model language: english
INFO:haystack.modeling.model.language_model:Loaded 'deepset/roberta-base-squad2' (Roberta model) from model hub.
INFO:haystack.modeling.utils:Using devices: CUDA:0 - Number of GPUs: 1


In [23]:
pipe = ExtractiveQAPipeline(reader, retriever)

In [24]:
# You can configure how many candidates the Reader and Retriever shall return
# The higher top_k_retriever, the better (but also the slower) your answers.
prediction = pipe.run(
    query="Who play guitar in U2?", params={"Retriever": {"top_k": 10}, "Reader": {"top_k": 5}}
)
pprint(prediction)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00, 10.63 Batches/s]

{'answers': [<Answer {'answer': 'David Evans', 'type': 'extractive', 'score': 0.7207580804824829, 'context': ' played drums and was joined by: Paul Hewson ("Bono") on lead vocals; David Evans ("the Edge") and his older brother Dik Evans on guitar; Adam Clayton', 'offsets_in_document': [{'start': 76, 'end': 87}], 'offsets_in_context': [{'start': 70, 'end': 81}], 'document_id': 'f126a7c52812a13ffb2a32f5fddeb0cc', 'meta': {'_split_id': 7, 'vector_id': '161'}}>,
             <Answer {'answer': 'The Edge', 'type': 'extractive', 'score': 0.6129111051559448, 'context': 'alls the "Fat Lady" voice on the tracks "Lemon" and "Numb".[369][370]\n\nThe Edge\'s style of playing guitar is distinguished by his chiming timbres,[371', 'offsets_in_document': [{'start': 289, 'end': 297}], 'offsets_in_context': [{'start': 71, 'end': 79}], 'document_id': '889f3a84e9a6d0392f8f2565380bc566', 'meta': {'_split_id': 122, 'vector_id': '96'}}>,
             <Answer {'answer': 'Mullen and Clayton', 'type': 'extracti




In [16]:
faiss_document_store = FAISSDocumentStore(faiss_index_factory_str="Flat")

In [17]:
faiss_document_store.write_documents(clean_docs)

Writing Documents:   0%|          | 0/165 [00:00<?, ?it/s]

In [18]:

retriever = EmbeddingRetriever(
    document_store=faiss_document_store,
    embedding_model="sentence-transformers/multi-qa-mpnet-base-dot-v1",
    model_format="sentence_transformers",
)
# Important:
# Now that we initialized the Retriever, we need to call update_embeddings() to iterate over all
# previously indexed documents and update their embedding representation.
# While this can be a time consuming operation (depending on the corpus size), it only needs to be done once.
# At query time, we only need to embed the query and compare it to the existing document embeddings, which is very fast.
faiss_document_store.update_embeddings(retriever)

INFO:haystack.modeling.utils:Using devices: CUDA:0 - Number of GPUs: 1
INFO:haystack.nodes.retriever.dense:Init retriever using embeddings of model sentence-transformers/multi-qa-mpnet-base-dot-v1
INFO:haystack.document_stores.faiss:Updating embeddings for 165 docs...


Updating Embedding:   0%|          | 0/165 [00:00<?, ? docs/s]

Batches:   0%|          | 0/6 [00:00<?, ?it/s]

In [19]:
eader = FARMReader(model_name_or_path="deepset/roberta-base-squad2", use_gpu=True)

INFO:haystack.modeling.utils:Using devices: CUDA:0 - Number of GPUs: 1
INFO:haystack.modeling.utils:Using devices: CUDA:0 - Number of GPUs: 1
INFO:haystack.modeling.model.language_model: * LOADING MODEL: 'deepset/roberta-base-squad2' (Roberta)
INFO:haystack.modeling.model.language_model:Auto-detected model language: english
INFO:haystack.modeling.model.language_model:Loaded 'deepset/roberta-base-squad2' (Roberta model) from model hub.
INFO:haystack.modeling.utils:Using devices: CUDA:0 - Number of GPUs: 1


In [21]:
faiss_pipe = ExtractiveQAPipeline(reader, retriever)

In [26]:
prediction = faiss_pipe.run(
    query="Who write lyrics in U2?", params={"Retriever": {"top_k": 10}, "Reader": {"top_k": 5}}
)
pprint(prediction)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00, 10.66 Batches/s]


{'answers': [<Answer {'answer': 'Bono', 'type': 'extractive', 'score': 0.6403651237487793, 'context': 're an Irish rock band from Dublin, formed in 1976. The group consists of Bono (lead vocals and rhythm guitar), the Edge (lead guitar, keyboards, and b', 'offsets_in_document': [{'start': 77, 'end': 81}], 'offsets_in_context': [{'start': 73, 'end': 77}], 'document_id': '17288ef8fe25adcf3e51bec10c1ae068', 'meta': {'_split_id': 0, 'vector_id': '3'}}>,
             <Answer {'answer': 'Bono', 'type': 'extractive', 'score': 0.497267484664917, 'context': 'ense of exhilaration" that resulted from the Edge\'s "radiant chords" and Bono\'s "ardent vocals".[328] However, according to Bob Stanley, "U2 rejected ', 'offsets_in_document': [{'start': 238, 'end': 242}], 'offsets_in_context': [{'start': 73, 'end': 77}], 'document_id': '456a2b57e2b439b89210d6061804e57f', 'meta': {'_split_id': 110, 'vector_id': '44'}}>,
             <Answer {'answer': 'William S. Burroughs', 'type': 'extractive', 'score':

In [43]:
def pipelines_runner(query):
    faiss_pred = faiss_pipe.run(
        query=query, params={"Retriever": {"top_k": 10}, "Reader": {"top_k": 5}}
    )
    es_pred = faiss_pipe.run(
        query=query, params={"Retriever": {"top_k": 10}, "Reader": {"top_k": 5}}
    )
    for i in range(0,5):
        print(f'FAISS {faiss_pred["answers"][i].answer} Score {faiss_pred["answers"][i].score}')
        print(f'ES {es_pred["answers"][i].answer} Score {faiss_pred["answers"][i].score}')



In [44]:
pipelines_runner("Who write songs for U2?")

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00, 10.70 Batches/s]


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00, 11.05 Batches/s]

FAISS Paul Epworth, Ryan Tedder, Declan Gaffney, and Flood Score 0.6937980651855469
ES Paul Epworth, Ryan Tedder, Declan Gaffney, and Flood Score 0.6937980651855469
FAISS Bono (lead vocals and rhythm guitar), the Edge (lead guitar, keyboards, and backing vocals), Adam Clayton (bass guitar), and Larry Mullen Jr. Score 0.5482701659202576
ES Bono (lead vocals and rhythm guitar), the Edge (lead guitar, keyboards, and backing vocals), Adam Clayton (bass guitar), and Larry Mullen Jr. Score 0.5482701659202576
FAISS William S. Burroughs Score 0.5052986145019531
ES William S. Burroughs Score 0.5052986145019531
FAISS Bob Dylan, Van Morrison, and Keith Richards Score 0.43949365615844727
ES Bob Dylan, Van Morrison, and Keith Richards Score 0.43949365615844727
FAISS Daniel Lanois and Brian Eno Score 0.4150450527667999
ES Daniel Lanois and Brian Eno Score 0.4150450527667999





In [29]:
 for i in range(2,5):
    print(i)

2
3
4
