In [None]:
%%bash
# Install the libraries this notebook needs. Restart the runtime after installation.
pip install --upgrade pip
# The requirement is quoted because an unquoted [...] is a shell glob pattern and
# could be expanded against file names in the working directory.
pip install 'farm-haystack[colab,elasticsearch,inference]'
pip install datasets
pip install apache-beam

In [None]:
# In Colab / No Docker environments: Start Elasticsearch from source
! wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.9.2-linux-x86_64.tar.gz -q
! tar -xzf elasticsearch-7.9.2-linux-x86_64.tar.gz
! chown -R daemon:daemon elasticsearch-7.9.2

import os
from subprocess import Popen, PIPE, STDOUT

es_server = Popen(
    ["elasticsearch-7.9.2/bin/elasticsearch"], stdout=PIPE, stderr=STDOUT, preexec_fn=lambda: os.setuid(1)  # as daemon
)
# wait until ES has started
! sleep 30


In [None]:
# Initialize the document store that later cells write documents to and search.
# No arguments are passed, so the store is created with the library's default settings
# (presumably pointing at the local Elasticsearch started earlier - confirm if the setup differs).
from haystack.document_stores import ElasticsearchDocumentStore
document_store = ElasticsearchDocumentStore()

In [None]:
# Fetch the preprocessed Simple English Wikipedia dump ("20220301.simple" config)
# through the Hugging Face datasets library
from datasets import load_dataset

simple_ds = load_dataset(path="wikipedia", name="20220301.simple")


In [None]:
# Convert every row of the dataset's train split into a Haystack Document
from tqdm import tqdm
from haystack.schema import Document


def _article_to_document(article):
    """Build a text Document from one dataset row, keeping id, url and title as metadata."""
    return Document.from_dict(
        {
            "content_type": "text",
            "content": article["text"],
            "meta": {"id": article["id"], "url": article["url"], "title": article["title"]},
            "id_hash_keys": ["content", "meta"],
        }
    )


raw_document_list = [_article_to_document(article) for article in tqdm(simple_ds["train"])]

print(f"Number of articles present in the wikipedia simple english dataset are {len(raw_document_list)}")

In [None]:
# For the demo, only the first 10k articles of the dataset are used
from haystack.nodes import PreProcessor

# Split by word into chunks of length 100 while respecting sentence boundaries
preprocessor = PreProcessor(
    split_by="word",
    split_length=100,
    split_respect_sentence_boundary=True,
)
demo_articles = raw_document_list[:10000]
processed_document_list = preprocessor.process(documents=demo_articles)

# Store the processed documents in the document store
document_store.write_documents(documents=processed_document_list)


In [None]:
# Set up the dense retriever backed by a sentence-transformers model
from haystack.nodes import EmbeddingRetriever

embedding_retriever = EmbeddingRetriever(
    document_store=document_store,
    embedding_model="sentence-transformers/multi-qa-mpnet-base-dot-v1",
    model_format="sentence_transformers",
    top_k=100,
)

# Compute embeddings for the stored documents with that retriever
document_store.update_embeddings(embedding_retriever)

In [None]:
# Sanity check: report how many documents and how many embeddings the store holds
document_count = document_store.get_document_count()
print(f"Document count after writing to index are {document_count}")
embedding_count = document_store.get_embedding_count()
print(f"Embedding count present in current index are {embedding_count}")

In [None]:
# Initialize the keyword (BM25) retriever and the node that joins both result lists
from haystack.nodes import BM25Retriever, JoinDocuments

keyword_retriever = BM25Retriever(document_store=document_store, top_k=100)

# `weights` is deliberately not passed: JoinDocuments rejects weights combined with
# join_mode="concatenate" ("Weights are not compatible with 'concatenate' join_mode.").
# Weighted score fusion would require join_mode="merge" instead.
join_docs = JoinDocuments(join_mode="concatenate", top_k_join=75)

In [None]:
# Glue the components together with a pipeline
from haystack.pipelines import Pipeline

hybrid_search_pipeline = Pipeline()

# Semantic (embedding) retrieval branch, fed directly by the query
hybrid_search_pipeline.add_node(component=embedding_retriever, name="semantic-search", inputs=["Query"])

# Keyword (BM25) retrieval branch, also fed directly by the query
hybrid_search_pipeline.add_node(component=keyword_retriever, name="keyword-search", inputs=["Query"])

# Join node: every upstream node must be its own entry in `inputs`.
# A single comma-separated string would be looked up as one (non-existent) node name.
hybrid_search_pipeline.add_node(
    component=join_docs,
    name="join-documents",
    inputs=["semantic-search", "keyword-search"],
)

# Run one example query through both branches and the join
results = hybrid_search_pipeline.run(query="What is the Islamic Republic Day?", params={"top_k": 100})

In [None]:
# Print the documents returned by the pipeline run using Haystack's helper
from haystack.utils import print_documents

print_documents(results)