In [None]:
# Download and install Elasticsearch engine

!pip install --upgrade pip
!pip install git+https://github.com/deepset-ai/haystack.git#egg=farm-haystack[colab]
!wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.9.2-linux-x86_64.tar.gz -q
!tar -xzf /content/elasticsearch-7.9.2-linux-x86_64.tar.gz
!chown -R daemon:daemon elasticsearch-7.9.2

In [None]:
import os
from subprocess import Popen, PIPE, STDOUT

# Start Elasticsearch server and test the connection
es_server = Popen(args=["elasticsearch-7.9.2/bin/elasticsearch"], stdout=PIPE, stderr=STDOUT, preexec_fn=lambda:os.setuid(1))
# Wait until Elasticsearch has started
!sleep 30
!curl -X GET "localhost:9200/?pretty"

In [None]:
# Create the Elasticsearch-backed document store that holds the verse documents.
# No connection arguments are passed, so the store's defaults apply
# (presumably the local server started earlier in the notebook — confirm).
from haystack.document_stores import ElasticsearchDocumentStore

document_store = ElasticsearchDocumentStore(
    # Return each document's embedding too, for later use with the dense retriever
    return_embedding=True,
)

In [None]:
# Data loading and processing: install the `datasets` package and import the
# helpers used by the following cells.

# NOTE(review): this install is not version-pinned.
!pip install datasets
from collections import defaultdict  # NOTE(review): not used in any cell visible here
from datasets import load_dataset, Dataset
from tqdm.notebook import tqdm  # progress bar used while building the documents

In [None]:
# Read the Bible corpus from a local JSON file as the "train" split
data = load_dataset(
    "json",
    data_files="/content/Bible.json",
    split="train",
)
# Switch the dataset's output format to pandas (done in place on `data`)
data.set_format("pandas")
# Cell output: number of rows per Bible version
data["version"].value_counts()

In [None]:
# Restrict the corpus to the CEI1974 version
cei = data[:]  # materialise the whole dataset (pandas format was set above)
cei1974 = cei.loc[cei["version"] == "CEI1974"]
# Convert the filtered rows back into a Dataset for iteration.
# NOTE: despite its name, `df` is a `datasets.Dataset`, not a DataFrame.
df = Dataset.from_dict(cei1974)

In [None]:
# Create documents to load in the document store.
# Each document (a run of consecutive verses from one book) has a "content" field with
# the text, and a "meta" dictionary holding the verse identifier(s) under "id".
#
# For every verse we emit the verse on its own plus every window of 2..depth consecutive
# verses starting at it. Windows are capped at the end of the book: without that cap the
# slice near the last verses is silently truncated and the same document is appended
# more than once.

docs = []
depth = 3 # Select the depth of the document search: max number of verses in a single document
for book in df:
  book_tuples = [(verse_dict["source"], verse_dict["id"]) for verse_dict in book["segments"]]
  n_verses = len(book_tuples)
  for i in tqdm(range(n_verses)):
    # Single-verse document: text and id are stored as-is
    verse_text, verse_id = book_tuples[i]
    docs.append({"content": verse_text, "meta": {"id": verse_id}})
    # Multi-verse documents: never ask for more verses than remain in this book
    largest_window = min(depth, n_verses - i)
    for size in range(2, largest_window + 1):
      window = book_tuples[i:i + size]
      docs.append({"content": " ".join(text for text, _ in window),
                   "meta": {"id": " ".join(vid for _, vid in window)}})

In [None]:
# Write the generated documents to the document store (the operation can take several minutes)
document_store.write_documents(docs)
# Cell output: the number of documents the store reports after the write
document_store.get_document_count()

In [None]:
# Build the dense retriever on top of the document store
from haystack.nodes import EmbeddingRetriever

retriever = EmbeddingRetriever(
    embedding_model="sentence-transformers/paraphrase-multilingual-mpnet-base-v2",
    model_format="sentence_transformers",
    document_store=document_store,
)
# Compute embeddings for every document already indexed in the store.
# This pass can be slow, but it is only needed once: at query time only the
# query itself is embedded and compared against the stored document embeddings.
document_store.update_embeddings(retriever)

In [None]:
# Define a function to search and retrieve verses from the Bible
# having high semantic similarity with some input text.

def ricerca_versetto(text, top_k=3):
  """
  Query the retriever for the documents most similar to a text.

  The lookup is delegated to the module-level `retriever`; its results are
  returned in the order the retriever produced them.

  Args:
    text (str): the text to match against the documents in the document store
    top_k (int): the number of documents to ask the retriever for
  Returns:
    A list with one dictionary per retrieved document, with the keys:
      "id_vestetto": the "id" entry of the document's metadata
                     (the key's spelling is kept unchanged for existing callers),
      "versetto": the document's content,
      "semantic_similarity": the document's score rounded to 5 decimal places.
  """
  hits = retriever.retrieve(query=text, top_k=top_k)
  return [
      {
          "id_vestetto": hit.meta["id"],
          "versetto": hit.content,
          "semantic_similarity": round(hit.score, 5),
      }
      for hit in hits
  ]

In [None]:
# Example search: show the three best matches for an Italian sentence
query = "perché siete stati arricchiti di tutti i doni, compresi quello della parola e quello della conoscenza"
ricerca_versetto(text=query, top_k=3)