In [36]:
# Download and install Elasticsearch engine

!pip install --upgrade pip
!pip install git+https://github.com/deepset-ai/haystack.git#egg=farm-haystack[colab]
!wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.9.2-linux-x86_64.tar.gz -q
!tar -xzf /content/elasticsearch-7.9.2-linux-x86_64.tar.gz
!chown -R daemon:daemon elasticsearch-7.9.2

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pip
  Downloading pip-22.3.1-py3-none-any.whl (2.1 MB)
[K     |████████████████████████████████| 2.1 MB 13.5 MB/s 
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 21.1.3
    Uninstalling pip-21.1.3:
      Successfully uninstalled pip-21.1.3
Successfully installed pip-22.3.1
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting farm-haystack[colab]
  Cloning https://github.com/deepset-ai/haystack.git to /tmp/pip-install-niy3uf1c/farm-haystack_d4b4933a0c074a0f870d70637e1a0cdb
  Running command git clone --filter=blob:none --quiet https://github.com/deepset-ai/haystack.git /tmp/pip-install-niy3uf1c/farm-haystack_d4b4933a0c074a0f870d70637e1a0cdb
  Resolved https://github.com/deepset-ai/haystack.git to commit e143f7cc3617a5bb360a464c18608c99e8703583
  Installing build d

In [39]:
import os
from subprocess import Popen, PIPE, STDOUT

# Start Elasticsearch server and test the connection
es_server = Popen(args=["elasticsearch-7.9.2/bin/elasticsearch"], stdout=PIPE, stderr=STDOUT, preexec_fn=lambda:os.setuid(1))
# Wait until Elasticsearch has started
!sleep 30
!curl -X GET "localhost:9200/?pretty"

{
  "name" : "e6e4351abbdd",
  "cluster_name" : "elasticsearch",
  "cluster_uuid" : "OpVKsKkzS9G_27dVRXlKFQ",
  "version" : {
    "number" : "7.9.2",
    "build_flavor" : "default",
    "build_type" : "tar",
    "build_hash" : "d34da0ea4a966c4e49417f2da2f244e3e97b4e6e",
    "build_date" : "2020-09-23T00:45:33.626720Z",
    "build_snapshot" : false,
    "lucene_version" : "8.6.2",
    "minimum_wire_compatibility_version" : "6.8.0",
    "minimum_index_compatibility_version" : "6.0.0-beta1"
  },
  "tagline" : "You Know, for Search"
}


In [40]:
# Instantiate the document store.
# Return the document embedding for later use with dense retriever
from haystack.document_stores import ElasticsearchDocumentStore
document_store = ElasticsearchDocumentStore(return_embedding=True)

In [37]:
!pip install transformers
from transformers import AutoTokenizer, AutoModelForMaskedLM
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModelForMaskedLM.from_pretrained("bert-base-uncased")

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
[0m

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [1]:
!pip install datasets
from collections import defaultdict
from datasets import load_dataset, Dataset
from tqdm.notebook import tqdm

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
# Loading data 
data = load_dataset("json", data_files="/content/Bible.json", split="train")
data.set_format("pandas")
data["version"].value_counts()

CEI1974               72
INTERCONFESSIONALE    72
CEI2008               71
Name: version, dtype: int64

In [4]:
# Subset data to use only CEI1974 version
cei = data[:]
cei1974 = cei[cei.version =="CEI1974"]
df = Dataset.from_dict(cei1974)

Unnamed: 0,version,book,segments
0,CEI1974,1 cor,"[{'chapter': '1', 'id': '(1 cor 1 1)', 'lemma'..."
1,CEI1974,1 cr,"[{'chapter': '1', 'id': '(1 cr 1 1)', 'lemma':..."
2,CEI1974,1 gv,"[{'chapter': '1', 'id': '(1 gv 1 1)', 'lemma':..."
3,CEI1974,1 mac,"[{'chapter': '1', 'id': '(1 mac 1 1)', 'lemma'..."
4,CEI1974,1 pt,"[{'chapter': '1', 'id': '(1 pt 1 1)', 'lemma':..."
...,...,...,...
183,CEI1974,is,"[{'chapter': '1', 'id': '(is 1 1)', 'lemma': '..."
184,CEI1974,lm,"[{'chapter': '1', 'id': '(lm 1 1)', 'lemma': '..."
185,CEI1974,lv,"[{'chapter': '1', 'id': '(lv 1 1)', 'lemma': '..."
186,CEI1974,lc,"[{'chapter': '1', 'id': '(lc 1 1)', 'lemma': '..."


In [45]:
# Create documents to load in the document store. 
# Each document (a set of verses) has "content" field to store the content of the document,
# and can have additional fields specified as a dictionary inside "meta" field. 

docs = []
depth = 3 # Select the depth of the document search: max number of verses in a single document
for book in df:
  book_tuples = [(verse_dict["source"], verse_dict["id"]) for verse_dict in book["segments"]]
  for i in tqdm(range(len(book_tuples))):
    docs.append({"content":book_tuples[i][0], "meta":{"id":book_tuples[i][1]}})   
    if i < len(book_tuples)-1:  
      for step in range(1, depth):
        docs.append({"content":" ".join([tpl[0] for tpl in book_tuples[i:(i+1+step)]]), \
                     "meta":{"id":" ".join([tpl[1] for tpl in book_tuples[i:(i+1+step)]])}})

  0%|          | 0/436 [00:00<?, ?it/s]

  0%|          | 0/943 [00:00<?, ?it/s]

  0%|          | 0/105 [00:00<?, ?it/s]

  0%|          | 0/924 [00:00<?, ?it/s]

  0%|          | 0/105 [00:00<?, ?it/s]

  0%|          | 0/817 [00:00<?, ?it/s]

  0%|          | 0/810 [00:00<?, ?it/s]

  0%|          | 0/89 [00:00<?, ?it/s]

  0%|          | 0/113 [00:00<?, ?it/s]

  0%|          | 0/256 [00:00<?, ?it/s]

  0%|          | 0/822 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

  0%|          | 0/555 [00:00<?, ?it/s]

  0%|          | 0/61 [00:00<?, ?it/s]

  0%|          | 0/719 [00:00<?, ?it/s]

  0%|          | 0/693 [00:00<?, ?it/s]

  0%|          | 0/47 [00:00<?, ?it/s]

  0%|          | 0/82 [00:00<?, ?it/s]

  0%|          | 0/15 [00:00<?, ?it/s]

  0%|          | 0/56 [00:00<?, ?it/s]

  0%|          | 0/21 [00:00<?, ?it/s]

  0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/146 [00:00<?, ?it/s]

  0%|          | 0/404 [00:00<?, ?it/s]

  0%|          | 0/1007 [00:00<?, ?it/s]

  0%|          | 0/213 [00:00<?, ?it/s]

  0%|          | 0/117 [00:00<?, ?it/s]

  0%|          | 0/678 [00:00<?, ?it/s]

  0%|          | 0/1070 [00:00<?, ?it/s]

  0%|          | 0/105 [00:00<?, ?it/s]

  0%|          | 0/47 [00:00<?, ?it/s]

  0%|          | 0/406 [00:00<?, ?it/s]

  0%|          | 0/1288 [00:00<?, ?it/s]

  0%|          | 0/197 [00:00<?, ?it/s]

  0%|          | 0/914 [00:00<?, ?it/s]

  0%|          | 0/221 [00:00<?, ?it/s]

  0%|          | 0/432 [00:00<?, ?it/s]

  0%|          | 0/85 [00:00<?, ?it/s]

  0%|          | 0/2525 [00:00<?, ?it/s]

  0%|          | 0/435 [00:00<?, ?it/s]

  0%|          | 0/1379 [00:00<?, ?it/s]

  0%|          | 0/53 [00:00<?, ?it/s]

  0%|          | 0/46 [00:00<?, ?it/s]

  0%|          | 0/249 [00:00<?, ?it/s]

  0%|          | 0/211 [00:00<?, ?it/s]

  0%|          | 0/95 [00:00<?, ?it/s]

  0%|          | 0/531 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/302 [00:00<?, ?it/s]

  0%|          | 0/155 [00:00<?, ?it/s]

  0%|          | 0/280 [00:00<?, ?it/s]

  0%|          | 0/1213 [00:00<?, ?it/s]

  0%|          | 0/167 [00:00<?, ?it/s]

  0%|          | 0/1272 [00:00<?, ?it/s]

  0%|          | 0/104 [00:00<?, ?it/s]

  0%|          | 0/25 [00:00<?, ?it/s]

  0%|          | 0/1532 [00:00<?, ?it/s]

  0%|          | 0/1363 [00:00<?, ?it/s]

  0%|          | 0/726 [00:00<?, ?it/s]

  0%|          | 0/1064 [00:00<?, ?it/s]

  0%|          | 0/73 [00:00<?, ?it/s]

  0%|          | 0/48 [00:00<?, ?it/s]

  0%|          | 0/657 [00:00<?, ?it/s]

  0%|          | 0/879 [00:00<?, ?it/s]

  0%|          | 0/25 [00:00<?, ?it/s]

  0%|          | 0/340 [00:00<?, ?it/s]

  0%|          | 0/149 [00:00<?, ?it/s]

  0%|          | 0/1286 [00:00<?, ?it/s]

  0%|          | 0/154 [00:00<?, ?it/s]

  0%|          | 0/858 [00:00<?, ?it/s]

  0%|          | 0/1149 [00:00<?, ?it/s]

  0%|          | 0/55 [00:00<?, ?it/s]

In [46]:
# Writing documents to database (the operation can take several minutes)
document_store.write_documents(docs)
document_store.get_document_count()

105721

In [47]:
# Initialize the Retriever with document_store
from haystack.nodes import EmbeddingRetriever

retriever = EmbeddingRetriever(
    document_store=document_store,
    embedding_model="sentence-transformers/paraphrase-multilingual-mpnet-base-v2",
    model_format="sentence_transformers",
)
# We need to call update_embeddings() to iterate over all
# previously indexed documents and update their embedding representation.
# While this can be a time consuming operation, it only needs to be done once.
# At query time, we only need to embed the query and compare it to the existing document embeddings.
document_store.update_embeddings(retriever)

Downloading:   0%|          | 0.00/690 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/3.77k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/723 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/402 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/229 [00:00<?, ?B/s]

Updating embeddings:   0%|          | 0/105721 [00:00<?, ? Docs/s]

Batches:   0%|          | 0/313 [00:00<?, ?it/s]

Batches:   0%|          | 0/313 [00:00<?, ?it/s]

Batches:   0%|          | 0/313 [00:00<?, ?it/s]

Batches:   0%|          | 0/313 [00:00<?, ?it/s]

Batches:   0%|          | 0/313 [00:00<?, ?it/s]

Batches:   0%|          | 0/313 [00:00<?, ?it/s]

Batches:   0%|          | 0/313 [00:00<?, ?it/s]

Batches:   0%|          | 0/313 [00:00<?, ?it/s]

Batches:   0%|          | 0/313 [00:00<?, ?it/s]

Batches:   0%|          | 0/313 [00:00<?, ?it/s]

Batches:   0%|          | 0/179 [00:00<?, ?it/s]

In [48]:
def ricerca_versetto(text, top_k=3):
  """Retrieve the identifier of the verse in the document store which is closest to the input text
     and return the corresponding semantic similarity score.
  Args:
    text (str): some text to match against documents inside the document store
    top_k (int): number of documents to retrieve from the document store
  Returns:
    A dictionary with the identifier of the match biblic verse, 
    the text of the matched verse and its matching score.
  """
  retrieved_docs = retriever.retrieve(query=text, top_k=top_k)
  output = []
  for doc in retrieved_docs:
    output.append({"id_vestetto":doc.meta["id"],
                   "versetto":doc.content,
                   "semantic_similarity": round(doc.score, 5)})
  return output

In [49]:
query = "perché siete stati arricchiti di tutti i doni, compresi quello della parola e quello della conoscenza"
ricerca_versetto(query, top_k=3)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[{'id_vestetto': '(1 cor 1 5)',
  'versetto': 'perché in lui siete stati arricchiti di tutti i doni, quelli della parola e quelli della scienza.',
  'semantic_similarity': 0.5134},
 {'id_vestetto': '(2 cor 9 11)',
  'versetto': 'Così sarete ricchi per ogni generosità, la quale poi farà salire a Dio l`inno di ringraziamento per mezzo nostro.',
  'semantic_similarity': 0.5122},
 {'id_vestetto': '(2 cor 9 15)',
  'versetto': 'Grazie a Dio per questo suo ineffabile dono!',
  'semantic_similarity': 0.51177}]