In [1]:
%%bash
# Install required libraries. You need to restart runtime after installation.
# Stop at the first failing command so a broken install is not hidden by a
# later command that happens to succeed.
set -e
pip install --upgrade pip
# The requirement is quoted because unquoted square brackets are glob
# characters for the shell. The recorded run of this cell resolved
# farm-haystack to version 1.18.1.
pip install 'farm-haystack[colab,elasticsearch,inference]'
pip install datasets
pip install apache-beam

Collecting pip
  Downloading pip-23.2.1-py3-none-any.whl (2.1 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 2.1/2.1 MB 13.6 MB/s eta 0:00:00
Installing collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 23.1.2
    Uninstalling pip-23.1.2:
      Successfully uninstalled pip-23.1.2
Successfully installed pip-23.2.1
Collecting farm-haystack[colab,elasticsearch,inference]
  Obtaining dependency information for farm-haystack[colab,elasticsearch,inference] from https://files.pythonhosted.org/packages/3f/ce/5cf16922f2190e6b5455dccaefefe64d228b9c15d35e0decc5fb6362fdf8/farm_haystack-1.18.1-py3-none-any.whl.metadata
  Downloading farm_haystack-1.18.1-py3-none-any.whl.metadata (25 kB)
Collecting boilerpy3 (from farm-haystack[colab,elasticsearch,inference])
  Downloading boilerpy3-1.0.6-py3-none-any.whl (22 kB)
Collecting canals==0.2.2 (from farm-haystack[colab,elasticsearch,inference])
  Obtaining dependency information for canals==0.2.2 from https://file

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
google-colab 1.0.0 requires requests==2.27.1, but you have requests 2.31.0 which is incompatible.
ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
multiprocess 0.70.15 requires dill>=0.3.7, but you have dill 0.3.1.1 which is incompatible.


In [1]:
# In Colab / No Docker environments: Start Elasticsearch from source
! wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.9.2-linux-x86_64.tar.gz -q
! tar -xzf elasticsearch-7.9.2-linux-x86_64.tar.gz
! chown -R daemon:daemon elasticsearch-7.9.2

import os
from subprocess import Popen, PIPE, STDOUT

es_server = Popen(
    ["elasticsearch-7.9.2/bin/elasticsearch"], stdout=PIPE, stderr=STDOUT, preexec_fn=lambda: os.setuid(1)  # as daemon
)
# wait until ES has started
! sleep 30


In [2]:
# Initialize the document store.
# No arguments are passed, so the store is configured entirely by the library's
# defaults (presumably pointing at the local Elasticsearch node started in the
# previous cell — TODO confirm the default host/port).

from haystack.document_stores import ElasticsearchDocumentStore
document_store = ElasticsearchDocumentStore()

In [3]:
# Download the preprocessed Simple English Wikipedia dump (the "20220301.simple"
# configuration of the "wikipedia" dataset) with the Hugging Face datasets library.
# The returned object is indexed by split name further down (simple_ds['train']).
from datasets import load_dataset
simple_ds = load_dataset("wikipedia", "20220301.simple")


Downloading builder script:   0%|          | 0.00/35.9k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/30.4k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/16.3k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.66k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/235M [00:00<?, ?B/s]

In [4]:
# Turn every article of the train split into a haystack Document.
from tqdm import tqdm
from haystack.schema import Document

# Each Document carries the article text as content and keeps id / url / title
# as metadata; both content and meta take part in the document id hash.
raw_document_list = [
    Document.from_dict(
        {
            "content_type": "text",
            "content": article["text"],
            "meta": {
                "id": article["id"],
                "url": article["url"],
                "title": article["title"],
            },
            "id_hash_keys": ["content", "meta"],
        }
    )
    for article in tqdm(simple_ds['train'])
]

print(f"Number of articles present in the wikipedia simple english dataset are {len(raw_document_list)}")

100%|██████████| 205328/205328 [01:45<00:00, 1945.75it/s]

Number of articles present in the wikipedia simple english dataset are 205328





In [5]:
# For the demo, only the first 10k articles of the dataset are processed

from haystack.nodes import PreProcessor

# Split each article into passages of 100 words, counted by word and kept on
# sentence boundaries.
preprocessor = PreProcessor(
    split_by="word",
    split_length=100,
    split_respect_sentence_boundary=True,
)
processed_document_list = preprocessor.process(
    documents=raw_document_list[:10000],
)

# Persist the resulting passages in the document store
document_store.write_documents(
    documents=processed_document_list,
)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Preprocessing:   0%|          | 0/10000 [00:00<?, ?docs/s]



In [6]:
# Build a retriever backed by a sentence-transformers model and use it to
# embed everything currently held in the document store

from haystack.nodes import EmbeddingRetriever

embedding_retriever = EmbeddingRetriever(
    document_store=document_store,
    embedding_model="sentence-transformers/multi-qa-mpnet-base-dot-v1",
    model_format="sentence_transformers",
    top_k=100,
)

# Compute the embeddings and store them alongside the documents
document_store.update_embeddings(
    retriever=embedding_retriever,
)

Downloading (…)16ebc/.gitattributes:   0%|          | 0.00/737 [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)b6b5d16ebc/README.md:   0%|          | 0.00/8.65k [00:00<?, ?B/s]

Downloading (…)b5d16ebc/config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading (…)ebc/data_config.json:   0%|          | 0.00/25.5k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading (…)16ebc/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

Downloading (…)6ebc/train_script.py:   0%|          | 0.00/13.9k [00:00<?, ?B/s]

Downloading (…)b6b5d16ebc/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)5d16ebc/modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()


Updating embeddings:   0%|          | 0/53076 [00:00<?, ? Docs/s]

Batches:   0%|          | 0/313 [00:00<?, ?it/s]

Batches:   0%|          | 0/313 [00:00<?, ?it/s]

Batches:   0%|          | 0/313 [00:00<?, ?it/s]

Batches:   0%|          | 0/313 [00:00<?, ?it/s]

Batches:   0%|          | 0/313 [00:00<?, ?it/s]

Batches:   0%|          | 0/97 [00:00<?, ?it/s]

In [7]:
# Verify the document count and embeddings count
document_count = document_store.get_document_count()
embedding_count = document_store.get_embedding_count()
print(f"Document count after writing to index are {document_count}")
print(f"Embedding count present in current index are {embedding_count}")

# Actually check the invariant instead of only printing it: after the
# embeddings update, every indexed document is expected to carry an embedding.
assert document_count == embedding_count, (
    f"{document_count - embedding_count} document(s) in the index have no embedding"
)

Document count after writing to index are 53076
Embedding count present in current index are 53076


In [12]:
# Let's glue the components together using a pipeline

from haystack.pipelines import Pipeline

semantic_search_pipeline = Pipeline()

# The embedding retriever is the only node and is fed directly by the query
semantic_search_pipeline.add_node(
    component=embedding_retriever,
    name="semantic-search",
    inputs=["Query"],
)

# Run a first query through the pipeline
results = semantic_search_pipeline.run(
    query="What is the Islamic Republic Day?",
    params={"top_k": 10},
)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [13]:
# Helper function to print the results.
# It is given the `results` object returned by the most recent pipeline run.
from haystack.utils import print_documents
print_documents(results=results)


Query: What is the Islamic Republic Day?

{   'content': '1935)\n'
               '\n'
               'Observances\n'
               'Bastille Day (France)\n'
               'Republic Day (Iraq)\n'
               '\n'
               'References\n'
               '\n'
               'Days of the year',
    'name': None}

{   'content': '1934)\n'
               '\n'
               'Holidays and observances\n'
               'Kiribati – Independence Day, 2nd day (not a holiday)\n'
               'Mongolia – Naadam Holiday, 3rd day\n'
               'Montenegro – Statehood Day\n'
               "Bahá'í Faith – Feast of Kálimát (Words) – First day of the "
               "seventh month of the Bahá'í Calendar\n"
               'Bon Festival – Buddhist festival to honor the dead (East '
               'Japan)\n'
               '\n'
               'Other websites\n'
               'BBC: On This Day\n'
               '\n'
               'Days of the year',
    'name': None}

{   'content': 'Ob

In [14]:
# Send a second query through the same pipeline and print what it retrieved
results = semantic_search_pipeline.run(
    query="What year was Apple Inc. founded?",
    params={"top_k": 10},
)
print_documents(results=results)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]


Query: What year was Apple Inc. founded?

{   'content': 'Apple Inc. is a public company and trades on the NASDAQ under '
               'the stock ticker AAPL. On March 19, 2015, it became one of the '
               '30 components of the Dow Jones Industrial Average.\n'
               '\n'
               'General history\n'
               'Apple was started in 1976 by Steve Jobs and Steve Wozniak. '
               'Before they made the company, they sold "blue boxes", which '
               'had telephone buttons on them. People could use them to make '
               'telephone calls from payphones without paying any money. It '
               'did this by pretending to be a telephone operator. The '
               "company's first product is now called the Apple I computer. ",
    'name': None}

{   'content': 'They were almost ready to sell it, but a problem happened. '
               'Steve Wozniak was working for the computer company '
               'Hewlett-Packard, and the p