<a href="https://colab.research.google.com/github/dszon/matching/blob/master/haystack_experiments.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Preliminaries

In [1]:
%%bash
# Show the attached GPU, then install the Python packages this notebook uses.
nvidia-smi
pip -q install --upgrade pip
# Quote the requirement so the shell can never expand "[colab]" as a glob pattern.
pip -q install "farm-haystack[colab]"
pip -q install datasets

Mon Jan 23 16:12:03 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   60C    P0    30W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces



In [2]:
import logging
# Root logger: WARNING and above, rendered as "LEVEL - logger name -  message".
logging.basicConfig(
    level=logging.WARNING,
    format="%(levelname)s - %(name)s -  %(message)s",
)
# The "haystack" logger hierarchy is lowered to INFO so its messages are shown.
logging.getLogger("haystack").setLevel(level=logging.INFO)

In [3]:
import glob
import os
import shutil
import time
import urllib.request
from pprint import pprint

from datasets import load_dataset
from haystack import Pipeline
from haystack.document_stores import ElasticsearchDocumentStore
from haystack.nodes import BM25Retriever
from haystack.nodes import FARMReader
from haystack.nodes import TextConverter, PreProcessor
from haystack.telemetry import disable_telemetry
from haystack.utils import fetch_archive_from_http
from haystack.utils import launch_es
from haystack.utils import print_answers

INFO:haystack.telemetry:Haystack sends anonymous usage data to understand the actual usage and steer dev efforts towards features that are most meaningful to users. You can opt-out at anytime by calling disable_telemetry() or by manually setting the environment variable HAYSTACK_TELEMETRY_ENABLED as described for different operating systems on the documentation page. More information at https://docs.haystack.deepset.ai/docs/telemetry


In [4]:
%%bash
# Download and unpack Elasticsearch 7.9.2, then give the unpacked tree to the
# "daemon" user, which is the account the server is started under.
set -euo pipefail  # stop at the first failing command instead of carrying on

ES_VERSION=7.9.2
ES_ARCHIVE="elasticsearch-${ES_VERSION}-linux-x86_64.tar.gz"

# -O overwrites the archive on re-runs; without it wget would save a second
# copy as "*.tar.gz.1" and tar would keep unpacking the old file.
wget -q -O "${ES_ARCHIVE}" "https://artifacts.elastic.co/downloads/elasticsearch/${ES_ARCHIVE}"
tar -xzf "${ES_ARCHIVE}"
chown -R daemon:daemon "elasticsearch-${ES_VERSION}"

In [5]:
%%bash --bg
# Start the unpacked Elasticsearch server as the "daemon" user. The --bg flag
# runs the script in the background so later cells can execute while it runs.
sudo -u daemon -- elasticsearch-7.9.2/bin/elasticsearch

In [6]:
# Wait until Elasticsearch answers on its HTTP port. Polling replaces a fixed
# sleep, which is too short on a slow machine and wasted time on a fast one.
ES_STARTUP_TIMEOUT_S = 120
es_url = f"http://{os.environ.get('ELASTICSEARCH_HOST', 'localhost')}:9200"
es_deadline = time.monotonic() + ES_STARTUP_TIMEOUT_S
while True:
  try:
    with urllib.request.urlopen(es_url, timeout=5):
      break  # a successful response means the node accepts requests
  except OSError:  # URLError and refused/reset connections are OSError subclasses
    if time.monotonic() >= es_deadline:
      raise TimeoutError(
          f"Elasticsearch did not respond at {es_url} within {ES_STARTUP_TIMEOUT_S}s"
      )
    time.sleep(1)

# Starting with Haystack

In [7]:
# Opt out of Haystack's anonymous usage reporting for this session.
disable_telemetry()
# NOTE(review): launch_es() presumably tries to start its own Elasticsearch
# instance (via Docker); a server was already started manually in an earlier
# cell, so confirm whether this call is still needed in this environment.
launch_es()

INFO:haystack.telemetry:Telemetry has been disabled.


## Getting and preprocessing documents

In [8]:
# Elasticsearch host: ELASTICSEARCH_HOST when it is set, otherwise localhost.
host = os.getenv("ELASTICSEARCH_HOST", "localhost")

# Connect with empty credentials and keep the documents in the "document" index.
document_store = ElasticsearchDocumentStore(index="document", host=host, username="", password="")

In [9]:
# Download the corpus archive into a clean directory and list the files to index.
doc_dir = "data/build_a_scalable_question_answering_system"

# Start from an empty target so the archive is fetched again on every run
# (presumably the download is skipped when the directory already has content
# - TODO confirm). rmtree also removes nested directories, which a per-file
# os.remove would raise on.
if os.path.isdir(doc_dir):
  shutil.rmtree(doc_dir)
fetch_archive_from_http(
    url="http://3.126.84.96/Archiv.zip",
    output_dir=doc_dir
)

# Keep regular files only: os.listdir also returns sub-directories the archive
# may have unpacked, and those are not documents. Sorting makes the order stable.
files_to_index = [
    path
    for path in (os.path.join(doc_dir, entry) for entry in sorted(os.listdir(doc_dir)))
    if os.path.isfile(path)
]
logging.getLogger("haystack").info(f"we have a corpus of {len(files_to_index)} documents")

INFO:haystack.utils.import_utils:Fetching from http://3.126.84.96/Archiv.zip to 'data/build_a_scalable_question_answering_system'
INFO:haystack:we have a corpus of 71566 documents


In [10]:
# First stage of the indexing pipeline: a TextConverter wired to the "File" input.
text_converter = TextConverter()
indexing_pipeline = Pipeline()
indexing_pipeline.add_node(
    name="TextConverter",
    component=text_converter,
    inputs=["File"],
)

In [11]:
# Second stage: clean the converted text and split it into overlapping passages.
preprocessor = PreProcessor(
    # Cleaning switches: whitespace, repeated headers/footers, empty lines.
    clean_whitespace=True,
    clean_header_footer=True,
    clean_empty_lines=True,
    # Splitting: units of words, length 100 with an overlap of 20 between
    # neighbouring passages, with the sentence-boundary option enabled.
    split_by="word",
    split_length=100,
    split_overlap=20,
    split_respect_sentence_boundary=True,
)
# The preprocessor consumes the output of the "TextConverter" node.
indexing_pipeline.add_node(component=preprocessor, name="PreProcessor", inputs=["TextConverter"])

In [12]:
# Final stage: the document store receives the output of the "PreProcessor" node.
indexing_pipeline.add_node(component=document_store, name="DocumentStore", inputs=["PreProcessor"])

In [None]:
# Run every path in files_to_index through the three indexing stages
# (TextConverter -> PreProcessor -> DocumentStore).
indexing_pipeline.run(file_paths=files_to_index)

Converting files:   0%|          | 0/71566 [00:00<?, ?it/s]

In [None]:
# BM25 retriever that searches the Elasticsearch document store set up above.
retriever = BM25Retriever(document_store=document_store)

In [None]:
# Reader model used for answer extraction. Alternative checkpoints that can be
# swapped in instead are kept commented out:
#reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2", use_gpu=True)
#reader = FARMReader(model_name_or_path="deepset/roberta-large-squad2", use_gpu=True)
# NOTE(review): the example query in this notebook is German, while the
# "squad2" checkpoints are presumably trained on English data - confirm the
# model matches the language of the corpus.
reader = FARMReader(model_name_or_path="deepset/deberta-v3-large-squad2", use_gpu=True)


In [None]:
# Query pipeline: the retriever takes the query, the reader takes the retriever's output.
querying_pipeline = Pipeline()
querying_pipeline.add_node(
    name="Retriever",
    component=retriever,
    inputs=["Query"],
)
querying_pipeline.add_node(
    name="Reader",
    component=reader,
    inputs=["Retriever"],
)


## Prediction

In [None]:
# Ask one question; top_k is set to 100 for the "Retriever" node and to 20
# for the "Reader" node.
prediction = querying_pipeline.run(
    query="Ist Spitzkohl lecker?",
    params=dict(Retriever=dict(top_k=100), Reader=dict(top_k=20)),
)

In [None]:
# One line per answer scoring above 0.01: the score, the answer text padded
# to 65 characters, then the context it was taken from.
for candidate in prediction['answers']:
  fields = candidate.to_dict()
  if not fields['score'] > .01:
    continue
  print(f"{fields['score']:4.3f} {fields['answer']:65s} {fields['context']}")