In [1]:
%%bash

# Upgrade pip first, then install Haystack with the Colab and inference extras.
# The requirement is quoted so the shell cannot interpret "[...]" as a glob pattern,
# and pinned to the release this notebook was last executed with for reproducible runs.
pip install --upgrade pip
pip install "farm-haystack[colab,inference]==1.18.1"

Collecting farm-haystack[colab,inference]
  Downloading farm_haystack-1.18.1-py3-none-any.whl (737 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 737.2/737.2 kB 46.2 MB/s eta 0:00:00
Collecting boilerpy3 (from farm-haystack[colab,inference])
  Downloading boilerpy3-1.0.6-py3-none-any.whl (22 kB)
Collecting canals==0.2.2 (from farm-haystack[colab,inference])
  Downloading canals-0.2.2-py3-none-any.whl (31 kB)
Collecting events (from farm-haystack[colab,inference])
  Downloading Events-0.4.tar.gz (5.6 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting lazy-imports==0.3.1 (from farm-haystack[colab,inference])
  Downloading lazy_imports-0.3.1-py3-none-any.whl (12 kB)
Collecting posthog (from farm-haystack[colab,inference])
  Downloading posthog-3.0.1-py2.py3-none-any.whl (37 kB)
Collecting prompthub-py==4.0.0 (from farm-haystack[colab,inference])
  Downloading prompthub_py-4.0.0-py3-none-any.whl (6.9 kB)
Collecting quantulu

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
google-colab 1.0.0 requires requests==2.27.1, but you have requests 2.31.0 which is incompatible.


In [3]:
# Initializing the Document Store

from haystack.document_stores import InMemoryDocumentStore

# In-memory document store created with BM25 enabled (use_bm25=True).
document_store = InMemoryDocumentStore(use_bm25=True)

In [4]:
# Downloading the document corpus

from haystack.utils import fetch_archive_from_http, convert_files_to_docs, clean_wiki_text

# Download and prepare data - 517 Wikipedia articles for Game of Thrones
doc_dir = "data/tutorial11"
s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt11.zip"
fetch_archive_from_http(url=s3_url, output_dir=doc_dir)

# Convert the downloaded files into documents that can be indexed in our document store.
# clean_wiki_text is used as the cleaning function and paragraph splitting is requested.
got_docs = convert_files_to_docs(dir_path=doc_dir, clean_func=clean_wiki_text, split_paragraphs=True)
# Empty the store before writing, presumably so that re-running this cell does not
# index the same corpus twice - TODO confirm.
document_store.delete_documents()
document_store.write_documents(got_docs)

Updating BM25 representation...:   0%|          | 0/2357 [00:00<?, ? docs/s]

In [5]:
# Here we initialize the core components that we will be gluing together using the Pipeline class:
# a BM25Retriever, an EmbeddingRetriever, and a FARMReader.
# You can combine these components to create a classic Retriever-Reader pipeline that is designed
# to perform Open Domain Question Answering.

from haystack.nodes import BM25Retriever, EmbeddingRetriever, FARMReader

# Initialize the sparse Retriever (BM25) on top of the shared document store
bm25_retriever = BM25Retriever(document_store=document_store)

# Initialize the embedding Retriever on the same document store
embedding_retriever = EmbeddingRetriever(
    document_store=document_store, embedding_model="sentence-transformers/multi-qa-mpnet-base-dot-v1"
)
# Compute embeddings for the stored documents with the embedding retriever;
# update_existing_embeddings=False is passed, presumably to skip documents that already have one - TODO confirm.
document_store.update_embeddings(embedding_retriever, update_existing_embeddings=False)

# Initialize the Reader
reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2")


Downloading (…)ce_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading (…)16ebc/.gitattributes:   0%|          | 0.00/737 [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)b6b5d16ebc/README.md:   0%|          | 0.00/8.65k [00:00<?, ?B/s]

Downloading (…)b5d16ebc/config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading (…)ebc/data_config.json:   0%|          | 0.00/25.5k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading (…)16ebc/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

Downloading (…)6ebc/train_script.py:   0%|          | 0.00/13.9k [00:00<?, ?B/s]

Downloading (…)b6b5d16ebc/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)5d16ebc/modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()


Updating Embedding:   0%|          | 0/2357 [00:00<?, ? docs/s]

Batches:   0%|          | 0/74 [00:00<?, ?it/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/496M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/79.0 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

In [8]:
# To draw the pipelines
!apt install libgraphviz-dev
!pip install pygraphviz

Reading package lists... Done
Building dependency tree       
Reading state information... Done
The following additional packages will be installed:
  libgail-common libgail18 libgtk2.0-0 libgtk2.0-bin libgtk2.0-common
  libgvc6-plugins-gtk libxdot4
Suggested packages:
  gvfs
The following NEW packages will be installed:
  libgail-common libgail18 libgraphviz-dev libgtk2.0-0 libgtk2.0-bin
  libgtk2.0-common libgvc6-plugins-gtk libxdot4
0 upgraded, 8 newly installed, 0 to remove and 15 not upgraded.
Need to get 2,148 kB of archives.
After this operation, 7,427 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu focal/main amd64 libgtk2.0-common all 2.24.32-4ubuntu4 [126 kB]
Get:2 http://archive.ubuntu.com/ubuntu focal/main amd64 libgtk2.0-0 amd64 2.24.32-4ubuntu4 [1,791 kB]
Get:3 http://archive.ubuntu.com/ubuntu focal/main amd64 libgail18 amd64 2.24.32-4ubuntu4 [14.7 kB]
Get:4 http://archive.ubuntu.com/ubuntu focal/main amd64 libgail-common amd64 2.24.32-4ub

In [9]:
############################################### INBUILT EXTRACTIVE QA PIPELINE ##########################################

from haystack.pipelines import ExtractiveQAPipeline
from haystack.utils import print_answers

# Ready-made extractive QA pipeline built from the BM25 retriever and the reader.
p_extractive_premade = ExtractiveQAPipeline(
    reader=reader,
    retriever=bm25_retriever,
)

# Per-node parameters are keyed by node name: top_k=10 for the Retriever, top_k=5 for the Reader.
ans = p_extractive_premade.run(
    query="Who is the father of Arya Stark?",
    params={
        "Retriever": {"top_k": 10},
        "Reader": {"top_k": 5},
    },
)

# Write a diagram of the pipeline to a PNG file, then print the answers with minimal detail.
p_extractive_premade.draw("pipeline_extractive_premade.png")
print_answers(ans, details="minimum")

Inferencing Samples:   0%|          | 0/1 [00:00<?, ? Batches/s]

'Query: Who is the father of Arya Stark?'
'Answers:'
[   {   'answer': 'Eddard',
        'context': 's Nymeria after a legendary warrior queen. She travels '
                   "with her father, Eddard, to King's Landing when he is made "
                   'Hand of the King. Before she leaves,'},
    {   'answer': 'Ned',
        'context': '\n'
                   '====Season 1====\n'
                   'Arya accompanies her father Ned and her sister Sansa to '
                   "King's Landing. Before their departure, Arya's "
                   'half-brother Jon Snow gifts A'},
    {   'answer': 'Lord Eddard Stark',
        'context': 'ark daughters.\n'
                   'During the Tourney of the Hand to honour her father Lord '
                   'Eddard Stark, Sansa Stark is enchanted by the knights '
                   'performing in the event.'},
    {   'answer': 'Eddard Stark',
        'context': "arrested by Catelyn Stark. He then races to King's Landing "
                 

In [11]:
# If you want to just do the retrieval step, you can use a DocumentSearchPipeline
############################################### INBUILT DocumentSearchPipeline ##########################################

from haystack.pipelines import DocumentSearchPipeline
from haystack.utils import print_documents

# Retrieval-only pipeline driven by the embedding retriever (no reader involved).
p_retrieval = DocumentSearchPipeline(embedding_retriever)

# Run the query with top_k=5 for the Retriever node.
res = p_retrieval.run(
    query="Who is the father of Arya Stark?",
    params={
        "Retriever": {"top_k": 5},
    },
)

# Write a diagram of the pipeline to a PNG file, then print the documents with max_text_len=200.
p_retrieval.draw("pipeline_retrieval.png")
print_documents(res, max_text_len=200)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]


Query: Who is the father of Arya Stark?

{   'content': '\n'
               '=== Background ===\n'
               'Arya is the third child and younger daughter of Eddard and '
               'Catelyn Stark and is nine years old at the beginning of the '
               'book series.  She has five siblings: an older brother Robb, '
               'a...',
    'name': '43_Arya_Stark.txt'}

{   'content': '\n'
               '===Arya Stark===\n'
               "'''Arya Stark''' portrayed by Maisie Williams. Arya Stark of "
               'House Stark is the younger daughter and third child of Lord '
               'Eddard and Catelyn Stark of Winterfell. Ever the tomboy, '
               'Arya...',
    'name': '349_List_of_Game_of_Thrones_characters.txt'}

{   'content': "'''Arya Stark''' is a fictional character in American author "
               "George R. R. Martin's ''A Song of Ice and Fire'' epic fantasy "
               'novel series.  She is a prominent point of view character in '

In [15]:
############################################### CUSTOM EXTRACTIVE QA PIPELINE ##########################################

from haystack.pipelines import Pipeline

# Hand-assembled extractive QA pipeline: Query -> Retriever (BM25) -> Reader
p_extractive = Pipeline()
p_extractive.add_node(
    component=bm25_retriever,
    name="Retriever",
    inputs=["Query"],
)
p_extractive.add_node(
    component=reader,
    name="Reader",
    inputs=["Retriever"],
)

# Run the query; parameters are addressed to the nodes by the names given above.
res = p_extractive.run(
    query="Who is the father of Arya Stark?",
    params={
        "Retriever": {"top_k": 5},
        "Reader": {"top_k": 3},
    },
)
print_answers(res, details="minimum")

# Write a diagram of the pipeline to a PNG file
p_extractive.draw("pipeline_extractive.png")

Inferencing Samples:   0%|          | 0/1 [00:00<?, ? Batches/s]

'Query: Who is the father of Arya Stark?'
'Answers:'
[   {   'answer': 'Eddard',
        'context': 's Nymeria after a legendary warrior queen. She travels '
                   "with her father, Eddard, to King's Landing when he is made "
                   'Hand of the King. Before she leaves,'},
    {   'answer': 'Lord Eddard Stark',
        'context': 'ark daughters.\n'
                   'During the Tourney of the Hand to honour her father Lord '
                   'Eddard Stark, Sansa Stark is enchanted by the knights '
                   'performing in the event.'},
    {   'answer': 'Eddard Stark',
        'context': "arrested by Catelyn Stark. He then races to King's Landing "
                   "to inform Eddard Stark. During Lord Eddard's execution, he "
                   'finds Arya Stark and shields her'}]


In [16]:
############################################### CUSTOM EXTRACTIVE QA PIPELINE USING 2 RETRIEVERS ##########################################

from haystack.nodes import JoinDocuments

# Ensembled pipeline: both retrievers receive the query, their results are joined
# by a JoinDocuments node in "concatenate" mode, and the joined documents feed the reader.
p_ensemble = Pipeline()
p_ensemble.add_node(
    component=bm25_retriever,
    name="BM25Retriever",
    inputs=["Query"],
)
p_ensemble.add_node(
    component=embedding_retriever,
    name="EmbeddingRetriever",
    inputs=["Query"],
)
p_ensemble.add_node(
    component=JoinDocuments(join_mode="concatenate"),
    name="JoinResults",
    inputs=["BM25Retriever", "EmbeddingRetriever"],
)
p_ensemble.add_node(
    component=reader,
    name="Reader",
    inputs=["JoinResults"],
)

# Uncomment the following to generate the pipeline image
# p_ensemble.draw("pipeline_ensemble.png")

# Run the pipeline with top_k=5 for each of the two retrievers
res = p_ensemble.run(
    query="Who is the father of Arya Stark?",
    params={
        "EmbeddingRetriever": {"top_k": 5},
        "BM25Retriever": {"top_k": 5},
    },
)
print_answers(res, details="minimum")

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Inferencing Samples:   0%|          | 0/1 [00:00<?, ? Batches/s]

'Query: Who is the father of Arya Stark?'
'Answers:'
[   {   'answer': 'Eddard',
        'context': 's Nymeria after a legendary warrior queen. She travels '
                   "with her father, Eddard, to King's Landing when he is made "
                   'Hand of the King. Before she leaves,'},
    {   'answer': 'Eddard and Catelyn Stark',
        'context': 'tark ===\n'
                   'Arya Stark is the third child and younger daughter of '
                   'Eddard and Catelyn Stark. She serves as a POV character '
                   "for 33 chapters throughout ''A "},
    {   'answer': 'Lord Eddard Stark',
        'context': 'ark daughters.\n'
                   'During the Tourney of the Hand to honour her father Lord '
                   'Eddard Stark, Sansa Stark is enchanted by the knights '
                   'performing in the event.'},
    {   'answer': 'Lord Eddard Stark',
        'context': "Game of Thrones'', Arya is the third child and younger "
                  

In [17]:
############################################### CUSTOM NODES ##########################################

from haystack import BaseComponent
from typing import Optional, List


class CustomNode(BaseComponent):
    """Skeleton of a custom pipeline node, meant to be copied and filled in.

    Both methods return a placeholder output dictionary (the value of
    "my_output" is a literal Ellipsis) together with the name of the outgoing
    edge the result is sent to.
    """

    # This node has a single outgoing edge, referred to as "output_1" below.
    outgoing_edges = 1

    def run(self, query: str, my_optional_param: Optional[int] = None):
        """Process a single query.

        :param query: The query string to process.
        :param my_optional_param: Example of an optional parameter. It defaults
            to None so that callers can really leave it out.
        :return: Tuple of (output dictionary, outgoing edge name).
        """
        # process the inputs
        output = {"my_output": ...}
        return output, "output_1"

    def run_batch(self, queries: List[str], my_optional_param: Optional[int] = None):
        """Process a list of queries.

        :param queries: The query strings to process.
        :param my_optional_param: Example of an optional parameter. It defaults
            to None so that callers can really leave it out.
        :return: Tuple of (output dictionary, outgoing edge name).
        """
        # process the inputs
        output = {"my_output": ...}
        return output, "output_1"


In [18]:
class CustomQueryClassifier(BaseComponent):
    """Route queries by whether they contain a question mark.

    Queries containing "?" are sent to "output_2"; all others to "output_1".
    """

    # Two outgoing edges: "output_1" and "output_2".
    outgoing_edges = 2

    def run(self, query: str):
        """Return an empty output dict and the edge chosen for ``query``."""
        edge = "output_2" if "?" in query else "output_1"
        return {}, edge

    def run_batch(self, queries: List[str]):
        """Distribute ``queries`` over both edges and return the mapping with the "split" edge name."""
        routed = {"output_1": {"queries": []}, "output_2": {"queries": []}}
        for item in queries:
            edge = "output_2" if "?" in item else "output_1"
            routed[edge]["queries"].append(item)

        return routed, "split"


# Here we build the pipeline: the classifier receives the query and forwards it on one of
# its two outputs. output_1 feeds the BM25 retriever, output_2 feeds the embedding retriever,
# and the reader takes its input from both retrievers.
p_classifier = Pipeline()
p_classifier.add_node(component=CustomQueryClassifier(), name="QueryClassifier", inputs=["Query"])
p_classifier.add_node(component=bm25_retriever, name="BM25Retriever", inputs=["QueryClassifier.output_1"])
p_classifier.add_node(component=embedding_retriever, name="EmbeddingRetriever", inputs=["QueryClassifier.output_2"])
p_classifier.add_node(component=reader, name="QAReader", inputs=["BM25Retriever", "EmbeddingRetriever"])

# Generate the pipeline image
p_classifier.draw("pipeline_classifier.png")

# Run only the dense retriever on the full sentence query
# (the query contains "?", so the classifier sends it to output_2)
res_1 = p_classifier.run(query="Who is the father of Arya Stark?")
print("Embedding Retriever Results" + "\n" + "=" * 15)
print_answers(res_1)

# Run only the sparse retriever on a keyword based query
# (no "?" in the query, so the classifier sends it to output_1)
res_2 = p_classifier.run(query="Arya Stark father")
print("BM25Retriever Results" + "\n" + "=" * 15)
print_answers(res_2)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Inferencing Samples:   0%|          | 0/1 [00:00<?, ? Batches/s]

Embedding Retriever Results
'Query: Who is the father of Arya Stark?'
'Answers:'
[   <Answer {'answer': 'Eddard and Catelyn Stark', 'type': 'extractive', 'score': 0.9058835506439209, 'context': "tark ===\nArya Stark is the third child and younger daughter of Eddard and Catelyn Stark. She serves as a POV character for 33 chapters throughout ''A ", 'offsets_in_document': [{'start': 74, 'end': 98}], 'offsets_in_context': [{'start': 63, 'end': 87}], 'document_ids': ['965789c741c68963042e85b6e7b89757'], 'meta': {'name': '30_List_of_A_Song_of_Ice_and_Fire_characters.txt'}}>,
    <Answer {'answer': 'Lord Eddard Stark', 'type': 'extractive', 'score': 0.8793494701385498, 'context': "Game of Thrones'', Arya is the third child and younger daughter of Lord Eddard Stark and his wife Lady Catelyn Stark.  She is tomboyish, headstrong, f", 'offsets_in_document': [{'start': 419, 'end': 436}], 'offsets_in_context': [{'start': 67, 'end': 84}], 'document_ids': ['2ee56bdd46dfd30b23f91bcc046456a4'], 'meta':

Inferencing Samples:   0%|          | 0/1 [00:00<?, ? Batches/s]

BM25Retriever Results
'Query: Arya Stark father'
'Answers:'
[   <Answer {'answer': 'Eddard', 'type': 'extractive', 'score': 0.9085890054702759, 'context': "s Nymeria after a legendary warrior queen. She travels with her father, Eddard, to King's Landing when he is made Hand of the King. Before she leaves,", 'offsets_in_document': [{'start': 147, 'end': 153}], 'offsets_in_context': [{'start': 72, 'end': 78}], 'document_ids': ['ba2a8e87ddd95e380bec55983ee7d55f'], 'meta': {'name': '43_Arya_Stark.txt'}}>,
    <Answer {'answer': 'Ned', 'type': 'extractive', 'score': 0.7877868413925171, 'context': "\n====Season 1====\nArya accompanies her father Ned and her sister Sansa to King's Landing. Before their departure, Arya's half-brother Jon Snow gifts A", 'offsets_in_document': [{'start': 46, 'end': 49}], 'offsets_in_context': [{'start': 46, 'end': 49}], 'document_ids': ['180c2a6b36369712b361a80842e79356'], 'meta': {'name': '43_Arya_Stark.txt'}}>,
    <Answer {'answer': 'Lord Eddard Stark', 'type