In [None]:
# Make sure to do this before you run the code
# Go to Runtime->Change runtime type->Hard accelerator & change it to GPU

In [None]:
############################################### INSTALLING THE REQUIRED MODULES TO BUILD THE PIPELINE ##########################################

%%bash

pip install --upgrade pip
pip install farm-haystack[colab,preprocessing,elasticsearch,inference]





In [18]:
############################################### DOWNLOADING THE DATA SET ##########################################

### You have to use appropriate method to download the dataset, the below url's might not have

'''
##### STANDARD DATA SET ######

from haystack.utils import fetch_archive_from_http

fetch_archive_from_http()

fetch_archive_from_http(
    url = "https://raw.githubusercontent.com/dshreddy/pipeline/main/documents/standard_data_set.zip",
    output_dir = "data",
)

dir = "/content/data/standard_data_set"
'''

##### CUSTOM DATA SET ######
from haystack.utils import fetch_archive_from_http

fetch_archive_from_http(
    url = "https://raw.githubusercontent.com/dshreddy/pipeline/main/documents/custom_data_set.zip",
    output_dir = "data",
)

dir = "/content/data/custom_data_set"

False

In [None]:
############################################### INITIALISING THE ELASTIC SEARCH DOCUMENT STORE ##########################################

# A DocumentStore stores the Documents that the question answering system uses to find answers to your questions.

%%bash

# Download Elasticsearch
wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.9.2-linux-x86_64.tar.gz -q
tar -xzf elasticsearch-7.9.2-linux-x86_64.tar.gz
chown -R daemon:daemon elasticsearch-7.9.2

In [None]:
%%bash --bg

# Start Elasticsearch in the background
sudo -u daemon -- elasticsearch-7.9.2/bin/elasticsearch

In [None]:
# wait 30s for the server to fully start up
import time
time.sleep(30)

In [None]:
# Initialize the ElasticsearchDocumentStore
from haystack.utils import launch_es
launch_es()



In [None]:
import os
from haystack.document_stores import ElasticsearchDocumentStore

# Get the host where Elasticsearch is running, default to localhost
host = os.environ.get("ELASTICSEARCH_HOST", "localhost")

document_store = ElasticsearchDocumentStore(
    host=host,
    username="",
    password="",
    index="document"
)

In [None]:
###################################################### INDEXING PIPELINE ######################################################
from haystack import Pipeline
from haystack.nodes import TextConverter, PreProcessor

indexing_pipeline = Pipeline()               ### Indexing Pipeline
text_converter = TextConverter()             ### Node 1 : Text Converter
preprocessor = PreProcessor(                 ### Node 2 : Preprocessor
    clean_whitespace=True,
    clean_header_footer=True,
    clean_empty_lines=True,
    split_by="word",                         # We can also split by sentences, passages
    split_length=200,                        # Maximum number of words per output document
    split_overlap=15,                        # Amount of overlap between 2 adjacent documents after a split
    split_respect_sentence_boundary=True,    # Ensures that doc boundaries do not fall in the middle of sentences
)
# To learn more about the parameters of the PreProcessor, see https://docs.haystack.deepset.ai/docs/preprocessor#usage



# Adding nodes
indexing_pipeline.add_node(component=text_converter, name="TextConverter", inputs=["File"])
indexing_pipeline.add_node(component=preprocessor, name="PreProcessor", inputs=["TextConverter"])
indexing_pipeline.add_node(component=document_store, name="DocumentStore", inputs=["PreProcessor"])

document_store.delete_documents()
indexing_pipeline.run_batch(file_paths=[dir + "/" + f for f in os.listdir(dir)])

# Now the preprocessed Documents are in the DocumentStore

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Converting files:   0%|          | 0/16 [00:00<?, ?it/s]

Preprocessing:   0%|          | 0/16 [00:00<?, ?docs/s]

{'documents': [<Document: {'content': 'UST, formerly known as UST GLOBAL, is a provider of digital technology and transformation, information technology and services, headquartered in Aliso Viejo, California, United States. Stephen Ross founded UST in 1998 in Laguna Hills. The company has offices in the Americas, EMEA, APAC, and India. In June 2018, Temasek, Singapore’s sovereign wealth fund, invested US$250 million in UST, giving UST a US$1 billion-plus valuation.', 'content_type': 'text', 'score': None, 'meta': {'_split_id': 0, '_split_overlap': []}, 'id_hash_keys': ['content'], 'embedding': None, 'id': '4d6a82c6aa44636e1a704e0bee37b3bc'}>,
  <Document: {'content': 'UST offers services in areas like digital transformation, cybersecurity, data analytics, data engineering, technology and digital consulting, supply chain management, cloud infrastructure, developer productivity, quality engineering, IT talent sourcing, innovation as a service, legacy modernization, human-centered design,

In [None]:
############################################### COMPONENTS NEEDED FOR PIPELINES ##########################################
from haystack.nodes import BM25Retriever, EmbeddingRetriever, FARMReader,JoinDocuments, PromptNode, PromptTemplate, AnswerParser
from haystack.pipelines import Pipeline
from haystack.utils import print_answers

# Initialize Sparse Retriever
bm25_retriever = BM25Retriever(document_store=document_store)

# Initialize embedding Retriever
embedding_retriever = EmbeddingRetriever(document_store=document_store, embedding_model="sentence-transformers/multi-qa-mpnet-base-dot-v1")
document_store.update_embeddings(embedding_retriever, update_existing_embeddings=False)

# Initialize Reader
reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2")

# Initialize the joiner
joiner = JoinDocuments(join_mode="concatenate")

Downloading (…)ce_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading (…)16ebc/.gitattributes:   0%|          | 0.00/737 [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)b6b5d16ebc/README.md:   0%|          | 0.00/8.65k [00:00<?, ?B/s]

Downloading (…)b5d16ebc/config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading (…)ebc/data_config.json:   0%|          | 0.00/25.5k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading (…)16ebc/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

Downloading (…)6ebc/train_script.py:   0%|          | 0.00/13.9k [00:00<?, ?B/s]

Downloading (…)b6b5d16ebc/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)5d16ebc/modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()


Updating embeddings:   0%|          | 0/27 [00:00<?, ? Docs/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/496M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/79.0 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

In [None]:
############################################### SEARCH PIPELINE 1 ##########################################
# design of the pipeline
# Query -> BM25 retriever -> reader -> answer

pipe1 = Pipeline()
pipe1.add_node(component=bm25_retriever, name="Retriever", inputs=["Query"])
pipe1.add_node(component=reader, name="Reader", inputs=["Retriever"])

res = pipe1.run(
    query="who is the founder of UST ?",
    params={"Retriever": {"top_k": 10},
            "Reader":{"top_k":1}
            }
    )
print_answers(res, details="minimum")

Inferencing Samples:   0%|          | 0/1 [00:00<?, ? Batches/s]

'Query: who is the founder of UST ?'
'Answers:'
[   {   'answer': 'Stephen Ross',
        'context': 'd services, headquartered in Aliso Viejo, California, '
                   'United States. Stephen Ross founded UST in 1998 in Laguna '
                   'Hills. The company has offices in the '}]


In [None]:
############################################### SEARCH PIPELINE 2 ##########################################
# design of the pipeline

'''
                        Query
                        /    \
                       /      \
            BM25 retriever   Embedding retriever
                      \       /
                       \     /
                        \   /
                        Reader
                          |
                          ⬇
                         Answer
'''

pipe2 = Pipeline()
pipe2.add_node(component=bm25_retriever, name="BM25Retriever", inputs=["Query"])
pipe2.add_node(component=embedding_retriever, name="EmbeddingRetriever", inputs=["Query"])
pipe2.add_node(component=joiner, name="JoinResults", inputs=["BM25Retriever", "EmbeddingRetriever"])
pipe2.add_node(component=reader, name="Reader", inputs=["JoinResults"])

res = pipe2.run(
    query="Who is the founder of UST ?",
    params={"EmbeddingRetriever": {"top_k": 10},
            "BM25Retriever": {"top_k": 10},
            "Reader":{"top_k":1}
            }
)
print_answers(res, details="minimum")

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Inferencing Samples:   0%|          | 0/2 [00:00<?, ? Batches/s]

'Query: Who is the founder of UST ?'
'Answers:'
[   {   'answer': 'Stephen Ross',
        'context': 'd services, headquartered in Aliso Viejo, California, '
                   'United States. Stephen Ross founded UST in 1998 in Laguna '
                   'Hills. The company has offices in the '}]


In [None]:
############################################### TRAINING OUR MODEL ON OUR OWN DATA SET ##########################################

In [None]:
'''
##### TRAIN FILE FOR STANDARD DATA SET #######
# Downloading the reader train file from git hub
!wget -nc https://raw.githubusercontent.com/dshreddy/pipeline/main/standard_train_file.json
'''

##### TRAIN FILE FOR CUSTOM DATA SET #######
# Downloading the reader train file from git hub
!wget -nc https://raw.githubusercontent.com/dshreddy/pipeline/main/custom_train_file.json

--2023-07-26 10:56:15--  https://raw.githubusercontent.com/dshreddy/pipeline/main/custom_train_file.json
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 80466 (79K) [text/plain]
Saving to: ‘custom_train_file.json’


2023-07-26 10:56:15 (5.93 MB/s) - ‘custom_train_file.json’ saved [80466/80466]



In [None]:
'''
# If Dataset is standard dataset run this & comment the below part
reader_eval_results = reader.eval_on_file(data_dir ="/content/",test_filename="standard_train_file.json", device="cuda")
reader_eval_results
'''

reader_eval_results = reader.eval_on_file(data_dir ="/content/",test_filename="custom_train_file.json", device="cuda")
reader_eval_results

- instead of giving you full control over which labels to use, this method always returns three types of metrics: combined (no suffix), text_answer ('_text_answer' suffix) and no_answer ('_no_answer' suffix) metrics.
- instead of comparing predictions with labels on a string level, this method compares them on a token-ID level. This makes it unable to do any string normalization (e.g. normalize whitespaces) beforehand.
Hence, results might slightly differ from those of `Pipeline.eval()`
.If you are just about starting to evaluate your model consider using `Pipeline.eval()` instead.


Preprocessing dataset:   0%|          | 0/1 [00:00<?, ? Dicts/s]

Evaluating:   0%|          | 0/6 [00:00<?, ?it/s]

RuntimeError: ignored

In [None]:
'''
# If Dataset is standard dataset run this & comment the below part
reader.train(data_dir ="/content/",train_filename="standard_train_file.json", use_gpu=True, n_epochs=4, save_dir="my_model")
'''
reader.train(data_dir ="/content/", train_filename="custom_train_file.json",use_gpu=True, n_epochs=100, save_dir="my_model")

In [None]:
# Saving the model happens automatically at the end of training into the `save_dir` you specified
# However, you could also save a reader manually again via:
reader.save(directory="my_model")

In [None]:
# If you want to load it at a later point, just do:
new_reader = FARMReader(model_name_or_path="my_model")

In [None]:
'''
# If Dataset is standard dataset run this & comment the below part
reader_eval_results = reader.eval_on_file(data_dir ="/content/",test_filename="standard_train_file.json", device="cuda")
reader_eval_results
'''

reader_eval_results = reader.eval_on_file(data_dir ="/content/",test_filename="custom_train_file.json", device="cuda")
reader_eval_results

In [None]:
############################################### USING THE TRAINED MODEL IN OUR PIPELINE ##########################################

In [None]:
# Query -> BM25 retriever ->  {**trained reader**} -> answer

pipe = Pipeline()
pipe.add_node(component=bm25_retriever, name="Retriever", inputs=["Query"])
pipe.add_node(component=new_reader, name="Reader", inputs=["Retriever"])

'''
res = pipe.run(
    query="How many atoms combine to form dioxygen?",
    params={"Retriever": {"top_k": 10},
            "Reader":{"top_k":1}
            }
    )
print_answers(res, details="minimum")
'''

queries = [
    "who is the founder of UST ?",
    "what is UST Gloabal's new name?",
    "what is the valuation of UST?",

    "what is a data structure ?",
    "what are the examples of a linear data structure?",
    "what is a dynamic data structure?",
    "what is an array",
    "what is a graph",
    "what is a trie?",
    ]

for q in queries :
  res = pipe.run(
      query=q,
      params={"Retriever": {"top_k": 10},
              "Reader":{"top_k":1}
              }
      )
  print_answers(res, details="minimum")

In [None]:
'''
                        Query
                        /    \
                       /      \
            BM25 retriever   Embedding retriever
                      \       /
                       \     /
                        \   /
                   **trained reader**
                          |
                          ⬇
                         Answer
'''

pipe = Pipeline()
pipe.add_node(component=bm25_retriever, name="BM25Retriever", inputs=["Query"])
pipe.add_node(component=embedding_retriever, name="EmbeddingRetriever", inputs=["Query"])
pipe.add_node(component=joiner, name="JoinResults", inputs=["BM25Retriever", "EmbeddingRetriever"])
pipe.add_node(component=new_reader, name="Reader", inputs=["JoinResults"])

'''
res = pipe.run(
    query="How many atoms combine to form dioxygen?",
    params={"EmbeddingRetriever": {"top_k": 10},
            "BM25Retriever": {"top_k": 10},
            "Reader":{"top_k":1}
            }
)
print_answers(res, details="minimum")
'''

queries = [
    "who is the founder of UST ?",
    "what is UST Gloabal's new name?",
    "what is the valuation of UST?",

    "what is a data structure ?",
    "what are the examples of a linear data structure?",
    "what is a dynamic data structure?",
    "what is an array",
    "what is a graph",
    "what is a trie?",
    ]

for q in queries :
  res = pipe.run(
      query=q,
      params={"EmbeddingRetriever": {"top_k": 10},
              "BM25Retriever": {"top_k": 10},
              "Reader":{"top_k":1}
            }
      )