In [None]:
%%bash
# Install the required modules.
# The %%bash cell magic has to be the very first line of the cell, so this description lives below it.
# The requirement is quoted so the shell never interprets the square brackets as a glob pattern.
# NOTE(review): no version is pinned here; consider pinning farm-haystack to the release this notebook was run with.

pip install --upgrade pip
pip install 'farm-haystack[colab,preprocessing,elasticsearch,inference]'



In [None]:
# The document corpus is saved in a Google Drive folder, so Drive must be mounted before the files can be read.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# The path of the folder in which our document corpus is stored (inside the Drive mounted at /content/drive)
folder_path = "/content/drive/MyDrive/NLP/UST/docs"

In [None]:
%%bash
# Download and unpack Elasticsearch, the backend of the DocumentStore.
# A DocumentStore stores the Documents that the question answering system uses to find answers to your questions.
# We can't just keep a large number of documents in a Python data structure like a dictionary or a list;
# document stores provide built-in indexing and are compatible with the other nodes of the Haystack framework.
# (The %%bash cell magic has to be the very first line of the cell, so these notes live below it.)

# Stop at the first failing command so a broken download is not hidden by the steps that follow.
set -e

wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.9.2-linux-x86_64.tar.gz -q
tar -xzf elasticsearch-7.9.2-linux-x86_64.tar.gz
# Hand the unpacked tree to the `daemon` user, which is the account the server is started under.
chown -R daemon:daemon elasticsearch-7.9.2

In [None]:
%%bash --bg
# Start the Elasticsearch server as a background process, running as the unprivileged `daemon` user.
# (The %%bash cell magic has to be the very first line of the cell, so this description lives below it.)

sudo -u daemon -- elasticsearch-7.9.2/bin/elasticsearch

In [None]:
# Wait until the Elasticsearch server answers HTTP requests.
# A fixed 30 s sleep is either too short (the next cell then fails to connect) or needlessly long,
# so poll the server instead and give up with a clear error after a bounded time.
import time
import urllib.request

ES_HEALTH_URL = "http://localhost:9200/_cluster/health"   # default HTTP port of a locally started server
ES_STARTUP_TIMEOUT_S = 120                                 # upper bound on the total wait
ES_POLL_INTERVAL_S = 2                                     # pause between two attempts

startup_deadline = time.monotonic() + ES_STARTUP_TIMEOUT_S
while True:
    try:
        # Any successful HTTP response means the server is accepting requests.
        with urllib.request.urlopen(ES_HEALTH_URL, timeout=5):
            break
    except OSError as exc:
        # URLError, HTTPError and connection-refused/reset errors are all OSError subclasses.
        if time.monotonic() >= startup_deadline:
            raise TimeoutError(
                f"Elasticsearch did not respond at {ES_HEALTH_URL} within {ES_STARTUP_TIMEOUT_S}s"
            ) from exc
        time.sleep(ES_POLL_INTERVAL_S)

In [None]:
# Call Haystack's launch_es() helper.
# NOTE(review): an Elasticsearch server was already started from the downloaded tarball in the cells above;
# launch_es() presumably tries to start Elasticsearch through Docker, which may be unavailable on Colab, so this
# call looks redundant — confirm before removing. The document store itself is initialized in the next cell.
from haystack.utils import launch_es
launch_es()



In [None]:
import os
from haystack.document_stores import ElasticsearchDocumentStore

# Host on which Elasticsearch is running: taken from ELASTICSEARCH_HOST, falling back to localhost when unset.
host = os.getenv("ELASTICSEARCH_HOST", "localhost")

# Connect with empty credentials and use the "document" index.
document_store = ElasticsearchDocumentStore(host=host, username="", password="", index="document")

In [None]:
# ElasticsearchDocumentStore is up and running and ready to store the Documents.
# The next step is adding the files to the DocumentStore.
# The INDEXING PIPELINE turns your files into Document objects and writes them to the DocumentStore.
# Our indexing pipeline will have two nodes: TextConverter, which turns .txt files into Haystack Document objects, and
# PreProcessor, which cleans and splits the text within a Document.

###################################################### INDEXING PIPELINE ######################################################
from haystack import Pipeline
from haystack.nodes import TextConverter, PreProcessor

indexing_pipeline = Pipeline()               ### Indexing Pipeline
text_converter = TextConverter()             ### Node 1 : Text Converter
preprocessor = PreProcessor(                 ### Node 2 : Preprocessor
    clean_whitespace=True,
    clean_header_footer=True,
    clean_empty_lines=True,
    split_by="word",                         # We can also split by sentences, passages
    split_length=200,                        # Maximum number of words per output document
    split_overlap=15,                        # Amount of overlap between 2 adjacent documents after a split
    split_respect_sentence_boundary=True,    # Ensures that doc boundaries do not fall in the middle of sentences
)
# To learn more about the parameters of the PreProcessor, see https://docs.haystack.deepset.ai/docs/preprocessor#usage

# Adding nodes: File -> TextConverter -> PreProcessor -> DocumentStore
indexing_pipeline.add_node(component=text_converter, name="TextConverter", inputs=["File"])
indexing_pipeline.add_node(component=preprocessor, name="PreProcessor", inputs=["TextConverter"])
indexing_pipeline.add_node(component=document_store, name="DocumentStore", inputs=["PreProcessor"])

# Collect the corpus files to index.
# os.listdir() returns every entry of the folder, sub-directories included, in arbitrary order, so keep only
# regular files and sort them to make the indexing order reproducible from run to run.
corpus_files = sorted(
    os.path.join(folder_path, entry)
    for entry in os.listdir(folder_path)
    if os.path.isfile(os.path.join(folder_path, entry))
)
if not corpus_files:
    # Fail early with a clear message instead of silently building an empty index.
    raise FileNotFoundError(f"No files to index were found in {folder_path!r}")

indexing_pipeline.run_batch(file_paths=corpus_files)


# Now that the preprocessed Documents are in the DocumentStore, let's initialize the nodes we want to use in our query pipeline.
###################################################### QUERY PIPELINE ######################################################
from haystack.nodes import BM25Retriever, FARMReader

# A Retriever sifts through all the Documents and returns only those that are relevant to the question.
# BM25Retriever scores Documents with BM25, a sparse keyword-matching method (not a dense vector-similarity search),
# and is the recommended Retriever to start with for a question answering system.
# For more Retriever options, see https://docs.haystack.deepset.ai/docs/retriever
retriever = BM25Retriever(document_store=document_store)

# A Reader scans the texts it received from the Retriever and extracts the top answer candidates.
# Readers are based on powerful deep learning models but are much slower than Retrievers at processing the same amount of text.
# Here we use FARMReader with a base-sized RoBERTa question answering model: https://huggingface.co/deepset/roberta-base-squad2
# It's a good all-round model to start with. To find a model that's best for your use case, see https://docs.haystack.deepset.ai/docs/reader#models
reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2", use_gpu=True)

# Query pipeline: Query -> Retriever -> Reader.
# `Pipeline` is already imported at the top of this cell for the indexing pipeline, so it is not imported again.
querying_pipeline = Pipeline()
querying_pipeline.add_node(component=retriever, name="Retriever", inputs=["Query"])
querying_pipeline.add_node(component=reader, name="Reader", inputs=["Retriever"])

Converting files:   0%|          | 0/20 [00:00<?, ?it/s]

Preprocessing:   0%|          | 0/20 [00:00<?, ?docs/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/496M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/79.0 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

In [None]:
from haystack.utils import print_answers

# Ask a question with the pipeline's run() method; the `query` argument holds the question text.
# The top_k parameters set how many results each node returns.
# To learn more about setting arguments, see https://docs.haystack.deepset.ai/docs/pipelines#arguments
# To understand the importance of the top-k parameter, see https://docs.haystack.deepset.ai/docs/optimization#choosing-the-right-top-k-values
node_params = {
    "Retriever": {"top_k": 20},
    "Reader": {"top_k": 3},
}
prediction = querying_pipeline.run(
    query="Which firm is one of the biggest investors in Google",
    params=node_params,
)

# Print the returned answers; `details` can be `minimum`, `medium` or `all`.
print_answers(prediction, details="minimum")

# To learn how to improve the performance of the Reader, see https://haystack.deepset.ai/tutorials/02_finetune_a_model_on_your_data

Inferencing Samples:   0%|          | 0/1 [00:00<?, ? Batches/s]

'Query: Which firm is one of the biggest investors in Google'
'Answers:'
[   {   'answer': 'TimeWarner',
        'context': 'd sales boost Time Warner profit\n'
                   '\n'
                   'Quarterly profits at US media giant TimeWarner jumped 76% '
                   'to $1.13bn (£600m) for the three months to December, '
                   'from '},
    {   'answer': 'Time Warner',
        'context': 'ns which offset a profit dip at Warner Bros, and less '
                   'users for AOL.\n'
                   '\n'
                   'Time Warner said on Friday that it now owns 8% of '
                   'search-engine Google. But its '},
    {   'answer': 'Dimension Data',
        'context': 'dea of an \'always available\' society," says Cara Diemont '
                   'of IT firm Dimension Data, which commissioned the survey. '
                   'However, call centres also saw a sh'}]
