In [1]:
from haystack.utils import  convert_files_to_dicts, print_answers
from haystack.nodes import FARMReader
from haystack.document_stores import FAISSDocumentStore
from haystack.nodes import DensePassageRetriever
from haystack.pipelines import ExtractiveQAPipeline
from pymongo import MongoClient
from haystack.schema import Document




In [2]:

# Connect to the MongoDB server on localhost:27017 and select the database
# and collection that the documents are read from.
db_client = MongoClient("localhost", 27017)
database = db_client.get_database("Website_Chatbot")
collection = database.get_collection("MitsSpider")


In [9]:
def split_paragraph(text):
    r"""Split ``text`` into its non-blank paragraphs.

    Paragraphs are separated by a blank line (``"\n\n"``). Chunks that are
    empty or contain only whitespace are dropped; the remaining chunks are
    returned unmodified (not stripped) and in their original order.

    Args:
        text: The raw text to split. ``None`` or an empty string yields ``[]``.

    Returns:
        list[str]: The non-blank paragraphs of ``text``.
    """
    if not text:
        return []
    return [para for para in text.split("\n\n") if para.strip()]

In [11]:
# Build one haystack Document per non-blank paragraph of every MongoDB record.
docs = []
for document in collection.find({}):
    mongo_id = str(document["_id"])
    # Paragraphs are separated by a blank line; whitespace-only chunks are skipped.
    paragraphs = [para for para in document["content"].split("\n\n") if para.strip()]
    for para_index, para in enumerate(paragraphs):
        # Each paragraph needs its own id: documents sharing an id are treated
        # as duplicates by the document store, so only one paragraph per record
        # would survive. The first paragraph keeps the bare MongoDB id (ids
        # already in use stay valid); later ones get a "-<index>" suffix.
        doc_id = mongo_id if para_index == 0 else f"{mongo_id}-{para_index}"
        docs.append(
            Document(
                content=para,
                # "text" is the haystack content type for plain strings;
                # "str" is not one of the accepted values.
                content_type="text",
                id=doc_id,
                meta={"source": document["source"]},
            )
        )


In [12]:
# Sanity check: number of paragraph-level Document objects collected in `docs`.
len(docs)

98

In [13]:
# Create a FAISS-backed document store with a "Flat" index and write the
# paragraph documents into it.
document_store = FAISSDocumentStore(
    faiss_index_factory_str="Flat",
)
document_store.write_documents(documents=docs)

Writing Documents: 10000it [00:00, 37854.70it/s]         


In [14]:
# Dense Passage Retriever: one pretrained encoder for queries and another
# for passages, both attached to the document store created earlier.
retriever = DensePassageRetriever(
    document_store=document_store,
    query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
    passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
    max_seq_len_query=64,
    max_seq_len_passage=256,
    batch_size=16,
    use_gpu=True,
    embed_title=True,
    use_fast_tokenizers=True,
)

# Compute and store embeddings for the documents using this retriever.
document_store.update_embeddings(retriever=retriever)

INFO - haystack.modeling.utils -  Using devices: CUDA:0
INFO - haystack.modeling.utils -  Number of GPUs: 1
INFO - haystack.modeling.model.language_model -  LOADING MODEL
INFO - haystack.modeling.model.language_model -  Could not find facebook/dpr-question_encoder-single-nq-base locally.
INFO - haystack.modeling.model.language_model -  Looking on Transformers Model Hub (in local cache and online)...
INFO - haystack.modeling.model.language_model -  Loaded facebook/dpr-question_encoder-single-nq-base
INFO - haystack.modeling.model.language_model -  LOADING MODEL
INFO - haystack.modeling.model.language_model -  Could not find facebook/dpr-ctx_encoder-single-nq-base locally.
INFO - haystack.modeling.model.language_model -  Looking on Transformers Model Hub (in local cache and online)...
INFO - haystack.modeling.model.language_model -  Loaded facebook/dpr-ctx_encoder-single-nq-base
INFO - haystack.document_stores.faiss -  Updating embeddings for 98 docs...
  cur_tensor = torch.tensor([sampl

In [15]:
# Load the reader model from the local "Saved Models" directory.
reader = FARMReader(
    model_name_or_path="Saved Models/roberta_base_squad2",
    use_gpu=True,
    num_processes=0,
)

INFO - haystack.modeling.utils -  Using devices: CUDA
INFO - haystack.modeling.utils -  Number of GPUs: 1
INFO - haystack.modeling.model.language_model -  LOADING MODEL
INFO - haystack.modeling.model.language_model -  Model found locally at Saved Models/roberta_base_squad2
INFO - haystack.modeling.model.language_model -  Loaded Saved Models/roberta_base_squad2
INFO - haystack.modeling.model.adaptive_model -  Found files for loading 1 prediction heads
INFO - haystack.modeling.model.prediction_head -  Loading prediction head from Saved Models\roberta_base_squad2\prediction_head_0.bin
INFO - haystack.modeling.data_handler.processor -  Initialized processor without tasks. Supply `metric` and `label_list` to the constructor for using the default task or add a custom task later via processor.add_task()
INFO - haystack.modeling.logger -  ML Logging is turned off. No parameters, metrics or artifacts will be logged to MLFlow.
INFO - haystack.modeling.utils -  Using devices: CUDA
INFO - haystack

In [16]:
# Combine the reader and the retriever into one extractive question-answering pipeline.
pipeline = ExtractiveQAPipeline(reader=reader,retriever=retriever)

In [18]:
# Run a sample query, passing top_k=10 to both the "Retriever" and the "Reader" node.
prediction = pipeline.run(
    query="Where is mits located?",
    params={
        "Retriever": {"top_k": 10},
        "Reader": {"top_k": 10},
    },
)

  start_indices = flat_sorted_indices // max_seq_len
Inferencing Samples: 100%|██████████| 1/1 [00:01<00:00,  1.07s/ Batches]


In [25]:
# Take the first entry of the returned answers (presumably the best-scoring
# one; confirm against the pipeline's ordering) and show the id of the
# document it refers to.
first_ans = prediction["answers"][0]
first_ans.document_id

'6213cd05abf8a36f04f5c720'

### Saving the models

In [26]:
# Persist the retriever and the document store under these paths; the
# reload cell further down reads them back from the same locations.
retriever.save("context_model_retriever_2")
document_store.save("document_store")

INFO - haystack.modeling.model.biadaptive_model -  prediction_head saving


In [2]:
# Rebuild the whole pipeline from the artifacts saved on disk: the FAISS
# document store, the reader model directory and the saved retriever.
tmp_doc_store = FAISSDocumentStore.load("document_store")
tmp_reader = FARMReader(
    model_name_or_path="Saved Models/roberta_base_squad2",
    use_gpu=True,
    num_processes=0,
)
tmp_retriever = DensePassageRetriever.load(
    load_dir="context_model_retriever_2",
    document_store=tmp_doc_store,
)
tmp_pipeline = ExtractiveQAPipeline(reader=tmp_reader, retriever=tmp_retriever)

INFO - haystack.modeling.utils -  Using devices: CUDA
INFO - haystack.modeling.utils -  Number of GPUs: 1
INFO - haystack.modeling.model.language_model -  LOADING MODEL
INFO - haystack.modeling.model.language_model -  Model found locally at Saved Models/roberta_base_squad2
INFO - haystack.modeling.model.language_model -  Loaded Saved Models/roberta_base_squad2
INFO - haystack.modeling.model.adaptive_model -  Found files for loading 1 prediction heads
INFO - haystack.modeling.model.prediction_head -  Loading prediction head from Saved Models\roberta_base_squad2\prediction_head_0.bin
INFO - haystack.modeling.data_handler.processor -  Initialized processor without tasks. Supply `metric` and `label_list` to the constructor for using the default task or add a custom task later via processor.add_task()
INFO - haystack.modeling.logger -  ML Logging is turned off. No parameters, metrics or artifacts will be logged to MLFlow.
INFO - haystack.modeling.utils -  Using devices: CUDA
INFO - haystack

In [3]:
# Query the reloaded pipeline. No `params` are passed here, so node settings
# such as top_k are left at whatever the pipeline defaults to.
predictions = tmp_pipeline.run("Where is MITS located")

  start_indices = flat_sorted_indices // max_seq_len
Inferencing Samples: 100%|██████████| 1/1 [00:01<00:00,  1.26s/ Batches]


In [7]:
# Pull the list of answers out of the pipeline's result dictionary.
answers = predictions["answers"]


In [9]:
# Show the document id of the first answer, for comparison with the id
# obtained from the pipeline before it was saved and reloaded.
answers[0].document_id

'6213cd05abf8a36f04f5c720'