In [15]:
from haystack.utils import   print_answers
from haystack.nodes import FARMReader
from haystack.document_stores import FAISSDocumentStore, InMemoryDocumentStore
from haystack.nodes import DensePassageRetriever, PreProcessor
from haystack.pipelines import ExtractiveQAPipeline
from pymongo import MongoClient
from haystack.schema import Document
import pandas as pd


In [4]:

db_client = MongoClient(host="localhost", port=27017)
database = db_client['Website_Chatbot']
collection = database["MITS"]


In [60]:
df = pd.read_csv("../../Deployment/Data/mits_data.csv")

df_docs = []

for row in df.itertuples(index=False):
    content = row.content
    id = row._0
    source = row.source
    for para in content.split("\n\n"):
        if not para.strip():
            continue

        tmp_doc = Document(content=para)
        tmp_doc.id = str(id)
        tmp_doc.meta = {"source":source}
        tmp_doc.content_type = "str"
        df_docs.append(tmp_doc)


In [61]:
df_docs

[<Document: {'content': 'applications invited for the post of asst./assoc. professor in computer science and engineering/ mathematics. mail your resume to 2nd international conference on advances in manufacturing and material science organized jointly by department of mechanical engineering, mits and department of polymer technology, gdańsk university of technology, poland, on july 7-9, 2022. go to website happy to inform that mits secured 2nd position among 134 engineering colleges in the state for batch of 2017-21 in terms of , and 4th position among 134 engineering colleges in the state in terms of . everyone is requested to abide by the instructions issued by the government. lets play our role in the battle against covid-19. mits online distinguished lecture series. click .departments of cse, ece, eee, and me accredited by national board of accreditation for 3 years (2019-22). we wholeheartedly thank the management of muthoot group, faculty, staff, students, parents, and alumni of 

In [5]:
docs = [] 
for document in collection.find({}):
    content = document["content"]
    for para in content.split("\n\n"):
        if not para.strip():
            continue

        tmp_doc = Document(content=para)
        tmp_doc.id = str(document["_id"])
        tmp_doc.meta = {"source":document['source'], }
        tmp_doc.content_type = "str"
        docs.append(tmp_doc)


In [16]:

new_docs = [] 
for document in collection.find({}):
    content = document["content"]
    for para in content.split("\n\n"):
        if not para.strip():
            continue

        tmp_doc = dict() #Document(content=para)
        tmp_doc["content"] = para
        tmp_doc["id"] = str(document["_id"])
        tmp_doc["meta"] = {"source":document['source'],"name":str(document["_id"]) }
        # tmp_doc.content_type = "str"
        new_docs.append(tmp_doc)

In [12]:
preprocessor = PreProcessor(clean_header_footer=True, split_by=None)
clean_docs = preprocessor.process(docs)
# preprocessor.clean(docs, clean_whitespace=True, clean_header_footer=True, clean_empty_lines=True)


100%|██████████| 20/20 [00:00<?, ?docs/s]


In [15]:
document_store = InMemoryDocumentStore() #FAISSDocumentStore(faiss_index_factory_str="Flat")
document_store.write_documents(docs)

INFO - haystack.modeling.utils -  Using devices: CUDA:0
INFO - haystack.modeling.utils -  Number of GPUs: 1


In [16]:
retriever = DensePassageRetriever(document_store=document_store,
                                 query_embedding_model='facebook/dpr-question_encoder-single-nq-base',
                                 passage_embedding_model='facebook/dpr-ctx_encoder-single-nq-base',
                                 max_seq_len_query=64,
                                 max_seq_len_passage=512,
                                 batch_size=16,
                                 use_gpu=True,
                                 embed_title=True,
                                 use_fast_tokenizers=True,
                                 similarity_function="cosine")
document_store.update_embeddings(retriever)

INFO - haystack.modeling.utils -  Using devices: CUDA:0
INFO - haystack.modeling.utils -  Number of GPUs: 1
INFO - haystack.modeling.model.language_model -  LOADING MODEL
INFO - haystack.modeling.model.language_model -  Could not find facebook/dpr-question_encoder-single-nq-base locally.
INFO - haystack.modeling.model.language_model -  Looking on Transformers Model Hub (in local cache and online)...
INFO - haystack.modeling.model.language_model -  Loaded facebook/dpr-question_encoder-single-nq-base
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'DPRQuestionEncoderTokenizer'. 
The class this function is called from is 'DPRContextEncoderTokenizerFast'.
INFO - haystack.modeling.model.language_model -  LOADING MODEL
INFO - haystack.modeling.model.language_model -  Could not find facebook/dpr-ctx_encoder-single-nq-base locally.
IN

In [18]:
reader = FARMReader(model_name_or_path="Saved Models/roberta_base_squad2", use_gpu=True,
num_processes=0)

INFO - haystack.modeling.utils -  Using devices: CUDA:0
INFO - haystack.modeling.utils -  Number of GPUs: 1
INFO - haystack.modeling.model.language_model -  LOADING MODEL
INFO - haystack.modeling.model.language_model -  Model found locally at Saved Models/roberta_base_squad2
INFO - haystack.modeling.model.language_model -  Loaded Saved Models/roberta_base_squad2
INFO - haystack.modeling.model.adaptive_model -  Found files for loading 1 prediction heads
INFO - haystack.modeling.model.prediction_head -  Loading prediction head from Saved Models\roberta_base_squad2\prediction_head_0.bin
file Saved Models/roberta_base_squad2\config.json not found
INFO - haystack.modeling.data_handler.processor -  Initialized processor without tasks. Supply `metric` and `label_list` to the constructor for using the default task or add a custom task later via processor.add_task()
INFO - haystack.modeling.utils -  Using devices: CUDA
INFO - haystack.modeling.utils -  Number of GPUs: 1


In [19]:
pipeline = ExtractiveQAPipeline(reader=reader,retriever=retriever)

In [20]:
prediction = pipeline.run(query="Who is the principal of mits".lower(),
                         params = {"Retriever":{"top_k":10}, 
                                  "Reader":{"top_k":10}})

print_answers(prediction)                                

Inferencing Samples: 100%|██████████| 2/2 [00:02<00:00,  1.20s/ Batches]


Query: who is the principal of mits
Answers:
[   <Answer {'answer': 'dr.neelakandan', 'type': 'extractive', 'score': 0.8960450887680054, 'context': 'houswise by dividing the students into 4 houses. principal of mits, dr.neelakandan p c inaugurated the inter house game competitions on 25th april 202', 'offsets_in_document': [{'start': 2725, 'end': 2739}], 'offsets_in_context': [{'start': 68, 'end': 82}], 'document_id': '6289cc6dce019b3af0bd26b4', 'meta': {'source': 'https://mgmits.ac.in/life-mits/sports/'}}>,
    <Answer {'answer': 'dr. s ramkumar', 'type': 'extractive', 'score': 0.8773443698883057, 'context': ' dignitaries mr. p george varghese (executive director), principal- dr. s ramkumar, academic advisor – dr. a c mathai and all head of the departments.', 'offsets_in_document': [{'start': 5392, 'end': 5406}], 'offsets_in_context': [{'start': 68, 'end': 82}], 'document_id': '6289cc6dce019b3af0bd26b2', 'meta': {'source': 'https://mgmits.ac.in/life-mits/arts-club/'}}>,
    <Answer {




### Saving the models

In [21]:
retriever.save("context_model_retriever_2")
document_store.save("document_store_inmemory")

INFO - haystack.modeling.model.biadaptive_model -  prediction_head saving


AttributeError: 'InMemoryDocumentStore' object has no attribute 'save'

In [10]:
# tmp_doc_store = FAISSDocumentStore.load("document_store")
tmp_doc_store = InMemoryDocumentStore()
tmp_doc_store.write_documents(docs)
tmp_reader = FARMReader(model_name_or_path="Saved Models/roberta_base_squad2",
use_gpu=True, num_processes=0)
tmp_retriever = DensePassageRetriever.load("context_model_retriever_2", tmp_doc_store)
tmp_doc_store = tmp_doc_store.update_embeddings(tmp_retriever)
tmp_pipeline = ExtractiveQAPipeline(tmp_reader, tmp_retriever)

INFO - haystack.modeling.utils -  Using devices: CUDA:0
INFO - haystack.modeling.utils -  Number of GPUs: 1
INFO - haystack.modeling.utils -  Using devices: CUDA:0
INFO - haystack.modeling.utils -  Number of GPUs: 1
INFO - haystack.modeling.model.language_model -  LOADING MODEL
INFO - haystack.modeling.model.language_model -  Model found locally at Saved Models/roberta_base_squad2
INFO - haystack.modeling.model.language_model -  Loaded Saved Models/roberta_base_squad2
INFO - haystack.modeling.model.adaptive_model -  Found files for loading 1 prediction heads
INFO - haystack.modeling.model.prediction_head -  Loading prediction head from Saved Models\roberta_base_squad2\prediction_head_0.bin
file Saved Models/roberta_base_squad2\config.json not found
INFO - haystack.modeling.data_handler.processor -  Initialized processor without tasks. Supply `metric` and `label_list` to the constructor for using the default task or add a custom task later via processor.add_task()
INFO - haystack.modeli

In [11]:
predictions = tmp_pipeline.run("Where is MITS located")

  start_indices = flat_sorted_indices // max_seq_len
Inferencing Samples: 100%|██████████| 2/2 [00:02<00:00,  1.33s/ Batches]


In [12]:
answers = predictions["answers"]


In [14]:
answers

[<Answer {'answer': 'ernakulam', 'type': 'extractive', 'score': 0.8702623248100281, 'context': 'ering (cse) at the muthoot institute of technology and science (mits), ernakulam! as you glance through the faculty profile, i am sure that you will a', 'offsets_in_document': [{'start': 2452, 'end': 2461}], 'offsets_in_context': [{'start': 71, 'end': 80}], 'document_id': '6289cc71ce019b3af0bd26d6', 'meta': {'source': 'https://mgmits.ac.in/departments/computer-science-and-engineering/'}}>,
 <Answer {'answer': 'kerala', 'type': 'extractive', 'score': 0.5997016131877899, 'context': 'll games during sports and athletic meets among engineering colleges in kerala. we have always strived hard to excel in the fields of sports and games', 'offsets_in_document': [{'start': 125, 'end': 131}], 'offsets_in_context': [{'start': 72, 'end': 78}], 'document_id': '6289cc6dce019b3af0bd26b4', 'meta': {'source': 'https://mgmits.ac.in/life-mits/sports/'}}>,
 <Answer {'answer': 'industrial suburb of kochi', 'type