In [1]:
from pymongo import MongoClient
from transformers import AutoModelForQuestionAnswering, AutoTokenizer, pipeline
from haystack.schema import Document
from sentence_transformers import SentenceTransformer,CrossEncoder, util
from nltk import ngrams
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm
INFO - haystack.document_stores.base -  Numba not found, replacing njit() with no-op implementation. Enable it with 'pip install numba'.
INFO - haystack.modeling.model.optimization -  apex not found, won't use it. See https://nvidia.github.io/apex/
ERROR - root -  Failed to import 'magic' (from 'python-magic' and 'python-magic-bin' on Windows). FileTypeClassifier will not perform mimetype detection on extensionless files. Please make sure the necessary OS libraries are installed if you need this functionality.


In [6]:

db_client = MongoClient(host="localhost", port=27017)
database = db_client['Website_Chatbot']
collection = database["MITS"]

model_name = "deepset/roberta-base-squad2"

retriever_model = SentenceTransformer('multi-qa-MiniLM-L6-cos-v1')
retriever_model.max_seq_length = 256
cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')


reader_model = AutoModelForQuestionAnswering.from_pretrained(model_name)
reader_tokenizer = AutoTokenizer.from_pretrained(model_name)

In [19]:
docs = [] 
for document in collection.find({}):
    content = document["content"].split()
    # print(" ".join([content[0+i : 256+i] for i in range(0, len(content), 256)]))
    for i in range(0, len(content), 256):
        docs.append(" ".join(content[i: 256+i]))


In [58]:
query = "What is eco club"
query_embedding = retriever_model.encode(query.lower(), convert_to_tensor=True)
document_embedding = retriever_model.encode(docs, convert_to_tensor=True, show_progress_bar=True)


Batches: 100%|██████████| 4/4 [00:00<00:00,  9.55it/s]


In [59]:
score = util.semantic_search(query_embedding, document_embedding, top_k=5)
# best_para = np.argmax(score)

In [60]:
score

[[{'corpus_id': 12, 'score': 0.7706955671310425},
  {'corpus_id': 13, 'score': 0.5434941053390503},
  {'corpus_id': 100, 'score': 0.4028398394584656},
  {'corpus_id': 77, 'score': 0.36827030777931213},
  {'corpus_id': 87, 'score': 0.35861924290657043}]]

In [24]:
cross_inp = [[query, docs[hit['corpus_id']]] for hit in score[0]]
cross_scores = cross_encoder.predict(cross_inp)

In [66]:
nlp = pipeline("question-answering", model = reader_model, tokenizer = reader_tokenizer)

qa_input = {"question":query, 
"context":docs[77]}

result = nlp(qa_input)
result

{'score': 0.010533596388995647,
 'start': 1293,
 'end': 1303,
 'answer': 'illuminati'}

In [10]:
prediction = pipeline.run(query="Who is the principal of MITS",
                         params = {"Retriever":{"top_k":10}, 
                                  "Reader":{"top_k":10}})

print_answers(prediction)                                

Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00,  1.55 Batches/s]


Query: Who is the principal of MITS
Answers:
[   <Answer {'answer': 'Dr Chikku Abraham', 'type': 'extractive', 'score': 0.6559404730796814, 'context': 'llege Executive Director Mr.P George Varghese, the acting Pricipal Dr Chikku Abraham and the HODs of various departments were the dignitaries on the d', 'offsets_in_document': [{'start': 3880, 'end': 3897}], 'offsets_in_context': [{'start': 67, 'end': 84}], 'document_id': '6213cd02abf8a36f04f5c6d4', 'meta': {'source': 'http://mgmits.ac.in/life-mits/sports/', 'vector_id': '32'}}>,
    <Answer {'answer': 'Dr Chikku Abraham', 'type': 'extractive', 'score': 0.20968271791934967, 'context': 'Vice Principal Muthoot Institute of Technology and Science (MITS) Dr Chikku Abraham is currently the Vice- Principal and Associate Professor in Electr', 'offsets_in_document': [{'start': 66, 'end': 83}], 'offsets_in_context': [{'start': 66, 'end': 83}], 'document_id': '6213cd07abf8a36f04f5c750', 'meta': {'source': 'http://mgmits.ac.in/mits/executive-body




### Saving the models

In [26]:
retriever.save("context_model_retriever_2")
document_store.save("document_store")

INFO - haystack.modeling.model.biadaptive_model -  prediction_head saving


In [2]:
tmp_doc_store = FAISSDocumentStore.load("document_store")
tmp_reader = FARMReader(model_name_or_path="Saved Models/roberta_base_squad2",
use_gpu=True, num_processes=0)
tmp_retriever = DensePassageRetriever.load("context_model_retriever_2", tmp_doc_store)
tmp_pipeline = ExtractiveQAPipeline(tmp_reader, tmp_retriever)

INFO - haystack.modeling.utils -  Using devices: CUDA
INFO - haystack.modeling.utils -  Number of GPUs: 1
INFO - haystack.modeling.model.language_model -  LOADING MODEL
INFO - haystack.modeling.model.language_model -  Model found locally at Saved Models/roberta_base_squad2
INFO - haystack.modeling.model.language_model -  Loaded Saved Models/roberta_base_squad2
INFO - haystack.modeling.model.adaptive_model -  Found files for loading 1 prediction heads
INFO - haystack.modeling.model.prediction_head -  Loading prediction head from Saved Models\roberta_base_squad2\prediction_head_0.bin
INFO - haystack.modeling.data_handler.processor -  Initialized processor without tasks. Supply `metric` and `label_list` to the constructor for using the default task or add a custom task later via processor.add_task()
INFO - haystack.modeling.logger -  ML Logging is turned off. No parameters, metrics or artifacts will be logged to MLFlow.
INFO - haystack.modeling.utils -  Using devices: CUDA
INFO - haystack

In [3]:
predictions = tmp_pipeline.run("Where is MITS located")

  start_indices = flat_sorted_indices // max_seq_len
Inferencing Samples: 100%|██████████| 1/1 [00:01<00:00,  1.26s/ Batches]


In [7]:
answers = predictions["answers"]


In [9]:
answers[0].document_id

'6213cd05abf8a36f04f5c720'