<a href="https://colab.research.google.com/github/cmbhatt1/abstractive-question-answering-system/blob/main/Chinmaysmedbot.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Installing all the libraries
!pip install huggingface_hub
!pip install transformers
!pip install faker
!pip install datasets
!pip install wikipedia-api
%pip install -Uq chromadb numpy datasets
!pip install --upgrade gradio

In [None]:
# Create a Hugging Face Hub API client object.
# NOTE(review): the original note says this step is optional and only avoids a warning later on;
# no API key/token is actually passed or fetched here - confirm whether a login step is intended.
from huggingface_hub import HfApi
my_api = HfApi()


In [None]:
# Load the BART conditional-generation model and its tokenizer from the Hugging Face Hub.
# Both are loaded from the same checkpoint, so the name is kept in one place.
from transformers import BartTokenizer, BartForConditionalGeneration

LFQA_CHECKPOINT = 'vblagoje/bart_lfqa'

tokenizer = BartTokenizer.from_pretrained(LFQA_CHECKPOINT)
generator = BartForConditionalGeneration.from_pretrained(LFQA_CHECKPOINT)


In [None]:
# Wikipedia has several "main" categories; one of them covers health and fitness
# (https://en.wikipedia.org/wiki/Wikipedia:Contents/Categories#Health_and_fitness).
# That main category has many other categories attached to it; the ones used here are stored
# in `list_of_categories` in a later cell. Each of those categories has pages (not subcategories)
# associated with it, and those pages are what need to be scraped.

# Instantiate the Wikipedia client: first argument is the user-agent string, second is the language code.
import wikipediaapi
wiki_wiki = wikipediaapi.Wikipedia('healthbot (cmbhatt99@gmail.com)','en')

def scrape_category_pages(category_name):
    """Collect the summary of every article that belongs to a Wikipedia category.

    Args:
        category_name: Category title without the "Category:" prefix
            (for example "Pharmacy"); the prefix is added here.

    Returns:
        A dict mapping each member article's title to its summary text.
        Only members in the main (article) namespace are included, so
        subcategories and other non-article members are skipped.
    """
    # Get the category page itself; its members are the pages filed under it.
    category_page = wiki_wiki.page("Category:" + category_name)

    pages_with_summary = {}
    for member in category_page.categorymembers.values():
        # Keep real articles only (main namespace), not subcategories etc.
        if member.ns == wikipediaapi.Namespace.MAIN:
            # Look the article up by title and store its summary.
            page = wiki_wiki.page(member.title)
            pages_with_summary[member.title] = page.summary

    return pages_with_summary





In [None]:
# Categories whose member articles are scraped. Each category has multiple articles/pages dedicated to it.
list_of_categories = [
    'Diseases and disorders',
    'Pharmacy',
    'Symptoms and signs',
    'Pathology',
    'Mental health',
    'Pediatrics',
    'Nutrition',
    'Gynaecology',
    'Ophthalmology',
    'Physical exercise',
    'Health care occupations',
    'Phytochemicals',
    'Symptoms and signs of mental disorders',
    'Neurology',
    'Dietary supplements',
    'Surgery',
    'Occupational safety and health',
    'Chemical substances for emergency medicine',
    'Medical and health organisations based in India',
    'Pharmaceutical companies of India',
    'Cause (medicine)',
    'Symptoms',
    'Symptoms and signs: Urinary system',
    'Symptoms and signs: Circulatory system',
    'Medical associations based in India',
    'Geriatrics',
    'Gastroenterology',
]

# One {title: summary} dict per category, in the same order as list_of_categories.
# Data cleaning happens on the way in: every "\n" in a summary is replaced with "".
summary = [
    {
        title: text.replace("\n", "")
        for title, text in scrape_category_pages(category).items()
    }
    for category in list_of_categories
]




In [None]:
# The "Lists of diseases" category is structured a bit differently. Its members are pages named
# "List of diseases (A)", "List of diseases (B)", ... which in turn link to the actual disease
# articles, so the previous scraping technique doesn't work: we have to follow the links.
dict_of_diseases = {}
category_page_diseases = wiki_wiki.page('Category:' + 'Lists of diseases')
for member in category_page_diseases.categorymembers.values():
    # Only follow links from real articles (main namespace), not subcategories.
    if member.ns != wikipediaapi.Namespace.MAIN:
        continue
    page_py = wiki_wiki.page(member.title)
    for key in page_py.links:
        # The same title can be linked from several list pages; fetch each summary only once
        # so we do not repeat a network request for a result we already have.
        if key not in dict_of_diseases:
            dict_of_diseases[key] = wiki_wiki.page(key).summary

# Keep only non-trivial summaries (more than 5 characters) and strip newlines.
knowledge_base_diseases = [
    value.replace("\n", "")
    for value in dict_of_diseases.values()
    if len(value) > 5
]




In [None]:
# The knowledge base (our retrieval context) is stored in Chroma, an open-source vector database.
# The scraped data has the shape [{"headache": "Headache, also known as cephalalgia, is the symptom..."}, ...],
# but the titles (keys) are not needed for retrieval, so only the summary texts (values) are kept.
# The documents are passed to Chroma as a flat list rather than a dictionary.
import chromadb

client = chromadb.Client()
collection = client.create_collection("Chinmaymedbot1")

# Flatten the per-category dicts into one list of summaries, then add the disease summaries.
knowledge_base = []
for page_summaries in summary:
    knowledge_base.extend(page_summaries.values())
knowledge_base += knowledge_base_diseases

document_count = len(knowledge_base)
collection.add(
    ids=list(map(str, range(document_count))),  # IDs are just the list positions as strings
    documents=knowledge_base,
    metadatas=[{"type": "support"} for _ in range(document_count)],
)

In [None]:
# This is synthetic data generated with the help of Faker. It contains random patient names,
# doctors and some of the fields you might see in a hospital database.
import pandas as pd
import random
from faker import Faker
from datetime import datetime, timedelta

# Seed both random sources: `random` drives the numeric fields and the specialty choice,
# while Faker keeps its own generator for names and dates, so seeding `random` alone
# would leave those values different on every run.
random.seed(42)
fake = Faker()
Faker.seed(42)

# Column name -> list of values; one entry is appended to every list per patient.
data = {
    'id': [],
    'Name': [],
    'Visit_number': [],
    'doctor_name': [],
    'specialty': [],
    'Expected time in minutes': [],
    'last_visit': []
}

# List of doctor specialties
specialties = ['Dentist', 'ENT', 'Opthalmalogist', 'Gynaecologist', 'Orthopedist', 'Pediatrician']


# Generate synthetic data for 1000 patients (ids 1..1000)
for i in range(1000):
    data['id'].append(i + 1)
    data['Name'].append(fake.name())
    data['Visit_number'].append(random.randint(1, 5))  # Random visit number between 1 and 5
    data['doctor_name'].append(fake.name())
    specialty = random.choice(specialties)
    data['specialty'].append(specialty)
    data['Expected time in minutes'].append(random.randint(10, 30))

    # Generate last visit date (within the last year, relative to the moment this runs)
    last_visit_date = fake.date_time_between(start_date='-1y', end_date='now')
    data['last_visit'].append(last_visit_date.strftime('%Y-%m-%d %H:%M:%S'))

# Create DataFrame
df = pd.DataFrame(data)



In [None]:
# Created a chat interface which can be used by both patients and hospital management. The patients have to type in yes and the medbot will try to answer questions related to healthcare domain. If no is typed, the bot will ask for a patient id which when provided fetches detail of the patient.
import gradio as gr
import time

def respond(message):
    """Answer a question by retrieving context from Chroma and generating with BART.

    The five closest documents in `collection` are joined into a single context
    string, combined with the question, and passed to `generator`.

    Args:
        message: The user's question text.

    Returns:
        The decoded text of the single generated answer.
    """
    # Retrieve the 5 stored documents closest to the question.
    retrieved = collection.query(query_texts=message, n_results=5)
    passages = retrieved['documents'][0]

    # Each passage is introduced by a "<P>" marker in the model input.
    context = "<P> " + " <P> ".join(passages)
    prompt = f"question: {message} context: {context}"

    encoded = tokenizer(prompt, truncation=True, padding=True, return_tensors="pt")

    # Beam search (no sampling) returning one sequence of 10 to 256 tokens.
    generation_options = {
        "min_length": 10,
        "max_length": 256,
        "do_sample": False,
        "early_stopping": True,
        "num_beams": 8,
        "temperature": 1.0,
        "top_k": None,
        "top_p": None,
        "eos_token_id": tokenizer.eos_token_id,
        "no_repeat_ngram_size": 3,
        "num_return_sequences": 1,
    }
    output_ids = generator.generate(
        input_ids=encoded["input_ids"],
        attention_mask=encoded["attention_mask"],
        **generation_options,
    )

    decoded = tokenizer.batch_decode(
        output_ids,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=True,
    )
    time.sleep(2)
    return decoded[0]

def management_query(patient_id, records=None):
    """Return a formatted, multi-line summary of one patient's record.

    Args:
        patient_id: Value to match against the "id" column.
        records: Optional DataFrame to search. It must have the columns
            "id", "Name", "Visit_number", "doctor_name", "specialty",
            "Expected time in minutes" and "last_visit". Defaults to the
            module-level `df`.

    Returns:
        A string describing the first matching row, or a "No patient found"
        message when no row has that id (instead of raising IndexError).
    """
    table = df if records is None else records

    # Filter once rather than once per field.
    matches = table.loc[table["id"] == patient_id]
    if matches.empty:
        return f"No patient found with id {patient_id}."

    row = matches.iloc[0]
    return (
        f'Patient: {row["Name"]},\n'
        f'Visit number: {row["Visit_number"]},\n'
        f'Doctor name: Dr. {row["doctor_name"]},\n'
        f'Specialty: {row["specialty"]},\n'
        f'Expected time: {row["Expected time in minutes"]},\n'
        f'Last visit: {row["last_visit"]}'
    )


def chat_func(message, history):
    """Route one chat turn to the patient Q&A bot or the patient-record lookup.

    The conversation starts with a yes/no choice: "yes" means the user is a
    patient and later messages are answered by `respond`; "no" means hospital
    management and later messages are treated as patient ids for
    `management_query`. The choice is matched case-insensitively, and the
    user is re-prompted until a valid choice has been made.

    Args:
        message: The text the user just sent.
        history: Previous turns as supplied by the chat UI. Both
            (user, bot) pairs and {"role": ..., "content": ...} dicts are
            accepted.

    Returns:
        The bot's reply as a string (never None).
    """
    mode = _selected_mode(history)

    if mode is None:
        # No valid choice yet: this message must be the yes/no answer.
        choice = message.strip().lower()
        if choice == "yes":
            return "Ask me anything!"
        if choice == "no":
            return "Please put in the patient id"
        return 'Please type "yes" if you are a patient, or "no" to look up a patient id.'

    if mode == "no":
        try:
            patient_id = int(message)
        except ValueError:
            return "Please enter a numeric patient id."
        return management_query(patient_id)

    # Generate the answer once; the chat UI records the turn in history itself.
    return respond(message)


def _selected_mode(history):
    """Return "yes" or "no" for the first such user message in history, else None."""
    for turn in history or []:
        if isinstance(turn, dict):
            # Dict-style history: only user turns can carry the choice.
            if turn.get("role") != "user":
                continue
            text = turn.get("content")
        else:
            # Pair-style history: the user's message comes first.
            text = turn[0]
        if isinstance(text, str):
            choice = text.strip().lower()
            if choice in ("yes", "no"):
                return choice
    return None


# Assemble the chat UI around chat_func and start it.
chat_window = gr.Chatbot(height=300)
message_box = gr.Textbox(placeholder="Type yes if you are a patient", container=False, scale=7)

demo = gr.ChatInterface(
    chat_func,
    chatbot=chat_window,
    textbox=message_box,
    title="Chinmay's Medbot",
    description="Abstractive question answering system",
    theme="soft",
)
demo.launch(debug=True)