# Medical information retrieval system
This notebook takes as its input the data generated in the previous steps (pre-processing, cleaning, and other EDA tasks).

In [None]:
# importing all necessary libraries
import warnings
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.model_selection import train_test_split, cross_val_score
from statistics import mean
from nltk.corpus import wordnet
import requests
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from itertools import combinations
from time import time
from collections import Counter
import operator
from xgboost import XGBClassifier
import math
from sklearn.linear_model import LogisticRegression
warnings.simplefilter("ignore")

In [None]:
# Download every NLTK resource with nltk.download('all'), then request 'wordnet' explicitly
# (the WordNet corpus is what the synonym lookup and lemmatizer in later cells rely on).
import nltk
nltk.download('all')
nltk.download('wordnet')
# Presumably the `google` package supplies the `googlesearch` module imported in a later cell -- TODO confirm.
!pip install google

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /root/nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package alpino to /root/nltk_data...
[nltk_data]    |   Package alpino is already up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger is already up-
[nltk_data]    |       to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_ru is already
[nltk_data]    |       up-to-date!
[nltk_data]    | Downloading package basque_grammars to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package basque_grammars is already up-to-date!
[nltk_data]    | Downloading package bcp47 to /root/nltk_data...
[nltk_data]    |   Package bcp47 is already up-to-dat




# **Synonyms function-**
It retrieves synonyms of a symptom entered by the user, improving accuracy by reducing mismatches between user-entered symptoms and those in the dataset. Synonyms are sourced from Thesaurus.com and WordNet, which helps keep predictions robust even when the user's wording differs slightly from the terminology in the model's training data.

In [None]:
# returns the set of synonyms of the input word from thesaurus.com (https://www.thesaurus.com/) and wordnet (https://www.nltk.org/howto/wordnet.html)
def synonyms(term):
    """Return the set of synonyms of ``term`` from thesaurus.com and WordNet.

    The thesaurus.com scrape is best-effort: if the request fails or the page
    does not contain the expected markup, only the WordNet lemma names are
    returned.

    Args:
        term: Word or space-separated phrase to look up.

    Returns:
        set[str]: Synonym strings. WordNet lemma names may contain
        underscores in place of spaces.
    """
    # Local import so the cell does not depend on an edit to the imports cell.
    from urllib.parse import quote

    found = set()

    # --- thesaurus.com (best-effort) ---
    try:
        response = requests.get(
            # Escape the term so spaces, slashes, '?' or '#' cannot alter the URL.
            'https://www.thesaurus.com/browse/{}'.format(quote(term, safe='')),
            timeout=10,  # never hang the notebook on an unresponsive site
        )
    except requests.RequestException:
        response = None  # offline / blocked: fall back to WordNet only

    if response is not None:
        soup = BeautifulSoup(response.content, "html.parser")
        container = soup.find('section', {'class': 'MainContentContainer'})
        card = None
        if container is not None:
            card = container.find('div', {'class': 'css-191l5o0-ClassicContentCard'})
        if card is not None:
            found.update(item.get_text() for item in card.find_all('li'))

    # --- WordNet ---
    for syn in wordnet.synsets(term):
        found.update(syn.lemma_names())
    return found

In [None]:
# Utilities for pre-processing: English stop-word list, WordNet lemmatizer and a
# tokenizer that splits on runs of word characters.
# NOTE(review): none of these three names is referenced by the later cells visible here.
stop_words = stopwords.words('english')
lemmatizer = WordNetLemmatizer()
splitter = RegexpTokenizer(r'\w+')

**Load our datasets and install necessary libraries**

In [None]:
# Install spaCy and download its small English pipeline, which a later cell
# loads with spacy.load("en_core_web_sm").
!pip install spacy
!python -m spacy download en_core_web_sm


Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m19.9 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:

# Load Dataset
# Scraping and creation of the dataset CSVs is done in a separate program
df_comb = pd.read_csv("/content/dis_sym_dataset_comb.csv") # Disease combination
df_norm = pd.read_csv("/content/dis_sym_dataset_norm.csv") # Individual Disease

# 5-fold cross-validation of logistic regression on the combination dataset.
# The mean of `scores` is used later to scale the displayed match scores.
X = df_comb.iloc[:, 1:]
Y = df_comb.iloc[:, 0:1]

# cross_val_score clones and fits the estimator on every fold, so a separate
# lr.fit(X, Y) beforehand would only be a redundant full training run.
# The labels are flattened to 1-D, which is the shape scikit-learn expects.
lr = LogisticRegression()
scores = cross_val_score(lr, X, Y.values.ravel(), cv=5)

# From here on X / Y refer to the individual-disease dataset; Y stays a
# one-column DataFrame because later cells index it as Y['label_dis'].
X = df_norm.iloc[:, 1:]
Y = df_norm.iloc[:, 0:1]

# List of symptoms (feature column names, in column order)
dataset_symptoms = list(X.columns)
#print(dataset_symptoms)

In [None]:
# Take the user's comma-separated symptoms, normalise each one (lower-case,
# hyphens -> spaces, apostrophes removed, spaCy lemmatisation) and print the result.
import spacy
# Load the English language model
nlp = spacy.load("en_core_web_sm")

# Taking symptoms from user as input
user_symptoms = input("Please enter symptoms separated by comma(,):\n").lower().split(',')

# Preprocessing the input symptoms
processed_user_symptoms = []
for sym in user_symptoms:
    sym = sym.strip().replace('-', ' ').replace("'", '')
    if not sym:
        # A trailing or doubled comma yields an empty entry; skip it so no
        # empty query is expanded and matched downstream.
        continue
    # Lemmatize using spaCy
    doc = nlp(sym)
    processed_user_symptoms.append(' '.join(token.lemma_ for token in doc))

print()
print("*****************************")
print()
print("After Pre-processing the user input")
for i in processed_user_symptoms:
    print(i,end=", ")
#sample input: coughing,fever,loss of smell,sneezing
#stomach ache

Please enter symptoms separated by comma(,):
Nausea,Vomiting,Abdominal pain,Headache,Loss of appetite,Fever

*****************************

After Pre-processing the user input
nausea, vomit, abdominal pain, headache, loss of appetite, fever, 

In [None]:
# Query expansion: for every pre-processed symptom, look up synonyms of each
# ordered word combination (all sizes) and merge them, together with the
# symptom itself, into one space-separated string per symptom.
user_symptoms = []
for symptom in processed_user_symptoms:
    words = symptom.split()
    expanded = set()
    for size in range(1, len(words) + 1):
        for word_group in combinations(words, size):
            expanded.update(synonyms(' '.join(word_group)))
    # Always keep the symptom as the user phrased it
    expanded.add(' '.join(words))
    # WordNet lemma names use '_' between words; turn them back into spaces
    user_symptoms.append(' '.join(expanded).replace('_', ' '))

print("After query expansion done by using the symptoms entered")
print(user_symptoms)

After query expansion done by using the symptoms entered
['sickness nausea', 'purge regorge upchuck vomiting puke cast regurgitation regurgitate vomit up nauseant barf spue puking vomitive retch vomitus disgorgement emesis spew disgorge emetic honk vomit sick be sick throw up cat chuck', 'trouble annoyance abdominal pain abdominal hurting botheration anguish pain in the ass ail hurt pain in the neck abdominal muscle pain pain sensation ab painfulness infliction nuisance bother painful sensation', 'worry concern vexation headache cephalalgia head ache', 'passing going expiration appetency appetite appetence red deprivation loss exit loss of appetite release personnel casualty red ink departure', 'febrility pyrexia feverishness febricity fever']


In [None]:
# Loop over all the symptoms in the dataset and score each against every expanded
# user query: score = (words of the dataset symptom present in the query) / (words
# in the dataset symptom). If score > 0.4, add the symptom to the final list.

# Tokenise each expanded query once, instead of re-splitting it for every word
# of every dataset symptom; a set also makes the membership test constant-time.
user_token_sets = [set(user_sym.split()) for user_sym in user_symptoms]

found_symptoms = set()
for data_sym in dataset_symptoms:
    data_sym_split = data_sym.split()
    if not data_sym_split:
        continue  # a blank column name would otherwise divide by zero
    for user_tokens in user_token_sets:
        count = sum(1 for symp in data_sym_split if symp in user_tokens)
        if count / len(data_sym_split) > 0.4:
            found_symptoms.add(data_sym)
            break  # already matched; further queries cannot change the result
found_symptoms = list(found_symptoms)

In [None]:
# Display the dataset symptoms that matched the expanded user query
found_symptoms

['lower abdominal pain',
 'muscle spasm',
 'eye pain',
 'nausea',
 'webbed neck',
 'attack pain',
 'red',
 'hair loss',
 'muscle cramp',
 'neck stiffness',
 'fever',
 'abnormal sensation',
 'depending subtype abdominal pain',
 'poor appetite',
 'abdominal cramp',
 'trouble sensation',
 'severe pain',
 'bone pain',
 'chest pain',
 'decreased appetite',
 'muscle joint pain',
 'red skin',
 'testicular pain',
 'loss appetite',
 'small head',
 'muscle weakness',
 'ear pain',
 'trouble talking',
 'trouble walking',
 'neck',
 'painful skin',
 'red rash',
 'pulsing pain',
 'stomach pain',
 'hearing loss',
 'weak muscle',
 'vomiting',
 'little pain',
 'pain area',
 'trouble swallowing',
 'pain sex',
 'abdominal distention',
 'loss smell',
 'upper abdominal pain',
 'nausea vomiting',
 'red eye',
 'trouble sleeping',
 'vision loss',
 'widespread pain',
 'stiff neck',
 'tooth loss',
 'trouble speaking',
 'trouble coordination',
 'headache',
 'painful',
 'right lower abdominal pain',
 'stiff muscle

In [None]:
# Print all found symptoms
print("Top matched symptoms from your search:")
for idx, symp in enumerate(found_symptoms):
    print(idx,":",symp)

# Show the related symptoms found in the dataset and ask user to select among them
select_list = input("\nPlease select the relevant symptoms. Enter indices (space-separated):\n").split()

# Collect the confirmed symptoms and every disease that lists at least one of them
dis_list = set()
final_symp = []
counter_list = []
for idx in select_list:
    # Validate the entry: non-numeric text used to raise ValueError, and a
    # negative number silently picked an item from the end of the list.
    try:
        position = int(idx)
    except ValueError:
        position = -1
    if not 0 <= position < len(found_symptoms):
        print("Ignoring invalid index:", idx)
        continue
    symp = found_symptoms[position]
    if symp in final_symp:
        continue  # same index entered twice
    final_symp.append(symp)
    dis_list.update(df_norm.loc[df_norm[symp] == 1, 'label_dis'])

# Find other relevant symptoms from the dataset based on the highest co-occurrence
# with the ones selected by the user: every non-selected symptom of every
# candidate disease is recorded once per disease in counter_list.
for dis in dis_list:
    row = df_norm.loc[df_norm['label_dis'] == dis].values.tolist()
    row[0].pop(0)  # drop the label column so positions line up with dataset_symptoms
    for idx, val in enumerate(row[0]):
        if val != 0 and dataset_symptoms[idx] not in final_symp:
            counter_list.append(dataset_symptoms[idx])

Top matched symptoms from your search:
0 : lower abdominal pain
1 : muscle spasm
2 : eye pain
3 : nausea
4 : webbed neck
5 : attack pain
6 : red
7 : hair loss
8 : muscle cramp
9 : neck stiffness
10 : fever
11 : abnormal sensation
12 : depending subtype abdominal pain
13 : poor appetite
14 : abdominal cramp
15 : trouble sensation
16 : severe pain
17 : bone pain
18 : chest pain
19 : decreased appetite
20 : muscle joint pain
21 : red skin
22 : testicular pain
23 : loss appetite
24 : small head
25 : muscle weakness
26 : ear pain
27 : trouble talking
28 : trouble walking
29 : neck
30 : painful skin
31 : red rash
32 : pulsing pain
33 : stomach pain
34 : hearing loss
35 : weak muscle
36 : vomiting
37 : little pain
38 : pain area
39 : trouble swallowing
40 : pain sex
41 : abdominal distention
42 : loss smell
43 : upper abdominal pain
44 : nausea vomiting
45 : red eye
46 : trouble sleeping
47 : vision loss
48 : widespread pain
49 : stiff neck
50 : tooth loss
51 : trouble speaking
52 : trouble c

In [None]:
# Tally how often each candidate symptom co-occurs with the user's selection,
# then rank the (symptom, count) pairs from most to least frequent.
symptom_counts = Counter(counter_list)
dict_symp = dict(symptom_counts)
dict_symp_tup = symptom_counts.most_common()
#print(dict_symp_tup)

In [None]:
# Iteratively suggest the top co-occurring symptoms to the user, five at a time,
# and ask which of them apply. Confirmed symptoms are appended to final_symp.
# The batch lives in its own variable so that `found_symptoms` from the earlier
# matching cell is not overwritten and those cells stay re-runnable.
suggestion_batch = []
for count, tup in enumerate(dict_symp_tup, start=1):
    suggestion_batch.append(tup[0])
    if count % 5 != 0 and count != len(dict_symp_tup):
        continue  # keep filling the current batch

    print("\nCommon co-occuring symptoms:")
    print("********************************")
    for idx, ele in enumerate(suggestion_batch):
        print(idx,":",ele)
    select_list = input("Do you have have of these symptoms? If Yes, enter the indices (space-separated), 'no' to stop, '-1' to skip:\n").lower().split()

    # The answer is already lower-cased, so a single comparison covers
    # 'no' / 'No' / 'NO'. An empty answer is treated like '-1' (skip)
    # instead of raising IndexError on select_list[0].
    if select_list and select_list[0] == 'no':
        break
    if select_list and select_list[0] != '-1':
        for idx in select_list:
            try:
                position = int(idx)
            except ValueError:
                position = -1
            if not 0 <= position < len(suggestion_batch):
                print("Ignoring invalid index:", idx)
                continue
            if suggestion_batch[position] not in final_symp:
                final_symp.append(suggestion_batch[position])
    suggestion_batch = []


Common co-occuring symptoms:
********************************
0 : headache
1 : testicular pain
2 : vomiting
3 : sore throat
4 : barky cough
Do you have have of these symptoms? If Yes, enter the indices (space-separated), 'no' to stop, '-1' to skip:
0 2 3

Common co-occuring symptoms:
********************************
0 : diarrhea
1 : confusion
2 : maculopapular rash
3 : feeling tired
4 : unintended weight loss
Do you have have of these symptoms? If Yes, enter the indices (space-separated), 'no' to stop, '-1' to skip:
0

Common co-occuring symptoms:
********************************
0 : swollen lymph node
1 : shortness breath
2 : runny nose
3 : chest pain
4 : large lymph node
Do you have have of these symptoms? If Yes, enter the indices (space-separated), 'no' to stop, '-1' to skip:
no


In [None]:
# Build the query vector: one slot per dataset symptom, set to 1 for every
# symptom the user confirmed, and echo the confirmed symptoms as we go.
print("\nFinal list of Symptoms that will be used for prediction:")
print("****************************************************************")
sample_x = [0] * len(dataset_symptoms)
for chosen in final_symp:
    print("*", chosen)
    position = dataset_symptoms.index(chosen)
    sample_x[position] = 1


Final list of Symptoms that will be used for prediction:
****************************************************************
* lower abdominal pain
* nausea
* webbed neck
* fever
* poor appetite
* abdominal cramp
* headache
* vomiting
* sore throat
* diarrhea


In [None]:
# Predict disease: train a fresh logistic-regression model on the current X / Y
# and get the per-class probabilities for the user's query vector.
lr = LogisticRegression().fit(X, Y)
prediction = lr.predict_proba([sample_x])

In [None]:
# Number of top-ranked diseases to keep
k = 10
# Sorted unique disease labels; topk indexes into this list
# (presumably this matches the order of lr.classes_ -- TODO confirm).
diseases = sorted(set(Y['label_dis']))
# Class indices of the k largest predicted probabilities, highest first
topk = prediction[0].argsort()[::-1][:k]

In [None]:
import re
from googlesearch import search
import warnings
warnings.filterwarnings("ignore")
import requests
from bs4 import BeautifulSoup

def _clean_infobox_cell(cell):
    """Turn one infobox ``<td>`` element into plain text."""
    text = str(cell)
    text = text.replace('.', '')
    text = text.replace(';', ',')
    text = text.replace('<b>', '<b> \n')
    text = re.sub(r'<a.*?>', '', text)
    text = re.sub(r'</a>', '', text)
    text = re.sub(r'<[^<]+?>', ' ', text)
    # Non-greedy: remove each bracketed citation marker on its own. The greedy
    # form deleted everything between the first '[' and the last ']'.
    text = re.sub(r'\[.*?\]', '', text)
    text = text.replace("&gt", ">")
    return text


# Take input a disease and return the content of wikipedia's infobox for that specific disease
def diseaseDetail(term):
    """Return the Wikipedia infobox of a disease as a ``{header: text}`` dict.

    Searches Google for "<term> wikipedia", opens the results whose URL contains
    "wikipedia" in order, and stops at the first page that yields at least one
    infobox row.

    Args:
        term: Disease name to look up.

    Returns:
        dict[str, str]: Infobox row header -> cleaned cell text. Empty when no
        result yields an infobox row.
    """
    info_dict = {}
    # search "disease wikipedia" on google
    query = term + ' wikipedia'
    for sr in search(query, tld="co.in", stop=10, pause=0.5):
        if 'wikipedia' not in sr:
            continue
        try:
            # TLS verification is left enabled (the previous verify=False
            # accepted any certificate) and a timeout bounds a stalled request.
            wiki = requests.get(sr, timeout=10)
        except requests.RequestException:
            continue  # unreachable result: try the next one
        soup = BeautifulSoup(wiki.content, 'html5lib')
        # Fetch HTML code for 'infobox'
        info_table = soup.find("table", {"class": "infobox"})
        if info_table is None:
            continue
        for row in info_table.find_all("tr"):
            data = row.find("th", {"scope": "row"})
            cell = row.find("td")
            if data is None or cell is None:
                # Rows without a header or a value carry no field; a missing
                # <td> used to be stored as the literal string "None".
                continue
            info_dict[data.get_text()] = _clean_infobox_cell(cell)
        if info_dict:
            break
    return info_dict

def take_disease_as_input_and_print_treatment(diseaseName):
    """Look up the infobox fields of ``diseaseName``.

    Despite the name, nothing is printed here: the call is forwarded to
    ``diseaseDetail`` and its dictionary is returned unchanged.
    """
    return diseaseDetail(diseaseName)

def print_treatment(info_dict):
    """Print the treatment-related entries of an infobox dictionary.

    Entries whose key is "Treatment", "Diagnostic method",
    "Differential diagnosis" or "Medication" are printed as "key: value",
    in the dictionary's own order.

    Returns:
        int: 1 if at least one such entry was printed, otherwise 0.
    """
    wanted_keys = ("Treatment", "Diagnostic method", "Differential diagnosis", "Medication")
    flag = 0
    for key, value in info_dict.items():
        if key in wanted_keys:
            flag = 1
            print(f"{key}: {value}")
    return flag

def scrape_disease_info(disease_name):
    """Scrape the lead paragraph of selected sections of a Wikipedia article.

    Fetches ``https://en.wikipedia.org/wiki/<disease_name>`` and, for the
    anchors "Treatment", "Diagnosis" and "Medications", returns the text of the
    first ``<p>`` that follows the anchor.

    Args:
        disease_name: Article title; spaces are converted to underscores.

    Returns:
        dict[str, str]: Keys "Treatment", "Diagnostic methods" and
        "Medications". A value is "Information not found." when the page could
        not be fetched, the anchor is missing, or no paragraph follows it.
    """
    # Local import so the cell does not depend on an edit to the imports cell.
    from urllib.parse import quote

    not_found = "Information not found."
    # Result key -> id of the section anchor on the page
    section_ids = {
        "Treatment": "Treatment",
        "Diagnostic methods": "Diagnosis",
        "Medications": "Medications",
    }
    info = dict.fromkeys(section_ids, not_found)

    title = quote(disease_name.strip().replace(' ', '_'))
    try:
        response = requests.get(f"https://en.wikipedia.org/wiki/{title}", timeout=10)
    except requests.RequestException:
        return info  # network failure: report every section as not found
    soup = BeautifulSoup(response.text, "html.parser")

    for label, anchor_id in section_ids.items():
        # Match the id on any tag rather than only <span>, so the lookup still
        # works if the id sits on the heading element itself.
        anchor = soup.find(id=anchor_id)
        # find_next returns None when no paragraph follows the anchor.
        paragraph = anchor.find_next("p") if anchor is not None else None
        if paragraph is not None:
            info[label] = paragraph.text
    return info

def chk_flag(returned_flag,diseases_input):
    """Fallback lookup used when the infobox printed nothing.

    When ``returned_flag`` equals 0, scrape the article of ``diseases_input``
    and print each section for which text was found. For any other flag value
    nothing is fetched or printed.
    """
    if not (returned_flag == 0):
        return
    disease_info = scrape_disease_info(diseases_input)
    for label in ("Treatment", "Diagnostic methods", "Medications"):
        text = disease_info[label]
        if text != "Information not found.":
            print(f"{label}:", text)





In [None]:
print(f"\nTop {k} diseases predicted based on symptoms")

# Score each of the top-k predicted diseases by how many of the user's confirmed
# symptoms it lists, with add-one smoothing, scaled by the mean
# cross-validation score of the classifier.
selected_symptoms = set(final_symp)
topk_dict = {}
for t in topk:
    row = df_norm.loc[df_norm['label_dis'] == diseases[t]].values.tolist()
    row[0].pop(0)  # drop the label column so positions line up with dataset_symptoms
    # A separate loop variable is used here; the inner loop used to reuse the
    # outer loop's `idx`.
    match_sym = {dataset_symptoms[col] for col, val in enumerate(row[0]) if val != 0}
    prob = (len(match_sym & selected_symptoms) + 1) / (len(selected_symptoms) + 1)
    prob *= mean(scores)
    topk_dict[t] = prob

# Show the diseases from highest to lowest score and remember which class index
# each displayed position refers to.
topk_index_mapping = {}
topk_sorted = dict(sorted(topk_dict.items(), key=lambda kv: kv[1], reverse=True))
for j, key in enumerate(topk_sorted):
    prob = topk_sorted[key]*100
    print(str(j) + " Disease name:",diseases[key], "\tProbability:",str(round(prob, 2))+"%")
    topk_index_mapping[j] = key

select = input("\nMore details about the disease? Enter index of disease or '-1' to discontinue and close the system:\n").strip()
if select!='-1':
    try:
        chosen = topk_index_mapping[int(select)]
    except (ValueError, KeyError):
        # Non-numeric or out-of-range input used to abort the cell with a traceback.
        print("Invalid selection:", select)
    else:
        dis = diseases[chosen]
        print()
        info_dict = take_disease_as_input_and_print_treatment(dis)
        returned_flag = print_treatment(info_dict)
        # Fall back to scraping the article body when the infobox had nothing to print
        chk_flag(returned_flag, dis)


Top 10 diseases predicted based on symptoms
0 Disease name: Food Poisoning 	Probability: 40.54%
1 Disease name: Crimean Congo haemorrhagic fever (CCHF) 	Probability: 40.54%
2 Disease name: Hepatitis A 	Probability: 40.54%
3 Disease name: Anthrax 	Probability: 40.54%
4 Disease name: Malaria 	Probability: 32.43%
5 Disease name: Bubonic plague 	Probability: 32.43%
6 Disease name: Scarlet fever 	Probability: 32.43%
7 Disease name: Gastroenteritis 	Probability: 32.43%
8 Disease name: Acute encephalitis syndrome 	Probability: 32.43%
9 Disease name: Ebola 	Probability: 32.43%

More details about the disease? Enter index of disease or '-1' to discontinue and close the system:
3

Diagnostic method:  Based on antibodies or toxin in the blood, microbial culture   
Treatment:  Antibiotics, antitoxin   
