<a href="https://colab.research.google.com/github/cosmos-dx/ALone/blob/main/Doctorg.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
################################################################################################################################################################
###########################################################################################################################################################################
###########################################################################################################################################################################
################################################################################################################################################################
################################################################################################################################################################
################################################################################################################################################################

In [1]:
pip install pandas scikit-learn fuzzywuzzy nltk textblob


Collecting fuzzywuzzy
  Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl.metadata (4.9 kB)
Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl (18 kB)
Installing collected packages: fuzzywuzzy
Successfully installed fuzzywuzzy-0.18.0


In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier


data = pd.read_csv('doctorg_data.csv')
data['symptom'] = data['symptom'].str.lower().str.replace(' ', '')
pivot_table = data.pivot_table(values='weight', index='name', columns='symptom',  fill_value=0)
X = pivot_table.values
y = pivot_table.index

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(pivot_table.index)
print(pivot_table.values)

Index(['Abdominal hernia', 'Abscess of the lung', 'Achalasia',
       'Actinic keratosis', 'Acute bronchospasm', 'Acute glaucoma',
       'Acute otitis media', 'Acute pancreatitis', 'Acute sinusitis',
       'Allergy',
       ...
       'Vasculitis', 'Viral warts', 'Vitamin B12 deficiency',
       'Vitreous hemorrhage', 'Vocal cord polyp', 'Volvulus',
       'Von Hippel-Lindau disease', 'Von Willebrand disease', 'Vulvodynia',
       'West Nile virus'],
      dtype='object', name='name', length=221)
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 5.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [3]:
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

In [4]:
def predict_diseases(symptoms, pivot_table, top_n=5):
    # Initialize a dictionary to store the scores for each disease
    normalized_symptoms = [symptom.lower().replace(' ', '') for symptom in symptoms]
    disease_scores = {disease: 0 for disease in pivot_table.index}
    symptom_counts = {disease: 0 for disease in pivot_table.index}

    # Calculate scores based on the number of matching symptoms and their weights
    for symptom in normalized_symptoms:
        if symptom in pivot_table.columns:
            for disease in pivot_table.index:
                weight = pivot_table.at[disease, symptom]
                if weight > 0:
                    symptom_counts[disease] += 1
                    disease_scores[disease] += weight

    # Sort diseases by number of matching symptoms first, then by weight
    sorted_diseases = sorted(disease_scores.items(), key=lambda item: (symptom_counts[item[0]], item[1]), reverse=True)

    # Return the top N diseases
    top_diseases = sorted_diseases[:top_n]
    return top_diseases

# Example usage:
symptoms = ['pus draining from ear']
#symptoms = ['Diminished hearing','Throat feels tight', 'Lump in throat' ,'Mass on ear','Sharp abdominal pain', 'Headache','Chest tightness' ,'Fever', 'Abnormal involuntary movements']
#symptoms = ['Diminished hearing', 'Fluid in ear', 'Redness in ear', 'Mass on ear', 'Plugged feeling in ear','Bleeding from ear','Skin growth', 'Pus draining from ear','Allergic reaction','Muscle swelling']
top_diseases = predict_diseases(symptoms, pivot_table, top_n=6)

print("Top Possible Diseases:")
for disease, score in top_diseases:
    print(f"Disease: {disease}, Score: {score}")


Top Possible Diseases:
Disease: Cholesteatoma, Score: 21.0
Disease: Retinopathy due to high blood pressure, Score: 8.0
Disease: Abdominal hernia, Score: 0
Disease: Abscess of the lung, Score: 0
Disease: Achalasia, Score: 0
Disease: Actinic keratosis, Score: 0


In [19]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from fuzzywuzzy import process
from textblob import TextBlob
import nltk
import re

class DoctorG:
    def __init__(self, data_path):
        # Load and preprocess data
        self.data = pd.read_csv(data_path)
        self.data['symptom'] = self.data['symptom'].str.lower().str.replace(' ', '')
        self.pivot_table = self.data.pivot_table(values='weight', index='name', columns='symptom', fill_value=0)
        nltk.download('punkt')

    def preprocess_text(self, text):
        # Preprocess text: lowercasing, removing special characters, and correcting spelling
        text = text.lower()
        text = re.sub(r'[^a-z\s]', '', text)
        corrected_text = str(TextBlob(text).correct())
        return corrected_text

    def extract_symptoms(self, text):
        # Extract symptoms from text using fuzzy matching
        symptom_list = self.pivot_table.columns.tolist()
        extracted_symptoms = []
        words = nltk.word_tokenize(text)
        for word in words:
            print(" ----- ", word)
            match, score = process.extractOne(word, symptom_list)
            if score > 80:  # Threshold for fuzzy matching
                extracted_symptoms.append(match)
        return extracted_symptoms

    def predict_diseases(self, symptoms, top_n=5):
        # Predict diseases based on symptoms
        disease_scores = {disease: 0 for disease in self.pivot_table.index}
        symptom_counts = {disease: 0 for disease in self.pivot_table.index}

        for symptom in symptoms:
            if symptom in self.pivot_table.columns:
                for disease in self.pivot_table.index:
                    weight = self.pivot_table.at[disease, symptom]
                    if weight > 0:
                        symptom_counts[disease] += 1
                        disease_scores[disease] += weight

        # Sort by number of matching symptoms first, then by weight
        sorted_diseases = sorted(disease_scores.items(), key=lambda item: (symptom_counts[item[0]], item[1]), reverse=True)
        return sorted_diseases[:top_n]

    def predict_diseases_from_text(self, user_input, top_n=5):
        # Process user input and predict diseases
        cleaned_input = self.preprocess_text(user_input)
        extracted_symptoms = self.extract_symptoms(cleaned_input)

        if extracted_symptoms:
            top_diseases = self.predict_diseases(extracted_symptoms, top_n)
            if top_diseases:
                top_disease_name = top_diseases[0][0]
                top_disease_description = self.data.loc[self.data['name'] == top_disease_name, 'description'].values[0]
                cleaned_description = re.sub(r'\s+', ' ', re.sub(r'[^a-zA-Z0-9\s.,]', '', top_disease_description)).strip()
                paragraph_description = ' '.join(cleaned_description.split())
                return top_diseases, paragraph_description
            else:
                return "No matching diseases found."
        else:
            return "No matching symptoms found in the dataset."

# Example usage:
data_path = 'doctorg_data.csv'
predictor = DoctorG(data_path)

user_input = "I am having MassonEar and FluidinEar and Fever and PusDraininginEar and Rednessinear"
# user_input = "I am having Dizziness and Fever and Anexity"
top_diseases, top_description = predictor.predict_diseases_from_text(user_input, top_n=6)

print("Top Possible Diseases:")
for disease, score in top_diseases:
    print(f"Disease: {disease}, Score: {score}")

print("\nDescription of the Top Disease:")
print(top_description)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


 -----  i
 -----  am
 -----  having
 -----  massonear
 -----  and
 -----  fluidinear
 -----  and
 -----  fever
 -----  and
 -----  pusdraininginear
 -----  and
 -----  rednessinear
Top Possible Diseases:
Disease: Amyloidosis, Score: 215.0
Disease: Wernicke Korsakoff syndrome, Score: 160.0
Disease: Extrapyramidal effect of drugs, Score: 149.0
Disease: Panic disorder, Score: 352.0
Disease: Anxiety, Score: 328.0
Disease: Social phobia, Score: 328.0

Description of the Top Disease:
Amyloidosis Also known as Amyloid Disease In medicine amyloidosis refers to a variety of conditions wherein normally soluble proteins become insoluble and are deposited in the extracellular space of various organs or tissues disrupting normal function. The insoluble fibrous protein aggregates that develop in amyloidosis are known as amyloids. They result from a change in the proteins secondary structure which causes the protein to take on a particular aggregated insoluble form similar to the betapleated sheet. S