# Automated Prescription From Patient Texts
## Rough Modelling

In [None]:
# https://github.com/dreji18/Bio-Epidemiology-NER

In [96]:
input = """

I've been struggling with my health for months now. I've been experiencing severe abdominal pain, nausea, and vomiting. It started with occasional stomach cramps, but now it's constant and debilitating. I've also noticed blood in my stool and I'm terrified. I've lost my appetite and I'm losing weight rapidly. I feel weak and exhausted all the time. I've tried to manage my symptoms with over-the-counter medications, but nothing seems to be working. I'm worried it might be something serious and I need your help to figure out what's going on.



"""

In [97]:
import requests

API_URL = "https://api-inference.huggingface.co/models/d4data/biomedical-ner-all"
headers = {"Authorization": "Bearer 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"}

def query(payload):
	response = requests.post(API_URL, headers=headers, json=payload)
	return response.json()
	
output = query({
	"inputs": f"{input}"
})

In [98]:
sign_symptoms = [entity['word'] for entity in output if entity['entity_group'] == 'Sign_symptom']

print(sign_symptoms)

['blood', 'lost', 'appetite', 'weight', 'weak', 'exhausted', 'symptoms']


In [99]:
output

[{'entity_group': 'Severity',
  'score': 0.998444139957428,
  'word': 'severe',
  'start': 77,
  'end': 83},
 {'entity_group': 'Biological_structure',
  'score': 0.9996088147163391,
  'word': 'abdominal',
  'start': 84,
  'end': 93},
 {'entity_group': 'Biological_structure',
  'score': 0.9962928891181946,
  'word': 'stomach',
  'start': 149,
  'end': 156},
 {'entity_group': 'Sign_symptom',
  'score': 0.9990798234939575,
  'word': 'blood',
  'start': 223,
  'end': 228},
 {'entity_group': 'Biological_structure',
  'score': 0.9995960593223572,
  'word': 'stool',
  'start': 235,
  'end': 240},
 {'entity_group': 'Sign_symptom',
  'score': 0.9902464151382446,
  'word': 'lost',
  'start': 265,
  'end': 269},
 {'entity_group': 'Sign_symptom',
  'score': 0.9996623992919922,
  'word': 'appetite',
  'start': 273,
  'end': 281},
 {'entity_group': 'Sign_symptom',
  'score': 0.9993577599525452,
  'word': 'weight',
  'start': 297,
  'end': 303},
 {'entity_group': 'Sign_symptom',
  'score': 0.99970930

In [101]:
for entity in output:
    print (entity)

{'entity_group': 'Severity', 'score': 0.998444139957428, 'word': 'severe', 'start': 77, 'end': 83}
{'entity_group': 'Biological_structure', 'score': 0.9996088147163391, 'word': 'abdominal', 'start': 84, 'end': 93}
{'entity_group': 'Biological_structure', 'score': 0.9962928891181946, 'word': 'stomach', 'start': 149, 'end': 156}
{'entity_group': 'Sign_symptom', 'score': 0.9990798234939575, 'word': 'blood', 'start': 223, 'end': 228}
{'entity_group': 'Biological_structure', 'score': 0.9995960593223572, 'word': 'stool', 'start': 235, 'end': 240}
{'entity_group': 'Sign_symptom', 'score': 0.9902464151382446, 'word': 'lost', 'start': 265, 'end': 269}
{'entity_group': 'Sign_symptom', 'score': 0.9996623992919922, 'word': 'appetite', 'start': 273, 'end': 281}
{'entity_group': 'Sign_symptom', 'score': 0.9993577599525452, 'word': 'weight', 'start': 297, 'end': 303}
{'entity_group': 'Sign_symptom', 'score': 0.9997093081474304, 'word': 'weak', 'start': 320, 'end': 324}
{'entity_group': 'Sign_symptom'

## NER Customized approach

In the above method, we might not get the named entitied as we require for our disease prediction model. So below are some custom skeleton for mapping the texts to our dataset

In [1]:
import re

# Define keywords for each symptom
keywords = {
    "chest_pain": ["chest", "pain"],
    "knee_pain": ["knee", "pain"],
    "high_fever": ["high", "fever"],
    "itching": ["itching", "itch"],
    "rash": ["rash"]
}


In [2]:
import pandas as pd

# Example labeled dataset for NER
data = [
    ("pain in my left chest", {"entities": [(9, 14, "CHEST_PAIN")]}),
    ("having chest pain", {"entities": [(7, 18, "CHEST_PAIN")]}),
    ("sometimes I experience pain in my chests", {"entities": [(25, 30, "CHEST_PAIN")]}),
    ("knee pain while walking", {"entities": [(0, 9, "KNEE_PAIN")]}),
    ("high fever for two days", {"entities": [(0, 10, "HIGH_FEVER")]}),
    ("itching all over my body", {"entities": [(0, 7, "ITCHING")]}),
    ("developed a rash on my arm", {"entities": [(12, 16, "RASH")]}),
]

# Convert to DataFrame
df = pd.DataFrame(data, columns=["text", "annotations"])
print(df)


                                       text  \
0                     pain in my left chest   
1                         having chest pain   
2  sometimes I experience pain in my chests   
3                   knee pain while walking   
4                   high fever for two days   
5                  itching all over my body   
6                developed a rash on my arm   

                              annotations  
0   {'entities': [(9, 14, 'CHEST_PAIN')]}  
1   {'entities': [(7, 18, 'CHEST_PAIN')]}  
2  {'entities': [(25, 30, 'CHEST_PAIN')]}  
3     {'entities': [(0, 9, 'KNEE_PAIN')]}  
4   {'entities': [(0, 10, 'HIGH_FEVER')]}  
5       {'entities': [(0, 7, 'ITCHING')]}  
6        {'entities': [(12, 16, 'RASH')]}  


In [4]:
import spacy
from spacy.training import Example
from spacy.util import minibatch, compounding

# Load a blank spaCy model
nlp = spacy.blank("en")

# Create the NER component and add it to the pipeline
if "ner" not in nlp.pipe_names:
    ner = nlp.add_pipe("ner")
else:
    ner = nlp.get_pipe("ner")

# Add labels to the NER component
for _, annotations in data:
    for ent in annotations.get("entities"):
        ner.add_label(ent[2])

# Convert the data to spaCy's format
train_data = []
for text, annotations in data:
    train_data.append(Example.from_dict(nlp.make_doc(text), annotations))

# Train the NER model
optimizer = nlp.begin_training()
for itn in range(20):
    losses = {}
    batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
    for batch in batches:
        nlp.update(batch, drop=0.5, losses=losses)
    print(f"Losses at iteration {itn}: {losses}")

# Save the model
nlp.to_disk("disease_ner_model")




Losses at iteration 0: {'ner': 27.84070575237274}
Losses at iteration 1: {'ner': 26.925025284290314}
Losses at iteration 2: {'ner': 25.55672335624695}
Losses at iteration 3: {'ner': 23.754194617271423}
Losses at iteration 4: {'ner': 20.782811164855957}
Losses at iteration 5: {'ner': 18.082658380270004}
Losses at iteration 6: {'ner': 14.469612628221512}
Losses at iteration 7: {'ner': 9.526612993329763}
Losses at iteration 8: {'ner': 7.625063873827457}
Losses at iteration 9: {'ner': 7.370976723264903}
Losses at iteration 10: {'ner': 7.7801862446794985}
Losses at iteration 11: {'ner': 6.982795780742094}
Losses at iteration 12: {'ner': 6.638748339872109}
Losses at iteration 13: {'ner': 7.187085566430142}
Losses at iteration 14: {'ner': 6.486299766957018}
Losses at iteration 15: {'ner': 6.612790195300477}
Losses at iteration 16: {'ner': 6.032506063231267}
Losses at iteration 17: {'ner': 6.332524562655635}
Losses at iteration 18: {'ner': 6.268404178639685}
Losses at iteration 19: {'ner': 5.9

In [5]:
# Load the trained model
nlp = spacy.load("disease_ner_model")

# Function to map entities to prediction model columns
def map_entities_to_columns(text):
    doc = nlp(text)
    mapping = {"chest_pain": 0, "knee_pain": 0, "high_fever": 0, "itching": 0, "rash": 0}
    for ent in doc.ents:
        if ent.label_ == "CHEST_PAIN":
            mapping["chest_pain"] = 1
        elif ent.label_ == "KNEE_PAIN":
            mapping["knee_pain"] = 1
        elif ent.label_ == "HIGH_FEVER":
            mapping["high_fever"] = 1
        elif ent.label_ == "ITCHING":
            mapping["itching"] = 1
        elif ent.label_ == "RASH":
            mapping["rash"] = 1
    return mapping


In [6]:
# Keyword-Based Mapping as a Fallback if NER model fails
def map_keywords_to_columns(text):
    mapping = {"chest_pain": 0, "knee_pain": 0, "high_fever": 0, "itching": 0, "rash": 0}
    for symptom, keys in keywords.items():
        pattern = re.compile(r'\b(' + '|'.join(keys) + r')\b', re.IGNORECASE)
        matches = pattern.findall(text)
        if matches:
            mapping[symptom] = 1
    return mapping


In [7]:
# Combine NER and Keyword Mapping
def map_symptoms(text):
    mapping = map_entities_to_columns(text)
    keyword_mapping = map_keywords_to_columns(text)
    for symptom in mapping.keys():
        if mapping[symptom] == 0 and keyword_mapping[symptom] == 1:
            mapping[symptom] = 1
    return mapping

# Example usage
text = "I have a severe chest pain and high fever."
mapped_columns = map_symptoms(text)
print(mapped_columns)


{'chest_pain': 1, 'knee_pain': 1, 'high_fever': 1, 'itching': 0, 'rash': 0}


In [8]:
# Final Integration with Disease Prediction Model
# Sample texts
texts = [
    "pain in my left chest",
    "having chest pain",
    "sometimes I experience pain in my chests",
    "knee pain while walking",
    "high fever for two days",
    "itching all over my body",
    "developed a rash on my arm"
]

# Map each text to columns
mapped_data = [map_symptoms(text) for text in texts]

# Convert to DataFrame
df_mapped = pd.DataFrame(mapped_data)
print(df_mapped)


   chest_pain  knee_pain  high_fever  itching  rash
0           1          1           0        0     0
1           1          1           0        0     0
2           1          1           0        0     0
3           1          1           0        0     0
4           0          0           1        0     0
5           0          0           0        1     0
6           0          0           0        0     1


### Debugging

In [9]:
# Test the NER model independently
def test_ner_model(text):
    doc = nlp(text)
    entities = [(ent.text, ent.label_) for ent in doc.ents]
    return entities

# Example usage
test_text = "I have a severe chest pain and high fever."
entities = test_ner_model(test_text)
print(entities)


[]


In [10]:
# Test the keyword-based mapping function independently
def test_keyword_mapping(text):
    return map_keywords_to_columns(text)

# Example usage
test_text = "I have a severe chest pain and high fever."
keyword_mapping = test_keyword_mapping(test_text)
print(keyword_mapping)


{'chest_pain': 1, 'knee_pain': 1, 'high_fever': 1, 'itching': 0, 'rash': 0}


In [11]:
# Combine Both Methods Correctly
def map_symptoms(text):
    # First, use NER to map entities
    mapping = map_entities_to_columns(text)
    
    # Then, use keyword-based mapping as a fallback
    keyword_mapping = map_keywords_to_columns(text)
    
    # Update the mapping only if the NER did not find the symptom
    for symptom in mapping.keys():
        if mapping[symptom] == 0 and keyword_mapping[symptom] == 1:
            mapping[symptom] = 1
    return mapping

# Example usage
test_text = "I have a severe chest pain and high fever."
mapped_columns = map_symptoms(test_text)
print(mapped_columns)


{'chest_pain': 1, 'knee_pain': 1, 'high_fever': 1, 'itching': 0, 'rash': 0}


In [12]:
# Check NER model output
entities = test_ner_model(test_text)
print(f"NER model entities: {entities}")

# Check keyword-based mapping output
keyword_mapping = test_keyword_mapping(test_text)
print(f"Keyword-based mapping: {keyword_mapping}")

# Check combined mapping
mapped_columns = map_symptoms(test_text)
print(f"Combined mapping: {mapped_columns}")


NER model entities: []
Keyword-based mapping: {'chest_pain': 1, 'knee_pain': 1, 'high_fever': 1, 'itching': 0, 'rash': 0}
Combined mapping: {'chest_pain': 1, 'knee_pain': 1, 'high_fever': 1, 'itching': 0, 'rash': 0}


### Re modelling

In [19]:
import re

# Define refined keywords for each symptom
keywords = {
    "chest_pain": ["chest pain", "pain in chest", "chest"],
    "knee_pain": ["knee pain", "pain in knee", "knee", "knees"],
    "high_fever": ["high fever", "fever high", "high temperature", "temperature high"],
    "itching": ["itching", "itch"],
    "rash": ["rash"]
}

def map_keywords_to_columns(text):
    mapping = {"chest_pain": 0, "knee_pain": 0, "high_fever": 0, "itching": 0, "rash": 0}
    
    for symptom, keys in keywords.items():
        for key in keys:
            pattern = re.compile(r'\b' + re.escape(key) + r'\b', re.IGNORECASE)
            if pattern.search(text):
                mapping[symptom] = 1
                break
    
    return mapping

# Test refined keyword-based mapping function
test_text = "I have a severe chest pain and high fever."
keyword_mapping = map_keywords_to_columns(test_text)
print(f"Refined keyword-based mapping: {keyword_mapping}")


Refined keyword-based mapping: {'chest_pain': 1, 'knee_pain': 0, 'high_fever': 1, 'itching': 0, 'rash': 0}


In [20]:
# Combining NER and Refined Keyword Mapping
# With the refined keyword-based mapping function, we can now combine it with the NER results:

def map_symptoms(text):
    # First, use NER to map entities
    mapping = map_entities_to_columns(text)
    
    # Then, use refined keyword-based mapping as a fallback
    keyword_mapping = map_keywords_to_columns(text)
    
    # Update the mapping only if the NER did not find the symptom
    for symptom in mapping.keys():
        if mapping[symptom] == 0 and keyword_mapping[symptom] == 1:
            mapping[symptom] = 1
    return mapping

# Example usage with combined mapping
test_text = "I have a severe chest pain and high fever."
mapped_columns = map_symptoms(test_text)
print(f"Combined mapping: {mapped_columns}")


Combined mapping: {'chest_pain': 1, 'knee_pain': 0, 'high_fever': 1, 'itching': 0, 'rash': 0}


In [21]:
# Check NER model output
entities = test_ner_model(test_text)
print(f"NER model entities: {entities}")

# Check refined keyword-based mapping output
keyword_mapping = map_keywords_to_columns(test_text)
print(f"Refined keyword-based mapping: {keyword_mapping}")

# Check combined mapping
mapped_columns = map_symptoms(test_text)
print(f"Combined mapping: {mapped_columns}")

# Additional test cases
test_cases = [
    "I have a severe chest pain and high fever.",
    "My knee hurts a lot.",
    "I am experiencing high fever and rash.",
    "There is itching on my skin.",
    "I have a rash and knee pain.",
    "Sometimes I experience pain in my chest and knees."
]

for text in test_cases:
    print(f"\nText: {text}")
    entities = test_ner_model(text)
    print(f"NER model entities: {entities}")
    keyword_mapping = map_keywords_to_columns(text)
    print(f"Refined keyword-based mapping: {keyword_mapping}")
    combined_mapping = map_symptoms(text)
    print(f"Combined mapping: {combined_mapping}")


NER model entities: []
Refined keyword-based mapping: {'chest_pain': 1, 'knee_pain': 0, 'high_fever': 1, 'itching': 0, 'rash': 0}
Combined mapping: {'chest_pain': 1, 'knee_pain': 0, 'high_fever': 1, 'itching': 0, 'rash': 0}

Text: I have a severe chest pain and high fever.
NER model entities: []
Refined keyword-based mapping: {'chest_pain': 1, 'knee_pain': 0, 'high_fever': 1, 'itching': 0, 'rash': 0}
Combined mapping: {'chest_pain': 1, 'knee_pain': 0, 'high_fever': 1, 'itching': 0, 'rash': 0}

Text: My knee hurts a lot.
NER model entities: []
Refined keyword-based mapping: {'chest_pain': 0, 'knee_pain': 1, 'high_fever': 0, 'itching': 0, 'rash': 0}
Combined mapping: {'chest_pain': 0, 'knee_pain': 1, 'high_fever': 0, 'itching': 0, 'rash': 0}

Text: I am experiencing high fever and rash.
NER model entities: []
Refined keyword-based mapping: {'chest_pain': 0, 'knee_pain': 0, 'high_fever': 1, 'itching': 0, 'rash': 1}
Combined mapping: {'chest_pain': 0, 'knee_pain': 0, 'high_fever': 1, 'itch

## We will get to the actual work now

In [27]:
# reading disease prediction dataset 
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

dis_df = pd.read_csv("input/disease_prediction_Training.csv")
dis_df.head()

Unnamed: 0,itching,skin_rash,nodal_skin_eruptions,continuous_sneezing,shivering,chills,joint_pain,stomach_pain,acidity,ulcers_on_tongue,muscle_wasting,vomiting,burning_micturition,spotting_ urination,fatigue,weight_gain,anxiety,cold_hands_and_feets,mood_swings,weight_loss,restlessness,lethargy,patches_in_throat,irregular_sugar_level,cough,high_fever,sunken_eyes,breathlessness,sweating,dehydration,indigestion,headache,yellowish_skin,dark_urine,nausea,loss_of_appetite,pain_behind_the_eyes,back_pain,constipation,abdominal_pain,diarrhoea,mild_fever,yellow_urine,yellowing_of_eyes,acute_liver_failure,fluid_overload,swelling_of_stomach,swelled_lymph_nodes,malaise,blurred_and_distorted_vision,phlegm,throat_irritation,redness_of_eyes,sinus_pressure,runny_nose,congestion,chest_pain,weakness_in_limbs,fast_heart_rate,pain_during_bowel_movements,pain_in_anal_region,bloody_stool,irritation_in_anus,neck_pain,dizziness,cramps,bruising,obesity,swollen_legs,swollen_blood_vessels,puffy_face_and_eyes,enlarged_thyroid,brittle_nails,swollen_extremeties,excessive_hunger,extra_marital_contacts,drying_and_tingling_lips,slurred_speech,knee_pain,hip_joint_pain,muscle_weakness,stiff_neck,swelling_joints,movement_stiffness,spinning_movements,loss_of_balance,unsteadiness,weakness_of_one_body_side,loss_of_smell,bladder_discomfort,foul_smell_of urine,continuous_feel_of_urine,passage_of_gases,internal_itching,toxic_look_(typhos),depression,irritability,muscle_pain,altered_sensorium,red_spots_over_body,belly_pain,abnormal_menstruation,dischromic _patches,watering_from_eyes,increased_appetite,polyuria,family_history,mucoid_sputum,rusty_sputum,lack_of_concentration,visual_disturbances,receiving_blood_transfusion,receiving_unsterile_injections,coma,stomach_bleeding,distention_of_abdomen,history_of_alcohol_consumption,fluid_overload.1,blood_in_sputum,prominent_veins_on_calf,palpitations,painful_walking,pus_filled_pimples,blackheads,scurring,skin_peeling,silver_like_dusting,small_dents_in_nails,inflammatory_nails,blister,red_sore_around_nose,yellow_crust_ooze,prognosis,Unnamed: 133
0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,Fungal infection,
1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,Fungal infection,
2,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,Fungal infection,
3,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,Fungal infection,
4,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,Fungal infection,


In [34]:
dis_df.columns.tolist()

['itching',
 'skin_rash',
 'nodal_skin_eruptions',
 'continuous_sneezing',
 'shivering',
 'chills',
 'joint_pain',
 'stomach_pain',
 'acidity',
 'ulcers_on_tongue',
 'muscle_wasting',
 'vomiting',
 'burning_micturition',
 'spotting_ urination',
 'fatigue',
 'weight_gain',
 'anxiety',
 'cold_hands_and_feets',
 'mood_swings',
 'weight_loss',
 'restlessness',
 'lethargy',
 'patches_in_throat',
 'irregular_sugar_level',
 'cough',
 'high_fever',
 'sunken_eyes',
 'breathlessness',
 'sweating',
 'dehydration',
 'indigestion',
 'headache',
 'yellowish_skin',
 'dark_urine',
 'nausea',
 'loss_of_appetite',
 'pain_behind_the_eyes',
 'back_pain',
 'constipation',
 'abdominal_pain',
 'diarrhoea',
 'mild_fever',
 'yellow_urine',
 'yellowing_of_eyes',
 'acute_liver_failure',
 'fluid_overload',
 'swelling_of_stomach',
 'swelled_lymph_nodes',
 'malaise',
 'blurred_and_distorted_vision',
 'phlegm',
 'throat_irritation',
 'redness_of_eyes',
 'sinus_pressure',
 'runny_nose',
 'congestion',
 'chest_pain',


## Generating Keywords and Data for NER

In [50]:
keywords_new = {
    "itching": ["itching", "itch"],
    "skin_rash": ["skin rash", "rash"],
    "nodal_skin_eruptions": ["nodal skin eruptions"],
    "continuous_sneezing": ["continuous sneezing"],
    "shivering": ["shivering", "chills"],
    "chills": ["chills", "fever"],
    "joint_pain": ["joint pain", "arthritis"],
    "stomach_pain": ["stomach pain", "abdominal pain"],
    "acidity": ["acidity", "heartburn"],
    "ulcers_on_tongue": ["ulcers on tongue", "mouth sores"],
    "muscle_wasting": ["muscle wasting", "muscle weakness"],
    "vomiting": ["vomiting", "nausea"],
    "burning_micturition": ["burning micturition", "painful urination"],
    "spotting_urination": ["spotting urination", "frequent urination"],
    "fatigue": ["fatigue", "tiredness"],
    "weight_gain": ["weight gain", "obesity"],
    "anxiety": ["anxiety", "stress"],
    "cold_hands_and_feets": ["cold hands and feet", "poor circulation"],
    "mood_swings": ["mood swings", "irritability"],
    "weight_loss": ["weight loss", "slimming"],
    "restlessness": ["restlessness", "insomnia"],
    "lethargy": ["lethargy", "apathy"],
    "patches_in_throat": ["patches in throat", "sore throat"],
    "irregular_sugar_level": ["irregular sugar level", "diabetes"],
    "cough": ["cough", "respiratory issues"],
    "high_fever": ["high fever", "temperature elevation"],
    "sunken_eyes": ["sunken eyes", "fatigue"],
    "breathlessness": ["breathlessness", "shortness of breath"],
    "sweating": ["sweating", "perspiration"],
    "dehydration": ["dehydration", "water loss"],
    "indigestion": ["indigestion", "heartburn"],
    "headache": ["headache", "migraine"],
    "yellowish_skin": ["yellowish skin", "jaundice"],
    "dark_urine": ["dark urine", "kidney issues"],
    "nausea": ["nausea", "vomiting"],
    "loss_of_appetite": ["loss of appetite", "decreased hunger"],
    "pain_behind_the_eyes": ["pain behind the eyes", "eye strain"],
    "back_pain": ["back pain", "muscle strain"],
    "constipation": ["constipation", "bowel issues"],
    "abdominal_pain": ["abdominal pain", "stomach cramps"],
    "diarrhoea": ["diarrhoea", "frequent bowel movements"],
    "mild_fever": ["mild fever", "low-grade temperature"],
    "yellow_urine": ["yellow urine", "urine color change"],
    "yellowing_of_eyes": ["yellowing of eyes", "jaundice"],
    "acute_liver_failure": ["acute liver failure", "liver dysfunction"],
    "fluid_overload": ["fluid overload", "edema"],
    "swelling_of_stomach": ["swelling of stomach", "abdominal distension"],
    "swelled_lymph_nodes": ["swelled lymph nodes", "enlarged lymph nodes"],
    "malaise": ["malaise", "general discomfort"],
    "blurred_and_distorted_vision": ["blurred and distorted vision", "eye problems"],
    "phlegm": ["phlegm", "mucus"],
    "throat_irritation": ["throat irritation", "sore throat"],
    "redness_of_eyes": ["redness of eyes", "eye irritation"],
    "sinus_pressure": ["sinus pressure", "congestion"],
    "runny_nose": ["runny nose", "rhinorrhea"],
    "congestion": ["congestion", "stuffy nose"],
    "chest_pain": ["chest pain", "cardiac issues"],
    "weakness_in_limbs": ["weakness in limbs", "muscle weakness"],
    "fast_heart_rate": ["fast heart rate", "tachycardia"],
    "pain_during_bowel_movements": ["pain during bowel movements", "rectal pain"],
    "pain_in_anal_region": ["pain in anal region", "anal pain"],
    "bloody_stool": ["bloody stool", "hematochezia"],
    "irritation_in_anus": ["irritation in anus", "anal itching"],
    "neck_pain": ["neck pain", "cervical pain"],
    "dizziness": ["dizziness", "lightheadedness"],
    "cramps": ["cramps", "muscle spasms"],
    "bruising": ["bruising", "ecchymosis"],
    "obesity": ["obesity", "overweight"],
    "swollen_legs": ["swollen legs", "edema"],
    "swollen_blood_vessels": ["swollen blood vessels", "vasculitis"],
    "puffy_face_and_eyes": ["puffy face and eyes", "edema"],
    "enlarged_thyroid": ["enlarged thyroid", "goiter"],
    "brittle_nails": ["brittle nails", "nail fragility"],
    "swollen_extremeties": ["swollen extremeties", "edema"],
    "excessive_hunger": ["excessive hunger", "polyphagia"],
    "extra_marital_contacts": ["extra marital contacts", "infidelity"],
    "drying_and_tingling_lips": ["drying and tingling lips", "cheilitis"],
    "slurred_speech": ["slurred speech", "dysarthria"],
    "knee_pain": ["knee pain", "knee joint pain"],
    "hip_joint_pain": ["hip joint pain", "hip arthritis"],
    "muscle_weakness": ["muscle weakness", "muscle atrophy"],
    "stiff_neck": ["stiff neck", "cervical stiffness"],
    "swelling_joints": ["swelling joints", "joint swelling"],
    "movement_stiffness": ["movement stiffness", "muscle rigidity"],
    "spinning_movements": ["spinning movements", "vertigo"],
    "loss_of_balance": ["loss of balance", "ataxia"],
    "unsteadiness": ["unsteadiness", "lightheadedness"],
    "weakness_of_one_body_side": ["weakness of one body side", "hemiparesis"],
    "loss_of_smell": ["loss of smell", "anosmia"],
    "bladder_discomfort": ["bladder discomfort", "urinary issues"],
    "foul_smell_of_urine": ["foul smell of urine", "urinary tract infection"],
    "continuous_feel_of_urine": ["continuous feel of urine", "urinary frequency"],
    "passage_of_gases": ["passage of gases", "flatulence"],
    "internal_itching": ["internal itching", "pruritus"],
    "toxic_look_typhos": ["toxic look typhos", "typhoid fever"],
    "depression": ["depression", "mental health disorders"],
    "irritability": ["irritability", "mood swings"],
    "muscle_pain": ["muscle pain", "myalgia"],
    "altered_sensorium": ["altered sensorium", "confusion"],
    "red_spots_over_body": ["red spots over body", "petechiae"],
    "belly_pain": ["belly pain", "abdominal pain"],
    "abnormal_menstruation": ["abnormal menstruation", "menstrual irregularities"],
    "dischromic_patches": ["dischromic patches", "skin discoloration"],
    "watering_from_eyes": ["watering from eyes", "lacrimation"],
    "increased_appetite": ["increased appetite", "polyphagia"],
    "polyuria": ["polyuria", "frequent urination"],
    "family_history": ["family history", "genetic disorders"],
    "mucoid_sputum": ["mucoid sputum", "respiratory issues"],
    "rusty_sputum": ["rusty sputum", "hemoptysis"],
    "lack_of_concentration": ["lack of concentration", "attention deficit"],
    "visual_disturbances": ["visual disturbances", "blurred vision"],
    "receiving_blood_transfusion": ["receiving blood transfusion", "blood transfusion"],
    "receiving_unsterile_injections": ["receiving unsterile injections", "unsterile injections"],
    "coma": ["coma", "unconsciousness"],
    "stomach_bleeding": ["stomach bleeding", "gastrointestinal bleeding"],
    "distention_of_abdomen": ["distention of abdomen", "abdominal swelling"],
    "history_of_alcohol_consumption": ["history of alcohol consumption", "alcoholism"],
    "fluid_overload": ["fluid overload", "edema"],
    "blood_in_sputum": ["blood in sputum", "hemoptysis"],
    "prominent_veins_on_calf": ["prominent veins on calf", "varicose veins"],
    "palpitations": ["palpitations", "heart palpitations"],
    "painful_walking": ["painful walking", "foot pain"],
    "pus_filled_pimples": ["pus filled pimples", "acne"],
    "blackheads": ["blackheads", "comedones"],
    "scurring": ["scurring", "scarring"],
    "skin_peeling": ["skin peeling", "exfoliation"],
    "silver_like_dusting": ["silver like dusting", "skin discoloration"],
    "small_dents_in_nails": ["small dents in nails", "nail abnormalities"],
    "inflammatory_nails": ["inflammatory nails", "nail inflammation"],
    "blister": ["blister", "fluid-filled bumps"],
    "red_sore_around_nose": ["red sore around nose", "nasal irritation"],
    "yellow_crust_ooze": ["yellow crust ooze", "pus-filled crust"]
}

In [51]:
import random

# Function to generate training data based on keywords
def generate_training_data(keywords, num_samples=1000):
    training_data = []
    for _ in range(num_samples):
        # Randomly select a symptom and generate a sentence containing it
        symptom = random.choice(list(keywords.keys()))
        keyword = random.choice(keywords[symptom])
        sentence = f"I have {keyword}."
        start = sentence.find(keyword)
        end = start + len(keyword)
        # Annotate the sentence with the symptom entity
        annotations = {"entities": [(start, end, symptom.upper())]}
        training_data.append((sentence, annotations))
    return training_data

# Generate training data
training_data = generate_training_data(keywords_new, num_samples=1000)

# Print sample training data
for text, annotations in training_data[:5]:
    print("Text:", text)
    print("Annotations:", annotations)
    print()


Text: I have migraine.
Annotations: {'entities': [(7, 15, 'HEADACHE')]}

Text: I have vasculitis.
Annotations: {'entities': [(7, 17, 'SWOLLEN_BLOOD_VESSELS')]}

Text: I have fast heart rate.
Annotations: {'entities': [(7, 22, 'FAST_HEART_RATE')]}

Text: I have mild fever.
Annotations: {'entities': [(7, 17, 'MILD_FEVER')]}

Text: I have exfoliation.
Annotations: {'entities': [(7, 18, 'SKIN_PEELING')]}



In [52]:
training_data

[('I have migraine.', {'entities': [(7, 15, 'HEADACHE')]}),
 ('I have vasculitis.', {'entities': [(7, 17, 'SWOLLEN_BLOOD_VESSELS')]}),
 ('I have fast heart rate.', {'entities': [(7, 22, 'FAST_HEART_RATE')]}),
 ('I have mild fever.', {'entities': [(7, 17, 'MILD_FEVER')]}),
 ('I have exfoliation.', {'entities': [(7, 18, 'SKIN_PEELING')]}),
 ('I have polyuria.', {'entities': [(7, 15, 'POLYURIA')]}),
 ('I have altered sensorium.', {'entities': [(7, 24, 'ALTERED_SENSORIUM')]}),
 ('I have alcoholism.',
  {'entities': [(7, 17, 'HISTORY_OF_ALCOHOL_CONSUMPTION')]}),
 ('I have rhinorrhea.', {'entities': [(7, 17, 'RUNNY_NOSE')]}),
 ('I have migraine.', {'entities': [(7, 15, 'HEADACHE')]}),
 ('I have abdominal swelling.',
  {'entities': [(7, 25, 'DISTENTION_OF_ABDOMEN')]}),
 ('I have blurred and distorted vision.',
  {'entities': [(7, 35, 'BLURRED_AND_DISTORTED_VISION')]}),
 ('I have urinary frequency.',
  {'entities': [(7, 24, 'CONTINUOUS_FEEL_OF_URINE')]}),
 ('I have continuous sneezing.',
  {'e

In [53]:
ner_data = pd.DataFrame(training_data, columns=["text", "annotations"])
ner_data.head()

Unnamed: 0,text,annotations
0,I have migraine.,"{'entities': [(7, 15, 'HEADACHE')]}"
1,I have vasculitis.,"{'entities': [(7, 17, 'SWOLLEN_BLOOD_VESSELS')]}"
2,I have fast heart rate.,"{'entities': [(7, 22, 'FAST_HEART_RATE')]}"
3,I have mild fever.,"{'entities': [(7, 17, 'MILD_FEVER')]}"
4,I have exfoliation.,"{'entities': [(7, 18, 'SKIN_PEELING')]}"


## Modelling

In [65]:
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     --------------------------------------- 0.0/12.8 MB 186.2 kB/s eta 0:01:09
     --------------------------------------- 0.1/12.8 MB 326.1 kB/s eta 0:00:40
      -------------------------------------- 0.2/12.8 MB 935.2 kB/s eta 0:00:14
      --------------------------------------- 0.3/12.8 MB 1.1 MB/s eta 0:00:12
     - -------------------------------------- 0.5/12.8 MB 1.5 MB/s eta 0:00:09
     - -------------------------------------- 0.5/12.8 MB 1.6 MB/s eta 0:00:08
     -- ------------------------------------- 0.7/12.8 MB 1.7 MB/s eta 0:00:08
     -- ------------------------------------- 0.9/12

In [77]:
import spacy
from spacy.training import Example
from spacy.util import minibatch, compounding

# Load a blank spaCy model
nlp = spacy.blank("en")

# Create the NER component and add it to the pipeline
if "ner" not in nlp.pipe_names:
    ner = nlp.add_pipe("ner")
else:
    ner = nlp.get_pipe("ner")

# Add labels to the NER component
for _, annotations in training_data:
    for ent in annotations.get("entities"):
        ner.add_label(ent[2])

# Convert the data to spaCy's format
train_data = []
for text, annotations in training_data:
    train_data.append(Example.from_dict(nlp.make_doc(text), annotations))

# Train the NER model
optimizer = nlp.begin_training()
for itn in range(200):
    losses = {}
    batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
    for batch in batches:
        nlp.update(batch, drop=0.5, losses=losses)
    print(f"Losses at iteration {itn}: {losses}")

# Save the model
nlp.to_disk("disease_ner_model")

Losses at iteration 0: {'ner': 1680.028751162081}
Losses at iteration 1: {'ner': 1537.8425705473064}
Losses at iteration 2: {'ner': 1561.946504510356}
Losses at iteration 3: {'ner': 1588.4624026442548}
Losses at iteration 4: {'ner': 1611.0841504734426}
Losses at iteration 5: {'ner': 1553.2473968774257}
Losses at iteration 6: {'ner': 1488.3559774416854}
Losses at iteration 7: {'ner': 1450.2043879108776}
Losses at iteration 8: {'ner': 1382.3742767937065}
Losses at iteration 9: {'ner': 1301.1950843167556}
Losses at iteration 10: {'ner': 1183.4665436721432}
Losses at iteration 11: {'ner': 1151.869656275441}
Losses at iteration 12: {'ner': 1098.0600753770248}
Losses at iteration 13: {'ner': 964.9554516044798}
Losses at iteration 14: {'ner': 935.8281296842915}
Losses at iteration 15: {'ner': 845.169679939372}
Losses at iteration 16: {'ner': 826.2028294208667}
Losses at iteration 17: {'ner': 773.0046765840709}
Losses at iteration 18: {'ner': 737.9445675055522}
Losses at iteration 19: {'ner': 

Losses at iteration 159: {'ner': 260.4055683018764}
Losses at iteration 160: {'ner': 204.12452266080018}
Losses at iteration 161: {'ner': 209.48230310089238}
Losses at iteration 162: {'ner': 219.58519412979115}
Losses at iteration 163: {'ner': 193.56260458078492}
Losses at iteration 164: {'ner': 230.92446891036838}
Losses at iteration 165: {'ner': 190.23742782688282}
Losses at iteration 166: {'ner': 229.02092353194942}
Losses at iteration 167: {'ner': 228.69816765655023}
Losses at iteration 168: {'ner': 166.52580505716628}
Losses at iteration 169: {'ner': 185.69881253501435}
Losses at iteration 170: {'ner': 232.96030660577608}
Losses at iteration 171: {'ner': 180.10836850788277}
Losses at iteration 172: {'ner': 220.0308616764403}
Losses at iteration 173: {'ner': 214.18755562565997}
Losses at iteration 174: {'ner': 194.11619633852462}
Losses at iteration 175: {'ner': 203.43876422628244}
Losses at iteration 176: {'ner': 224.00990637289647}
Losses at iteration 177: {'ner': 236.38546288044

In [78]:
import spacy

# Load the trained model
nlp = spacy.load("disease_ner_model")

# Function to map entities to prediction model columns
def map_entities_to_columns(text, symptom_keywords):
    # Initialize mapping with zeros for all symptoms
    mapping = {symptom: 0 for symptom in symptom_keywords.keys()}
    
    # Process text with the NER model
    doc = nlp(text)
    
    # Update mapping based on detected entities
    for ent in doc.ents:
        symptom = ent.label_.lower()
        if symptom in mapping:
            mapping[symptom] = 1
            
    return mapping


# # Function to map entities to prediction model columns
# def map_entities_to_columns(text):
#     doc = nlp(text)
#     mapping = {"chest_pain": 0, "knee_pain": 0, "high_fever": 0, "itching": 0, "rash": 0}
#     for ent in doc.ents:
#         if ent.label_ == "CHEST_PAIN":
#             mapping["chest_pain"] = 1
#         elif ent.label_ == "KNEE_PAIN":
#             mapping["knee_pain"] = 1
#         elif ent.label_ == "HIGH_FEVER":
#             mapping["high_fever"] = 1
#         elif ent.label_ == "ITCHING":
#             mapping["itching"] = 1
#         elif ent.label_ == "RASH":
#             mapping["rash"] = 1
#     return mapping

In [79]:
import re

# Function to map keywords to prediction model columns
def map_keywords_to_columns(text, symptom_keywords):
    # Initialize mapping with zeros for all symptoms
    mapping = {symptom: 0 for symptom in symptom_keywords.keys()}
    
    # Check for keyword matches in the text
    for symptom, keys in symptom_keywords.items():
        pattern = re.compile(r'\b(' + '|'.join(keys) + r')\b', re.IGNORECASE)
        matches = pattern.findall(text)
        if matches:
            mapping[symptom] = 1
            
    return mapping

In [80]:
text = "I have itching and knee pain."

result = map_keywords_to_columns(text, keywords_new)
print(result)

{'itching': 1, 'skin_rash': 0, 'nodal_skin_eruptions': 0, 'continuous_sneezing': 0, 'shivering': 0, 'chills': 0, 'joint_pain': 0, 'stomach_pain': 0, 'acidity': 0, 'ulcers_on_tongue': 0, 'muscle_wasting': 0, 'vomiting': 0, 'burning_micturition': 0, 'spotting_urination': 0, 'fatigue': 0, 'weight_gain': 0, 'anxiety': 0, 'cold_hands_and_feets': 0, 'mood_swings': 0, 'weight_loss': 0, 'restlessness': 0, 'lethargy': 0, 'patches_in_throat': 0, 'irregular_sugar_level': 0, 'cough': 0, 'high_fever': 0, 'sunken_eyes': 0, 'breathlessness': 0, 'sweating': 0, 'dehydration': 0, 'indigestion': 0, 'headache': 0, 'yellowish_skin': 0, 'dark_urine': 0, 'nausea': 0, 'loss_of_appetite': 0, 'pain_behind_the_eyes': 0, 'back_pain': 0, 'constipation': 0, 'abdominal_pain': 0, 'diarrhoea': 0, 'mild_fever': 0, 'yellow_urine': 0, 'yellowing_of_eyes': 0, 'acute_liver_failure': 0, 'fluid_overload': 0, 'swelling_of_stomach': 0, 'swelled_lymph_nodes': 0, 'malaise': 0, 'blurred_and_distorted_vision': 0, 'phlegm': 0, 'thr

In [81]:
# Example usage
text = "I have itching and knee pain."

result = map_entities_to_columns(text, keywords)
print(result)

{'chest_pain': 0, 'knee_pain': 0, 'high_fever': 0, 'itching': 1, 'rash': 0}


In [82]:
#### Combinign keyword and NER model

def map_symptoms(text):
    # First, use NER to map entities
    mapping = map_entities_to_columns(text, keywords_new)
    
    # Then, use refined keyword-based mapping as a fallback
    keyword_mapping = map_keywords_to_columns(text, keywords_new)
    
    # Update the mapping only if the NER did not find the symptom
    for symptom in mapping.keys():
        if mapping[symptom] == 0 and keyword_mapping[symptom] == 1:
            mapping[symptom] = 1
    return mapping

In [86]:
# Example usage with combined mapping
test_text = "I am 22 years old. I am facing nose itching and vomiting with i have a severe chest pain and high fever."
mapped_columns = map_symptoms(test_text)
print(f"Combined mapping: {mapped_columns}")

Combined mapping: {'itching': 1, 'skin_rash': 0, 'nodal_skin_eruptions': 0, 'continuous_sneezing': 0, 'shivering': 0, 'chills': 1, 'joint_pain': 0, 'stomach_pain': 0, 'acidity': 0, 'ulcers_on_tongue': 0, 'muscle_wasting': 0, 'vomiting': 1, 'burning_micturition': 0, 'spotting_urination': 0, 'fatigue': 0, 'weight_gain': 0, 'anxiety': 0, 'cold_hands_and_feets': 0, 'mood_swings': 0, 'weight_loss': 1, 'restlessness': 0, 'lethargy': 0, 'patches_in_throat': 0, 'irregular_sugar_level': 0, 'cough': 0, 'high_fever': 1, 'sunken_eyes': 0, 'breathlessness': 0, 'sweating': 0, 'dehydration': 0, 'indigestion': 0, 'headache': 0, 'yellowish_skin': 0, 'dark_urine': 0, 'nausea': 1, 'loss_of_appetite': 0, 'pain_behind_the_eyes': 0, 'back_pain': 0, 'constipation': 0, 'abdominal_pain': 0, 'diarrhoea': 0, 'mild_fever': 0, 'yellow_urine': 0, 'yellowing_of_eyes': 0, 'acute_liver_failure': 0, 'fluid_overload': 1, 'swelling_of_stomach': 0, 'swelled_lymph_nodes': 0, 'malaise': 0, 'blurred_and_distorted_vision': 0,