In [61]:
import pandas as pd
import numpy as np

symptomsToDisease = pd.read_csv('../data/raw/symptoms/Disease and symptoms dataset.csv')

In [62]:
original_rows = len(symptomsToDisease)
original_cols = len(symptomsToDisease.columns)

symptom_cols = [c for c in symptomsToDisease.columns if c != "diseases"]
print(f"Original symptom columns: {len(symptom_cols)}")

Original symptom columns: 377


In [63]:
diseases = symptomsToDisease["diseases"].unique()
print(f"Total unique disease count: {len(diseases)}")
print(diseases)

Total unique disease count: 773
['panic disorder' 'vocal cord polyp' 'turner syndrome' 'cryptorchidism'
 'poisoning due to ethylene glycol' 'atrophic vaginitis'
 'fracture of the hand' 'cellulitis or abscess of mouth'
 'eye alignment disorder' 'headache after lumbar puncture'
 'pyloric stenosis' 'salivary gland disorder' 'osteochondrosis'
 'injury to the knee' 'metabolic disorder' 'vaginitis'
 'sick sinus syndrome' 'tinnitus of unknown cause' 'glaucoma'
 'eating disorder' 'transient ischemic attack' 'pyelonephritis'
 'rotator cuff injury' 'chronic pain disorder' 'problem during pregnancy'
 'liver cancer' 'atelectasis' 'injury to the hand' 'choledocholithiasis'
 'injury to the hip' 'cirrhosis' 'thoracic aortic aneurysm'
 'subdural hemorrhage' 'diabetic retinopathy' 'fibromyalgia'
 'ischemia of the bowel' 'fetal alcohol syndrome' 'peritonitis'
 'injury to the abdomen' 'acute pancreatitis' 'thrombophlebitis' 'asthma'
 'foreign body in the vagina' 'restless leg syndrome' 'emphysema'
 'cyst

In [67]:
final_diseases_to_remove = [
    # --- TRAUMATIC INJURIES AND ORTHOPEDICS ---
    # Rationale: These are diagnosed via physical trauma history or X-ray/imaging. 
    # AI prediction from systemic symptoms is redundant and clinically unnecessary.
    'birth trauma', 'bone spur of the calcaneous', 'concussion', 'corneal abrasion', 
    'crushing injury', 'dislocation of the ankle', 'dislocation of the elbow', 
    'dislocation of the finger', 'dislocation of the foot', 'dislocation of the hip', 
    'dislocation of the knee', 'dislocation of the patella', 'dislocation of the shoulder', 
    'dislocation of the vertebra', 'dislocation of the wrist', 'fracture of the ankle', 
    'fracture of the arm', 'fracture of the facial bones', 'fracture of the finger', 
    'fracture of the foot', 'fracture of the hand', 'fracture of the jaw', 
    'fracture of the leg', 'fracture of the neck', 'fracture of the patella', 
    'fracture of the pelvis', 'fracture of the rib', 'fracture of the shoulder', 
    'fracture of the skull', 'fracture of the vertebra', 'head injury', 
    'heart contusion', 'hematoma', 'injury of the ankle', 'injury to internal organ', 
    'injury to the abdomen', 'injury to the arm', 'injury to the face', 
    'injury to the finger', 'injury to the hand', 'injury to the hip', 
    'injury to the knee', 'injury to the leg', 'injury to the shoulder', 
    'injury to the spinal cord', 'injury to the trunk', 'joint effusion', 
    'knee ligament or meniscus tear', 'lung contusion', 'rotator cuff injury', 
    'sprain or strain',

    # --- OPEN WOUNDS AND POST-SURGICAL ---
    # Rationale: Diagnosed by visual inspection; these are acute physical findings, 
    # not "diseases" to be predicted by pattern recognition.
    'infection of open wound', 'open wound due to trauma', 'open wound from surgical incision', 
    'open wound of the abdomen', 'open wound of the arm', 'open wound of the back', 
    'open wound of the cheek', 'open wound of the chest', 'open wound of the ear', 
    'open wound of the eye', 'open wound of the face', 'open wound of the finger', 
    'open wound of the foot', 'open wound of the head', 'open wound of the jaw', 
    'open wound of the knee', 'open wound of the lip', 'open wound of the mouth', 
    'open wound of the neck', 'open wound of the nose', 'open wound of the shoulder', 
    'pain after an operation', 'postoperative infection', 'burn',

    # --- MINOR AILMENTS AND ROUTINE ILLNESSES ---
    # Rationale: These are "low-stakes" diagnoses that a doctor identifies in 
    # seconds. Including them lowers the utility of a high-tech prediction model.
    'acne', 'actinic keratosis', 'athlete\'s foot', 'broken tooth', 'bunion', 
    'callus', 'chalazion', 'chickenpox', 'cold sore', 'common cold', 
    'dental caries', 'diaper rash', 'ear wax impaction', 'epistaxis (nose disorder)', 
    'flat feet', 'flu', 'gum disease', 'hammer toe', 'impetigo', 'ingrown toe nail', 
    'intertrigo (skin condition)', 'lice', 'mumps', 'oral thrush (yeast infection)', 
    'pinguecula', 'pityriasis rosea', 'scabies', 'sebaceous cyst', 
    'seborrheic keratosis', 'skin polyp', 'stye', 'sunburn', 'teething syndrome', 
    'tooth abscess', 'tooth disorder', 'viral warts',

    # --- EXTERNAL CAUSES, POISONING, AND ENVIRONMENT ---
    # Rationale: Diagnosis is based on toxicology or patient history of exposure. 
    # AI pattern matching symptoms won't replace a blood test for ethylene glycol.
    'carbon monoxide poisoning', 'envenomation from spider or animal bite', 
    'frostbite', 'heat exhaustion', 'heat stroke', 'hypothermia', 
    'insect bite', 'insulin overdose', 'poisoning due to analgesics', 
    'poisoning due to anticonvulsants', 'poisoning due to antidepressants', 
    'poisoning due to antimicrobial drugs', 'poisoning due to antipsychotics', 
    'poisoning due to antihypertensives', 'poisoning due to ethylene glycol', 
    'poisoning due to gas', 'poisoning due to opioids', 'poisoning due to sedatives', 
    'drug poisoning due to medication', 'alcohol intoxication',

    # --- PHYSIOLOGICAL STATES AND LIFESTYLE ---
    # Rationale: Pregnancy/Menopause are life stages. Substance abuse is a behavioral 
    # history/social finding, not a clinical diagnostic puzzle for AI.
    'pregnancy', 'menopause', 'fetal alcohol syndrome', 'normal pressure hydrocephalus', 
    'induced abortion', 'spontaneous abortion', 'missed abortion', 'mastectomy', 
    'alcohol abuse', 'drug abuse', 'drug abuse (barbiturates)', 'drug abuse (cocaine)', 
    'drug abuse (methamphetamine)', 'drug abuse (opioids)', 'marijuana abuse', 
    'smoking or tobacco addiction',

    # --- VISUAL FINDINGS ---
    # Rationale: Doctors can see these immediately upon looking at the patient.
    'eye alignment disorder'
]

print(f"Number of diseases to remove: {len(diseases_to_remove)}")
print(f"Total diseases after removing: {symptomsToDisease['diseases'].nunique() - len(diseases_to_remove)}")

Number of diseases to remove: 108
Total diseases after removing: 665


In [68]:
# Find diseases in your 'remove' list that aren't in your dataframe
missing_from_df = set(diseases_to_remove) - set(symptomsToDisease['diseases'].unique())

print(f"These {len(missing_from_df)} diseases were in your list but not found in the data:")
print(missing_from_df)

symptomsToDisease = symptomsToDisease[
    ~symptomsToDisease['diseases'].str.lower().str.strip().isin(
        [d.lower().strip() for d in diseases_to_remove]
    )
].reset_index(drop=True)
print(f"Total diseases left: {symptomsToDisease['diseases'].nunique()}")
print(f"\nFinal rows: {len(symptomsToDisease):,}")


These 1 diseases were in your list but not found in the data:
{'sunburn'}
Total diseases left: 667

Final rows: 222,720


In [69]:
print(symptomsToDisease['diseases'].unique())

['panic disorder' 'vocal cord polyp' 'turner syndrome' 'cryptorchidism'
 'atrophic vaginitis' 'cellulitis or abscess of mouth'
 'headache after lumbar puncture' 'pyloric stenosis'
 'salivary gland disorder' 'osteochondrosis' 'metabolic disorder'
 'vaginitis' 'sick sinus syndrome' 'tinnitus of unknown cause' 'glaucoma'
 'eating disorder' 'transient ischemic attack' 'pyelonephritis'
 'rotator cuff injury' 'chronic pain disorder' 'problem during pregnancy'
 'liver cancer' 'atelectasis' 'choledocholithiasis' 'cirrhosis'
 'thoracic aortic aneurysm' 'subdural hemorrhage' 'diabetic retinopathy'
 'fibromyalgia' 'ischemia of the bowel' 'fetal alcohol syndrome'
 'peritonitis' 'injury to the abdomen' 'acute pancreatitis'
 'thrombophlebitis' 'asthma' 'restless leg syndrome' 'emphysema'
 'cysticercosis' 'infectious gastroenteritis' 'acute sinusitis'
 'substance-related mental disorder' 'postpartum depression'
 'coronary atherosclerosis' 'spondylitis' 'pituitary adenoma'
 'uterine fibroids' 'idiopat

In [70]:
# Using disease mapping to standardize disease names
import json

with open("../data/disease_mapping.json") as f:
    category_map = json.load(f)

In [71]:
# Create a flat list of all diseases in the JSON categories
mapped_diseases = []
for category in category_map:
    mapped_diseases.extend(category_map[category])

# Check for any diseases in your dataset that were missed by the mapping
missing_from_json = set(symptomsToDisease['diseases'].unique()) - set(mapped_diseases)

print(f"Diseases not yet categorized: {len(missing_from_json)}")
print(missing_from_json)

Diseases not yet categorized: 125
{'chronic ulcer', 'gum disease', 'septic arthritis', 'joint effusion', 'drug abuse', 'allergy to animals', 'drug withdrawal', 'impetigo', 'down syndrome', 'chronic rheumatic fever', 'ganglion cyst', 'mucositis', 'drug abuse (barbiturates)', 'acute bronchospasm', 'hypovolemia', 'brachial neuritis', 'hammer toe', 'chronic pain disorder', 'pain disorder affecting the neck', 'sprain or strain', 'fetal alcohol syndrome', 'seasonal allergies (hay fever)', 'gestational diabetes', 'smoking or tobacco addiction', 'dental caries', 'hypertrophic obstructive cardiomyopathy (hocm)', 'marijuana abuse', 'atrophy of the corpus cavernosum', 'developmental disability', 'acute glaucoma', 'hydrocephalus', 'insulin overdose', 'allergy', 'alcohol withdrawal', 'pityriasis rosea', 'lactose intolerance', 'chickenpox', 'acute otitis media', 'edward syndrome', 'hematoma', 'primary immunodeficiency', 'muscle spasm', 'breast cyst', 'birth trauma', 'postoperative infection', 'absce

In [72]:
disease_to_category = {
    disease: category
    for category, diseases in category_map.items()
    for disease in diseases
}

symptomsToDisease['disease_category'] = symptomsToDisease['diseases'].map(disease_to_category)
symptomsToDisease.fillna({'disease_category': 'Unknown Type'}, inplace=True)
display(symptomsToDisease[['diseases', 'disease_category']].sample(10, random_state=42))

Unnamed: 0,diseases,disease_category
85167,nose disorder,Ophthalmology and ENT
87527,muscle spasm,Unknown Type
89475,urge incontinence,Genitourinary and Reproductive
11936,thrombophlebitis,Cardiovascular and Circulatory
22628,cornea infection,Ophthalmology and ENT
13013,asthma,Respiratory System
87204,muscle spasm,Unknown Type
160548,post-traumatic stress disorder (ptsd),Mental and Behavioral Health
106356,skin disorder,Dermatological
122314,pyogenic skin infection,Dermatological


In [73]:
missing = symptomsToDisease.isnull().sum().sum()
print(f"Missing values: {missing}")
if missing > 0:
    print(f"WARNING: Missing values detected!")

Missing values: 0


In [74]:
symptoms_per_patient = symptomsToDisease[symptom_cols].sum(axis=1)
print(f"   Avg symptoms per patient: {symptoms_per_patient.mean():.2f}")
print(f"   Min symptoms per patient: {symptoms_per_patient.min():.0f}")
print(f"   Max symptoms per patient: {symptoms_per_patient.max():.0f}")

print("-"*50)
disease_counts = symptomsToDisease['diseases'].value_counts()
print(f"   Total diseases: {symptomsToDisease['diseases'].nunique()}")
print(f"   Min samples per disease: {disease_counts.min()}")
print(f"   Max samples per disease: {disease_counts.max()}")

print("-"*50)

category_counts = symptomsToDisease['disease_category'].value_counts()
print(f"\nðŸ“ˆ Category Distribution:")
for cat, count in category_counts.items():
    print(f"   {cat:30s}: {count:5d} samples")

cat_imbalance = category_counts.max() / category_counts.min()
print(f"\nâœ… Category imbalance: {cat_imbalance:.1f}:1 (much better than {disease_counts.iloc[0] / disease_counts.iloc[-1]:.1f}:1)")

   Avg symptoms per patient: 5.34
   Min symptoms per patient: 1
   Max symptoms per patient: 12
--------------------------------------------------
   Total diseases: 667
   Min samples per disease: 1
   Max samples per disease: 1219
--------------------------------------------------

ðŸ“ˆ Category Distribution:
   Unknown Type                  : 43769 samples
   Genitourinary and Reproductive: 26542 samples
   Ophthalmology and ENT         : 24198 samples
   Gastrointestinal and Hepatic  : 21233 samples
   Musculoskeletal               : 17893 samples
   Cardiovascular and Circulatory: 16665 samples
   Mental and Behavioral Health  : 16092 samples
   Dermatological                : 15435 samples
   Neurological Disorders        : 10033 samples
   Respiratory System            :  9922 samples
   Endocrine and Metabolic       :  7323 samples
   Infectious Diseases           :  5786 samples
   Hematology and Oncology       :  4341 samples
   Obstetrics and Neonatal       :  3488 samples


In [75]:
# Save the cleaned data

output_path = "../data/processed/symptoms/symptoms_to_disease_cleaned.csv"
symptomsToDisease.to_csv(output_path, index=False)
print(f"Final dataset shape: {symptomsToDisease.shape}")
print(f"\nCleaned data saved to: {output_path}")

Final dataset shape: (222720, 379)

Cleaned data saved to: ../data/processed/symptoms/symptoms_to_disease_cleaned.csv
