In [5]:
import pandas as pd
import numpy as np

symptomsToDisease = pd.read_csv('../data/raw/symptoms/Disease and symptoms dataset.csv')

In [6]:
original_rows = len(symptomsToDisease)
original_cols = len(symptomsToDisease.columns)

symptom_cols = [c for c in symptomsToDisease.columns if c != "diseases"]
print(f"Original symptom columns: {len(symptom_cols)}")

Original symptom columns: 375


In [7]:
diseases = symptomsToDisease["diseases"].unique()
print(f"Total unique disease count: {len(diseases)}")
print(diseases)

Total unique disease count: 773
['panic disorder' 'vocal cord polyp' 'turner syndrome' 'cryptorchidism'
 'poisoning due to ethylene glycol' 'atrophic vaginitis'
 'fracture of the hand' 'cellulitis or abscess of mouth'
 'eye alignment disorder' 'headache after lumbar puncture'
 'pyloric stenosis' 'salivary gland disorder' 'osteochondrosis'
 'injury to the knee' 'metabolic disorder' 'vaginitis'
 'sick sinus syndrome' 'tinnitus of unknown cause' 'glaucoma'
 'eating disorder' 'transient ischemic attack' 'pyelonephritis'
 'rotator cuff injury' 'chronic pain disorder' 'problem during pregnancy'
 'liver cancer' 'atelectasis' 'injury to the hand' 'choledocholithiasis'
 'injury to the hip' 'cirrhosis' 'thoracic aortic aneurysm'
 'subdural hemorrhage' 'diabetic retinopathy' 'fibromyalgia'
 'ischemia of the bowel' 'fetal alcohol syndrome' 'peritonitis'
 'injury to the abdomen' 'acute pancreatitis' 'thrombophlebitis' 'asthma'
 'foreign body in the vagina' 'restless leg syndrome' 'emphysema'
 'cyst

In [8]:
final_diseases_to_remove = [
    # --- TRAUMATIC INJURIES AND ORTHOPEDICS ---
    # Rationale: These are diagnosed via physical trauma history or X-ray/imaging. 
    # AI prediction from systemic symptoms is redundant and clinically unnecessary.
    'birth trauma', 'bone spur of the calcaneous', 'concussion', 'corneal abrasion', 
    'crushing injury', 'dislocation of the ankle', 'dislocation of the elbow', 
    'dislocation of the finger', 'dislocation of the foot', 'dislocation of the hip', 
    'dislocation of the knee', 'dislocation of the patella', 'dislocation of the shoulder', 
    'dislocation of the vertebra', 'dislocation of the wrist', 'fracture of the ankle', 
    'fracture of the arm', 'fracture of the facial bones', 'fracture of the finger', 
    'fracture of the foot', 'fracture of the hand', 'fracture of the jaw', 
    'fracture of the leg', 'fracture of the neck', 'fracture of the patella', 
    'fracture of the pelvis', 'fracture of the rib', 'fracture of the shoulder', 
    'fracture of the skull', 'fracture of the vertebra', 'head injury', 
    'heart contusion', 'hematoma', 'injury of the ankle', 'injury to internal organ', 
    'injury to the abdomen', 'injury to the arm', 'injury to the face', 
    'injury to the finger', 'injury to the hand', 'injury to the hip', 
    'injury to the knee', 'injury to the leg', 'injury to the shoulder', 
    'injury to the spinal cord', 'injury to the trunk', 'joint effusion', 
    'knee ligament or meniscus tear', 'lung contusion', 'rotator cuff injury', 
    'sprain or strain',

    # --- OPEN WOUNDS AND POST-SURGICAL ---
    # Rationale: Diagnosed by visual inspection; these are acute physical findings, 
    # not "diseases" to be predicted by pattern recognition.
    'infection of open wound', 'open wound due to trauma', 'open wound from surgical incision', 
    'open wound of the abdomen', 'open wound of the arm', 'open wound of the back', 
    'open wound of the cheek', 'open wound of the chest', 'open wound of the ear', 
    'open wound of the eye', 'open wound of the face', 'open wound of the finger', 
    'open wound of the foot', 'open wound of the head', 'open wound of the jaw', 
    'open wound of the knee', 'open wound of the lip', 'open wound of the mouth', 
    'open wound of the neck', 'open wound of the nose', 'open wound of the shoulder', 
    'pain after an operation', 'postoperative infection', 'burn',

    # --- MINOR AILMENTS AND ROUTINE ILLNESSES ---
    # Rationale: These are "low-stakes" diagnoses that a doctor identifies in 
    # seconds. Including them lowers the utility of a high-tech prediction model.
    'acne', 'actinic keratosis', 'athlete\'s foot', 'broken tooth', 'bunion', 
    'callus', 'chalazion', 'chickenpox', 'cold sore', 'common cold', 
    'dental caries', 'diaper rash', 'ear wax impaction', 'epistaxis (nose disorder)', 
    'flat feet', 'flu', 'gum disease', 'hammer toe', 'impetigo', 'ingrown toe nail', 
    'intertrigo (skin condition)', 'lice', 'mumps', 'oral thrush (yeast infection)', 
    'pinguecula', 'pityriasis rosea', 'scabies', 'sebaceous cyst', 
    'seborrheic keratosis', 'skin polyp', 'stye', 'sunburn', 'teething syndrome', 
    'tooth abscess', 'tooth disorder', 'viral warts',

    # --- EXTERNAL CAUSES, POISONING, AND ENVIRONMENT ---
    # Rationale: Diagnosis is based on toxicology or patient history of exposure. 
    # AI pattern matching symptoms won't replace a blood test for ethylene glycol.
    'carbon monoxide poisoning', 'envenomation from spider or animal bite', 
    'frostbite', 'heat exhaustion', 'heat stroke', 'hypothermia', 
    'insect bite', 'insulin overdose', 'poisoning due to analgesics', 
    'poisoning due to anticonvulsants', 'poisoning due to antidepressants', 
    'poisoning due to antimicrobial drugs', 'poisoning due to antipsychotics', 
    'poisoning due to antihypertensives', 'poisoning due to ethylene glycol', 
    'poisoning due to gas', 'poisoning due to opioids', 'poisoning due to sedatives', 
    'drug poisoning due to medication', 'alcohol intoxication',

    # --- PHYSIOLOGICAL STATES AND LIFESTYLE ---
    # Rationale: Pregnancy/Menopause are life stages. Substance abuse is a behavioral 
    # history/social finding, not a clinical diagnostic puzzle for AI.
    'pregnancy', 'menopause', 'fetal alcohol syndrome', 'normal pressure hydrocephalus', 
    'induced abortion', 'spontaneous abortion', 'missed abortion', 'mastectomy', 
    'alcohol abuse', 'drug abuse', 'drug abuse (barbiturates)', 'drug abuse (cocaine)', 
    'drug abuse (methamphetamine)', 'drug abuse (opioids)', 'marijuana abuse', 
    'smoking or tobacco addiction',

    # --- VISUAL FINDINGS ---
    # Rationale: Doctors can see these immediately upon looking at the patient.
    'eye alignment disorder'
]

print(f"Number of diseases to remove: {len(final_diseases_to_remove)}")
print(f"Total diseases after removing: {symptomsToDisease['diseases'].nunique() - len(final_diseases_to_remove)}")

Number of diseases to remove: 148
Total diseases after removing: 625


In [9]:
# Find diseases in your 'remove' list that aren't in your dataframe
missing_from_df = set(final_diseases_to_remove) - set(symptomsToDisease['diseases'].unique())

print(f"These {len(missing_from_df)} diseases were in your list but not found in the data:")
print(missing_from_df)

symptomsToDisease = symptomsToDisease[
    ~symptomsToDisease['diseases'].str.lower().str.strip().isin(
        [d.lower().strip() for d in final_diseases_to_remove]
    )
].reset_index(drop=True)
print(f"Total diseases left: {symptomsToDisease['diseases'].nunique()}")
print(f"\nFinal rows: {len(symptomsToDisease):,}")


These 2 diseases were in your list but not found in the data:
{'epistaxis (nose disorder)', 'sunburn'}
Total diseases left: 627

Final rows: 206,267


In [10]:
print(symptomsToDisease['diseases'].unique())

['panic disorder' 'vocal cord polyp' 'turner syndrome' 'cryptorchidism'
 'atrophic vaginitis' 'cellulitis or abscess of mouth'
 'headache after lumbar puncture' 'pyloric stenosis'
 'salivary gland disorder' 'osteochondrosis' 'metabolic disorder'
 'vaginitis' 'sick sinus syndrome' 'tinnitus of unknown cause' 'glaucoma'
 'eating disorder' 'transient ischemic attack' 'pyelonephritis'
 'chronic pain disorder' 'problem during pregnancy' 'liver cancer'
 'atelectasis' 'choledocholithiasis' 'cirrhosis'
 'thoracic aortic aneurysm' 'subdural hemorrhage' 'diabetic retinopathy'
 'fibromyalgia' 'ischemia of the bowel' 'peritonitis' 'acute pancreatitis'
 'thrombophlebitis' 'asthma' 'foreign body in the vagina'
 'restless leg syndrome' 'emphysema' 'cysticercosis'
 'infectious gastroenteritis' 'acute sinusitis'
 'substance-related mental disorder' 'postpartum depression'
 'coronary atherosclerosis' 'spondylitis' 'pituitary adenoma'
 'uterine fibroids' 'idiopathic nonmenstrual bleeding' 'ovarian torsio

In [11]:
# Using disease mapping to standardize disease names
import json

with open("../data/disease_mapping.json") as f:
    category_map = json.load(f)

In [12]:
# Create a flat list of all diseases in the JSON categories
mapped_diseases = []
for category in category_map:
    mapped_diseases.extend(category_map[category])

# Check for any diseases in your dataset that were missed by the mapping
missing_from_json = set(symptomsToDisease['diseases'].unique()) - set(mapped_diseases)

print(f"Diseases not yet categorized: {len(missing_from_json)}")
print(missing_from_json)

Diseases not yet categorized: 86
{'hypertrophic obstructive cardiomyopathy (hocm)', 'allergy to animals', 'fibroadenoma', 'acute glaucoma', 'breast cancer', 'dermatitis due to sun exposure', 'foreign body in the throat', 'gestational diabetes', 'chronic pain disorder', 'headache after lumbar puncture', 'thoracic outlet syndrome', 'foreign body in the ear', 'acariasis', 'hypovolemia', 'lactose intolerance', 'primary immunodeficiency', 'abscess of the pharynx', 'trichomonas infection', 'down syndrome', 'acute kidney injury', 'ectropion', 'hyperosmotic hyperketotic state', 'hepatitis due to a toxin', 'gonorrhea', 'hemorrhoids', 'breast infection (mastitis)', 'seasonal allergies (hay fever)', 'vertebrobasilar insufficiency', 'fluid overload', 'itching of unknown cause', 'fibrocystic breast disease', 'foreign body in the gastrointestinal tract', 'abscess of the lung', 'edward syndrome', 'spina bifida', 'croup', 'acute otitis media', 'chronic rheumatic fever', 'temporomandibular joint disord

In [13]:
disease_to_category = {
    disease: category
    for category, diseases in category_map.items()
    for disease in diseases
}

symptomsToDisease['disease_category'] = symptomsToDisease['diseases'].map(disease_to_category)
symptomsToDisease.fillna({'disease_category': 'Unknown Type'}, inplace=True)
display(symptomsToDisease[['diseases', 'disease_category']].sample(10, random_state=42))

Unnamed: 0,diseases,disease_category
101782,cervicitis,Genitourinary and Reproductive
195937,breast infection (mastitis),Unknown Type
55860,gout,Endocrine and Metabolic
1417,cellulitis or abscess of mouth,Ophthalmology and ENT
26146,bursitis,Musculoskeletal
109842,acute bronchiolitis,Respiratory System
82919,mittelschmerz,Genitourinary and Reproductive
88332,dry eye of unknown cause,Ophthalmology and ENT
14346,infectious gastroenteritis,Infectious Diseases
79467,nose disorder,Ophthalmology and ENT


In [14]:
missing = symptomsToDisease.isnull().sum().sum()
print(f"Missing values: {missing}")
if missing > 0:
    print(f"WARNING: Missing values detected!")

Missing values: 0


In [15]:
symptoms_per_patient = symptomsToDisease[symptom_cols].sum(axis=1)
print(f"   Avg symptoms per patient: {symptoms_per_patient.mean():.2f}")
print(f"   Min symptoms per patient: {symptoms_per_patient.min():.0f}")
print(f"   Max symptoms per patient: {symptoms_per_patient.max():.0f}")

print("-"*50)
disease_counts = symptomsToDisease['diseases'].value_counts()
print(f"   Total diseases: {symptomsToDisease['diseases'].nunique()}")
print(f"   Min samples per disease: {disease_counts.min()}")
print(f"   Max samples per disease: {disease_counts.max()}")

print("-"*50)

category_counts = symptomsToDisease['disease_category'].value_counts()
print(f"\nðŸ“ˆ Category Distribution:")
for cat, count in category_counts.items():
    print(f"   {cat:30s}: {count:5d} samples")

cat_imbalance = category_counts.max() / category_counts.min()
print(f"\nâœ… Category imbalance: {cat_imbalance:.1f}:1 (much better than {disease_counts.iloc[0] / disease_counts.iloc[-1]:.1f}:1)")

   Avg symptoms per patient: 5.32
   Min symptoms per patient: 1
   Max symptoms per patient: 12
--------------------------------------------------
   Total diseases: 627
   Min samples per disease: 1
   Max samples per disease: 1219
--------------------------------------------------

ðŸ“ˆ Category Distribution:
   Unknown Type                  : 27421 samples
   Genitourinary and Reproductive: 26542 samples
   Ophthalmology and ENT         : 24198 samples
   Gastrointestinal and Hepatic  : 21233 samples
   Musculoskeletal               : 17893 samples
   Cardiovascular and Circulatory: 16665 samples
   Mental and Behavioral Health  : 16092 samples
   Dermatological                : 15330 samples
   Neurological Disorders        : 10033 samples
   Respiratory System            :  9922 samples
   Endocrine and Metabolic       :  7323 samples
   Infectious Diseases           :  5786 samples
   Hematology and Oncology       :  4341 samples
   Obstetrics and Neonatal       :  3488 samples


In [16]:
# Prepare for normalization (Defer saving)
print(f"Dataset shape before normalization: {symptomsToDisease.shape}")
# We will save after the normalization step in the next section


Dataset shape before normalization: (206267, 377)


---
# Part 3: Symptom Normalization Pipeline

**Added from merged scripts**

Normalize symptoms, fix typos, and merge duplicate columns across all datasets.

In [17]:
import pandas as pd
import json
import sys
import shutil
import gc
from pathlib import Path
from datetime import datetime

# Add project root to path to import utils
project_root = Path.cwd().parent
if str(project_root) not in sys.path:
    sys.path.insert(0, str(project_root))

from utils.symptom_normalizer import normalize_symptom, validate_vocabulary, TYPO_MAP, SYNONYM_MAP

print(f"Project root: {project_root}")
print(f"Loaded normalizer with {len(TYPO_MAP)} typo rules and {len(SYNONYM_MAP)} synonym rules.")


Project root: c:\Users\henry\Desktop\Programming\Python\Multimodal_Diagnosis
Loaded normalizer with 20 typo rules and 91 synonym rules.


## Part 1: Clean Symptom Vocabulary
Ensure `data/symptom_vocabulary.json` is free of duplicates and typos.

In [18]:
vocab_path = project_root / "data" / "symptom_vocabulary.json"

# Load Vocabulary
with open(vocab_path) as f:
    symptoms = json.load(f)

print(f"Loaded {len(symptoms)} symptoms")

# Validate
valid_symptoms, issues = validate_vocabulary(symptoms)

print(f"Valid symptoms: {len(valid_symptoms)}")
print(f"Issues found: {len(issues)}")

for issue in issues:
    print(f"  - {issue}")

# Save if changes needed
if len(valid_symptoms) < len(symptoms) or issues:
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    backup_path = vocab_path.parent / f"{vocab_path.stem}_backup_{timestamp}{vocab_path.suffix}"
    shutil.copy2(vocab_path, backup_path)
    print(f"Backup created: {backup_path.name}")
    
    with open(vocab_path, 'w') as f:
        json.dump(sorted(valid_symptoms), f, indent=2)
    print(f"Saved cleaned vocabulary ({len(valid_symptoms)} symptoms)")
else:
    print("Vocabulary is already clean.")

Loaded 454 symptoms
Valid symptoms: 454
Issues found: 5
  - removed
  - merged
  - typo_fixed
  - original_count
  - final_count
Backup created: symptom_vocabulary_backup_20260117_002704.json
Saved cleaned vocabulary (454 symptoms)


## Part 2: Clean Datasets
Scan all CSV files, rename columns to match the clean vocabulary, and merge any duplicates.

In [19]:
# Configuration
NON_SYMPTOM_COLS = {'diseases', 'disease_category', 'symptoms', 'age', 'gender', 
                    'age_group', 'weight', 'height', 'bmi', 'occupation'}

def find_processing_targets(columns: list) -> dict:
    """Find columns that normalize to the same symptom or need renaming."""
    normalized_map = {}
    for col in columns:
        if col.lower() in NON_SYMPTOM_COLS:
            continue
        norm = normalize_symptom(col)
        if norm not in normalized_map:
            normalized_map[norm] = []
        normalized_map[norm].append(col)
    
    targets = {}
    for norm, cols in normalized_map.items():
        # If multiple original cols map to same norm -> MERGE
        if len(cols) > 1:
            targets[norm] = cols
        # If single original col doesn't match norm -> RENAME
        elif len(cols) == 1 and cols[0] != norm:
            targets[norm] = cols
            
    return targets

def apply_normalization(df: pd.DataFrame, dataframe_name: str = "DataFrame"):
    print(f"\nProcessing: {dataframe_name}...")
    targets = find_processing_targets(df.columns.tolist())
    
    if not targets:
        print("  -> No changes needed.")
        return df, False
    
    changes = {'merged': 0, 'renamed': 0}
    
    for canonical, cols in targets.items():
        if len(cols) > 1:
            # MERGE
            merged_values = df[cols].max(axis=1)
            df.drop(columns=cols, inplace=True)
            df[canonical] = merged_values
            print(f"  Merged {cols} -> '{canonical}'")
            changes['merged'] += 1
        else:
            # RENAME
            original = cols[0]
            df.rename(columns={original: canonical}, inplace=True)
            print(f"  Renamed '{original}' -> '{canonical}'")
            changes['renamed'] += 1
            
    print(f"  -> normalization complete. (Merged: {changes['merged']}, Renamed: {changes['renamed']})")
    return df, True

# 1. Normalize the in-memory dataframe `symptomsToDisease`
print("--- Normalizing In-Memory Data ---")
symptomsToDisease, changed = apply_normalization(symptomsToDisease, "symptomsToDisease")

# 2. Save the final cleaned dataframe
output_path = project_root / "data" / "processed" / "symptoms" / "symptoms_to_disease_cleaned.csv"
print(f"\nSaving to: {output_path}")

# Optimization: Downcast types before saving to reduce memory usage during write
for col in symptomsToDisease.select_dtypes(include=['float']).columns:
    symptomsToDisease[col] = pd.to_numeric(symptomsToDisease[col], downcast='float')
for col in symptomsToDisease.select_dtypes(include=['int']).columns:
    symptomsToDisease[col] = pd.to_numeric(symptomsToDisease[col], downcast='integer')

symptomsToDisease.to_csv(output_path, index=False)
print("Saved successfully.")

# 3. Memory Cleanup
print("\n--- Memory Cleanup ---")
del symptomsToDisease
gc.collect()
print("Cleared symptomsToDisease from memory.")


--- Normalizing In-Memory Data ---

Processing: symptomsToDisease...
  -> No changes needed.

Saving to: c:\Users\henry\Desktop\Programming\Python\Multimodal_Diagnosis\data\processed\symptoms\symptoms_to_disease_cleaned.csv


Saved successfully.

--- Memory Cleanup ---
Cleared symptomsToDisease from memory.
