# Data Cleaning: Symptom-Disease Dataset

This notebook cleans and prepares the raw symptom-disease data for training.

## Table of Contents
0. **Setup & Imports** - Load libraries and constants
1. **Data Loading** - Load raw dataset
2. **Disease Filtering** - Remove non-predictable diseases
3. **Category Assignment** - Map diseases to categories
4. **Data Quality Checks** - Verify integrity
5. **Symptom Normalization** - Standardize symptom columns
6. **Save Cleaned Data** - Export processed dataset

---

In [None]:
# Standard libraries
import pandas as pd
import numpy as np
import json
import sys
import shutil
import gc
from pathlib import Path
from datetime import datetime

# Project setup: when the kernel was started inside the notebooks/ folder the
# repository root is one level up; otherwise the working directory is the root.
working_dir = Path.cwd()
if working_dir.name == 'notebooks':
    project_root = working_dir.parent
else:
    project_root = working_dir
# Put the root first on sys.path so the project's `utils` package can be imported
sys.path.insert(0, str(project_root))

# Import project utilities and constants
from utils.symptom_normalizer import normalize_symptom, validate_vocabulary
from utils.consts import (
    TYPO_MAP, SYNONYM_MAP, PLURAL_MAP,
    NON_SYMPTOM_COLS, DISEASES_TO_EXCLUDE
)

# Echo what was loaded so a wrong root or an empty rule set is visible right away
print(
    f"Project root: {project_root}",
    f"Loaded {len(DISEASES_TO_EXCLUDE)} diseases to exclude",
    f"Loaded {len(TYPO_MAP)} typo rules, {len(SYNONYM_MAP)} synonym rules",
    sep="\n",
)

## 1. Data Loading

In [None]:
# Load raw data
raw_csv_path = project_root / 'data/raw/symptoms/Disease and symptoms dataset.csv'
symptomsToDisease = pd.read_csv(raw_csv_path)

# Remember the untouched dimensions; the filtering step reports removed rows against them
original_rows, original_cols = symptomsToDisease.shape
# Every column other than the 'diseases' label is treated as a symptom column
symptom_cols = [name for name in symptomsToDisease.columns if name != 'diseases']

print(f"Loaded dataset: {original_rows:,} rows x {original_cols} columns")
print(f"Symptom columns: {len(symptom_cols)}")
print(f"Unique diseases: {symptomsToDisease['diseases'].nunique()}")

## 2. Disease Filtering

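Remove conditions that result from physical trauma, or that are diagnosed through imaging or lab tests, rather than from symptom patterns. The exclusion list is the `DISEASES_TO_EXCLUDE` constant, matched case-insensitively.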

In [None]:
# Filter out non-predictable diseases using centralized constant
print(f"Diseases to exclude: {len(DISEASES_TO_EXCLUDE)}")

# Normalize both sides the same way (lowercase + trimmed) and do it once, so the
# reported match count and the filter applied below can never disagree.
excluded_names = {d.lower().strip() for d in DISEASES_TO_EXCLUDE}
normalized_diseases = symptomsToDisease['diseases'].str.lower().str.strip()

# Check which diseases from exclusion list are actually in our data
diseases_in_data = set(normalized_diseases.unique())
exclusions_found = excluded_names & diseases_in_data
print(f"Exclusions matching our data: {len(exclusions_found)}")

# Apply filter: keep rows whose normalized disease name is not excluded.
# reset_index gives the surviving rows a fresh 0..n-1 index.
symptomsToDisease = symptomsToDisease[
    ~normalized_diseases.isin(excluded_names)
].reset_index(drop=True)

print(f"\nAfter filtering:")
print(f"  Rows: {len(symptomsToDisease):,} (removed {original_rows - len(symptomsToDisease):,})")
print(f"  Diseases: {symptomsToDisease['diseases'].nunique()}")

## 3. Category Assignment

Map each disease to a medical category using the `disease_mapping.json` file. Diseases that do not appear in the mapping are labelled `Unknown Type`.

In [None]:
# Load disease-to-category mapping.
# JSON is UTF-8 by specification, so read it as such instead of relying on the
# platform's default text encoding.
with open(project_root / 'data/disease_mapping.json', encoding='utf-8') as f:
    category_map = json.load(f)

# Create flat lookup: invert {category: [disease, ...]} into {disease: category}
disease_to_category = {
    disease: category
    for category, diseases in category_map.items()
    for disease in diseases
}

# Apply mapping. Diseases missing from the lookup map to NaN and are labelled
# 'Unknown Type'. The filled Series is assigned back to the column instead of
# calling fillna(inplace=True) on df[col]: that form is chained assignment, which
# is deprecated in pandas 2.x and does not update the frame under Copy-on-Write.
# NOTE(review): the lookup is an exact match on the raw 'diseases' value, whereas
# the exclusion filter compares lowercased/stripped names — confirm the keys in
# disease_mapping.json use the same spelling and case as the dataset.
symptomsToDisease['disease_category'] = (
    symptomsToDisease['diseases'].map(disease_to_category).fillna('Unknown Type')
)

# Show category distribution
print("Category Distribution:")
for cat, count in symptomsToDisease['disease_category'].value_counts().items():
    print(f"  {cat:35s}: {count:5d} samples")

## 4. Data Quality Checks

In [None]:
# Check for missing values (total count of null cells across the whole frame)
missing = symptomsToDisease.isna().sum().sum()
print(f"Missing values: {missing}")

# Check symptoms per patient: row-wise total over the symptom columns
symptoms_per_patient = symptomsToDisease.loc[:, symptom_cols].sum(axis="columns")
mean_symptoms = symptoms_per_patient.mean()
fewest_symptoms = symptoms_per_patient.min()
most_symptoms = symptoms_per_patient.max()
print(f"\nSymptoms per patient:")
print(f"  Mean: {mean_symptoms:.2f}")
print(f"  Min:  {fewest_symptoms:.0f}")
print(f"  Max:  {most_symptoms:.0f}")

# Check disease distribution: number of rows available for each disease label
disease_counts = symptomsToDisease['diseases'].value_counts()
smallest_class = disease_counts.min()
largest_class = disease_counts.max()
print(f"\nDisease distribution:")
print(f"  Total diseases: {len(disease_counts)}")
print(f"  Min samples:    {smallest_class}")
print(f"  Max samples:    {largest_class}")
# Ratio between the most and least represented disease
print(f"  Imbalance:      {largest_class / smallest_class:.1f}:1")

## 5. Symptom Normalization

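Standardize symptom column names using the normalizer. Columns that normalize to the same name are merged into a single column (row-wise maximum); a column whose name merely differs from its normalized form is renamed.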

In [None]:
def find_processing_targets(columns: list) -> dict:
    """Group columns by normalized symptom name and return the groups that need work.

    Columns whose lowercased name appears in NON_SYMPTOM_COLS are ignored. Every
    other column is passed through normalize_symptom and grouped under the result.

    Args:
        columns: Column names to inspect.

    Returns:
        Dict mapping each normalized name to the list of original columns that
        produced it, restricted to groups that either contain several columns
        (merge candidates) or a single column whose name differs from the
        normalized name (rename candidates). Insertion order follows the first
        appearance of each normalized name in `columns`.
    """
    groups = {}
    for column in columns:
        if column.lower() not in NON_SYMPTOM_COLS:
            groups.setdefault(normalize_symptom(column), []).append(column)

    # A group made up solely of its own canonical name is already clean.
    return {
        canonical: members
        for canonical, members in groups.items()
        if members != [canonical]
    }

def apply_normalization(df: pd.DataFrame, name: str = 'DataFrame') -> tuple:
    """Apply symptom normalization to a dataframe.

    Uses find_processing_targets on the frame's columns, then for each target:
    several source columns are merged into one canonical column holding their
    row-wise maximum, and a single differently-named column is renamed.

    The frame is modified in place and also returned.

    Args:
        df: Frame whose columns should be normalized.
        name: Label used only in the progress messages.

    Returns:
        (df, changed): the same frame object, and True when at least one column
        was merged or renamed, False when nothing needed changing.
    """
    print(f"\nProcessing: {name}...")
    targets = find_processing_targets(df.columns.tolist())

    if not targets:
        print("  -> No changes needed.")
        return df, False

    for canonical, cols in targets.items():
        if len(cols) > 1:
            # Merge columns. Compute the merged values first, drop the sources,
            # and only then write the canonical column: the canonical name may
            # itself be one of the sources (e.g. ['fever', 'Fever'] -> 'fever'),
            # and writing it before the drop would delete the merged result.
            merged = df[cols].max(axis=1)
            df.drop(columns=cols, inplace=True)
            df[canonical] = merged
            print(f"  Merged {cols} -> '{canonical}'")
        else:
            # Rename column
            df.rename(columns={cols[0]: canonical}, inplace=True)
            print(f"  Renamed '{cols[0]}' -> '{canonical}'")

    return df, True

In [None]:
# Apply normalization; `changed` records whether any column was merged or renamed
symptomsToDisease, changed = apply_normalization(
    symptomsToDisease,
    name='symptomsToDisease',
)
print(f"\nFinal shape: {symptomsToDisease.shape}")

## 6. Save Cleaned Data

In [None]:
# Optimize memory before saving: shrink float and integer columns to the smallest
# dtype that can hold their values.
# NOTE(review): CSV is a text format and does not store dtypes, so this only
# affects the in-memory frame, which is deleted below — confirm it is still wanted.
for col in symptomsToDisease.select_dtypes(include=['float']).columns:
    symptomsToDisease[col] = pd.to_numeric(symptomsToDisease[col], downcast='float')
for col in symptomsToDisease.select_dtypes(include=['int']).columns:
    symptomsToDisease[col] = pd.to_numeric(symptomsToDisease[col], downcast='integer')

# Save. Create the output directory first: to_csv does not create missing parent
# directories, so a fresh checkout without data/processed/symptoms/ would fail here.
output_path = project_root / 'data/processed/symptoms/symptoms_to_disease_cleaned.csv'
output_path.parent.mkdir(parents=True, exist_ok=True)
symptomsToDisease.to_csv(output_path, index=False)
print(f"Saved to: {output_path}")
print(f"File size: {output_path.stat().st_size / 1024**2:.2f} MB")

# Cleanup: drop the frame and trigger a collection. After this cell the earlier
# cells must be re-run before `symptomsToDisease` can be used again.
del symptomsToDisease
gc.collect()
print("Memory cleared.")