In [7]:
import numpy as np
import pandas as pd
from sklearn.utils import shuffle

# Seed NumPy's global random state so the np.random draws are repeatable.
np.random.seed(42)

# Diagnostic labels and the total number of synthetic patients to generate.
conditions = ["Malaria", "Typhoid", "Both"]
n_samples = 50_000

# Symptom names grouped by body system. The groups are flattened in the
# order written here to form the final list of symptom column names.
_symptom_groups = (
    # General/systemic
    [
        "Fever", "Fatigue", "Weakness", "Loss_of_Appetite",
        "Weight_Loss", "Sweating", "Chills", "Headache",
        "Muscle_Pain", "Joint_Pain", "Dizziness", "Paleness",
    ],
    # Gastrointestinal
    ["Nausea", "Vomiting", "Abdominal_Pain", "Diarrhea", "Constipation", "Bloating"],
    # Neurological
    ["Confusion", "Insomnia", "Seizures", "Irritability"],
    # Respiratory
    ["Cough", "Shortness_of_Breath", "Chest_Pain", "Sore_Throat"],
    # Hematological/other
    ["Anemia", "Jaundice", "Bleeding_Gums", "Bruising"],
    # Skin/sensory
    ["Rash", "Itching", "Yellow_Eyes", "Eye_Pain", "Blurred_Vision"],
    # Urinary
    ["Dark_Urine", "Frequent_Urination", "Burning_Urine"],
    # Rare/systemic
    ["Back_Pain", "Dehydration", "Swelling_Feet"],
)
symptom_columns = [symptom for group in _symptom_groups for symptom in group]

# Generate base symptom probabilities (the "Both" profile is derived from the other two)
def generate_symptom_probs(n_symptoms=None):
    """Draw per-symptom occurrence probabilities for each condition.

    The malaria and typhoid vectors are sampled independently from uniform
    ranges, so they are not correlated with each other. The "both" vector
    is computed from their sum and is therefore correlated with each of
    them.

    Parameters
    ----------
    n_symptoms : int, optional
        How many probabilities to draw per condition. When omitted, the
        length of the module-level ``symptom_columns`` list is used, which
        is the original behaviour.

    Returns
    -------
    tuple of numpy.ndarray
        ``(malaria, typhoid, both)``, each of shape ``(n_symptoms,)``.

    Notes
    -----
    Draws from NumPy's global random state, malaria first and typhoid
    second; keeping that order keeps seeded runs reproducible.
    """
    if n_symptoms is None:
        n_symptoms = len(symptom_columns)

    malaria = np.random.uniform(0.3, 0.8, n_symptoms)
    typhoid = np.random.uniform(0.25, 0.75, n_symptoms)
    # (malaria + typhoid) / 1.4 can exceed 1 (up to about 1.11), so the
    # upper clip at 0.9 is what keeps it a valid probability.
    both = np.clip((malaria + typhoid) / 1.4, 0.3, 0.9)
    return malaria, typhoid, both

malaria_probs, typhoid_probs, both_probs = generate_symptom_probs()

# Class proportions: share of n_samples assigned to each condition.
proportions = {"Malaria": 0.64, "Typhoid": 0.35, "Both": 0.01}
if not np.isclose(sum(proportions.values()), 1.0):
    raise ValueError("Class proportions must sum to 1.")

# Symptom probability vector per condition (built once, not per iteration).
condition_probs = {
    "Malaria": malaria_probs,
    "Typhoid": typhoid_probs,
    "Both": both_probs,
}

# (mean, std) of the normal distribution each continuous clinical feature is
# drawn from. The distributions overlap between conditions on purpose.
# Key order is significant: it fixes both the column order and the order in
# which random numbers are consumed (Temperature, Heart_Rate, Platelet_Count).
# NOTE(review): these normals are unbounded, so with std 40-50 around a mean
# of 145-170 a few Platelet_Count draws can come out negative; truncate if
# downstream code assumes strictly positive counts.
clinical_params = {
    "Malaria": {
        "Temperature": (38.7, 0.6),
        "Heart_Rate": (95, 12),
        "Platelet_Count": (150, 40),
    },
    "Typhoid": {
        "Temperature": (38.4, 0.6),
        "Heart_Rate": (88, 13),
        "Platelet_Count": (170, 45),
    },
    "Both": {
        "Temperature": (39.0, 0.5),
        "Heart_Rate": (98, 14),
        "Platelet_Count": (145, 50),
    },
}

# Synthetic dataset builder: one DataFrame per condition, concatenated below.
data = []
for condition, p in proportions.items():
    # round() instead of int(): a float product that lands just below an
    # integer (e.g. 0.29 * 100 == 28.999999999999996) would otherwise be
    # truncated and silently drop a row.
    n = int(round(n_samples * p))
    probs = condition_probs[condition]

    # Binary symptom presence: one Bernoulli draw per patient and symptom.
    symptoms = np.random.binomial(1, probs, size=(n, len(symptom_columns)))
    df_part = pd.DataFrame(symptoms, columns=symptom_columns)
    df_part["Condition"] = condition

    # Continuous clinical features (overlapping distributions + noise).
    for feature, (mu, sigma) in clinical_params[condition].items():
        df_part[feature] = np.random.normal(mu, sigma, n)

    # Pure-noise column and features derived from the columns above.
    df_part["Noise"] = np.random.rand(n)
    df_part["Symptom_Count"] = df_part[symptom_columns].sum(axis=1)
    df_part["Severity_Index"] = (
        0.3 * df_part["Temperature"] +
        0.002 * df_part["Heart_Rate"] -
        0.001 * df_part["Platelet_Count"] +
        0.1 * df_part["Symptom_Count"] +
        np.random.normal(0, 0.05, n)
    )

    data.append(df_part)

# Combine & shuffle. No random_state is passed, so sklearn's shuffle draws
# from NumPy's global random state.
df_synth = shuffle(pd.concat(data, ignore_index=True)).reset_index(drop=True)

print(df_synth.head())
print("\nDataset shape:", df_synth.shape)
print("Number of symptom columns:", len(symptom_columns))
print("\nCondition proportions:")
print(df_synth["Condition"].value_counts(normalize=True).round(3))


   Fever  Fatigue  Weakness  Loss_of_Appetite  Weight_Loss  Sweating  Chills  \
0      0        1         1                 0            0         1       0   
1      1        1         1                 0            1         1       0   
2      1        0         0                 1            1         0       0   
3      0        0         0                 0            0         1       0   
4      0        1         1                 1            1         1       1   

   Headache  Muscle_Pain  Joint_Pain  ...  Back_Pain  Dehydration  \
0         1            0           1  ...          1            1   
1         1            1           1  ...          1            0   
2         1            0           0  ...          1            0   
3         0            1           0  ...          0            0   
4         1            1           1  ...          0            1   

   Swelling_Feet  Condition  Temperature  Heart_Rate  Platelet_Count  \
0              0    Malaria    3

In [8]:
# Write the synthetic dataset to disk as CSV; the row index is not saved.
output_csv = "synthetic_malaria_typhoid_dataset.csv"
df_synth.to_csv(output_csv, index=False)