In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import pickle
import os
import warnings
import time
import matplotlib.pyplot as plt
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Conv1D, MaxPooling1D, Flatten, LSTM
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.callbacks import EarlyStopping

In [None]:
df = pd.read_csv('data/Final_Augmented_dataset_Diseases_and_Symptoms.csv')
print(f"Dataset loaded with {df.shape[0]} rows and {df.shape[1]} columns")

print("\nDisease distribution (top 10):")
print(df['diseases'].value_counts().head(10))
print(f"\nTotal unique diseases: {df['diseases'].nunique()}")

symptom_columns = [col for col in df.columns if col != 'diseases']

print(df)
print(symptom_columns)

Dataset loaded with 246945 rows and 378 columns

Disease distribution (top 10):
diseases
cystitis                          1219
vulvodynia                        1218
nose disorder                     1218
complex regional pain syndrome    1217
spondylosis                       1216
hypoglycemia                      1215
peripheral nerve disorder         1215
esophagitis                       1215
vaginal cyst                      1215
conjunctivitis due to allergy     1215
Name: count, dtype: int64

Total unique diseases: 773
                      diseases  anxiety and nervousness  depression  \
0               panic disorder                        1           0   
1               panic disorder                        0           0   
2               panic disorder                        1           1   
3               panic disorder                        1           0   
4               panic disorder                        1           1   
...                        ...           

In [None]:
X = df[symptom_columns]
y = df['diseases']

# Check for missing values
missing_values = X.isnull().sum().sum()
if missing_values > 0:
    print(f"Found {missing_values} missing values. Filling with 0")
    X = X.fillna(0)

# Handle imbalanced classes
class_counts = y.value_counts()
min_samples = 2  

diseases_to_keep = class_counts[class_counts >= min_samples].index
mask = y.isin(diseases_to_keep)

if mask.sum() < len(y):
    print(f"Filtering out {len(y) - mask.sum()} samples with rare diseases")
    X = X[mask]
    y = y[mask]

label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Split the data into training and testing sets
try:
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )
    X_train_enc, X_test_enc, y_train_enc, y_test_enc = train_test_split(
        X, y_encoded, test_size=0.2, random_state=42, stratify=y
    )
    print("Using stratified sampling for train-test split")
except ValueError as e:
    print(f"Stratified split failed: {str(e)}")
    print("Falling back to random split")
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )
    X_train_enc, X_test_enc, y_train_enc, y_test_enc = train_test_split(
        X, y_encoded, test_size=0.2, random_state=42
    )

num_classes = len(label_encoder.classes_)
y_train_categorical = to_categorical(y_train_enc, num_classes=num_classes)
y_test_categorical = to_categorical(y_test_enc, num_classes=num_classes)


Filtering out 19 samples with rare diseases
Using stratified sampling for train-test split


In [None]:
print("\nTraining Random Forest classifier...")
start_time = time.time()

model_rf = RandomForestClassifier(
    n_estimators=300,
    max_depth=20,
    min_samples_split=5,
    min_samples_leaf=1,
    max_features='sqrt',
    class_weight='balanced',
    random_state=42,
    n_jobs=-1
)
model_rf.fit(X_train, y_train)

y_pred_rf = model_rf.predict(X_test)
accuracy_rf = accuracy_score(y_test, y_pred_rf)
training_time_rf = time.time() - start_time

print(f"RF Model accuracy: {accuracy_rf:.4f} (trained in {training_time_rf:.2f} seconds)")

# Detailed classification report
print("\nRF Classification Report:")
print(classification_report(y_test, y_pred_rf))

# Feature importance analysis
feature_importance_rf = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': model_rf.feature_importances_
}).sort_values(by='Importance', ascending=False)

print("\nTop 10 Important Symptoms (RF):")
print(feature_importance_rf.head(10))



Training Random Forest classifier...
RF Model accuracy: 0.7571 (trained in 61.35 seconds)

RF Classification Report:


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


                                                          precision    recall  f1-score   support

                               abdominal aortic aneurysm       1.00      0.82      0.90        28
                                        abdominal hernia       0.99      0.98      0.98        81
                                         abscess of nose       0.62      0.83      0.71        58
                                     abscess of the lung       0.57      1.00      0.73         4
                                  abscess of the pharynx       0.89      0.79      0.84        68
                                    acanthosis nigricans       0.04      1.00      0.07         6
                                               acariasis       1.00      0.86      0.92         7
                                               achalasia       0.86      0.71      0.77        17
                                                    acne       0.45      0.64      0.53        99
                   

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
