In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import joblib

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score, roc_auc_score
from imblearn.over_sampling import RandomOverSampler
from statistics import mode

# ------------------------------------------
# Step 1: Load and Preprocess Dataset
# ------------------------------------------
data = pd.read_csv(r'C:\Users\USER\Downloads\improved_disease_dataset.csv')

# Encode target variable (disease)
encoder = LabelEncoder()
data["disease"] = encoder.fit_transform(data["disease"])

# Features & target
X = data.iloc[:, :-1]  # Symptoms
y = data.iloc[:, -1]  # Disease labels

# Handle class imbalance with oversampling
ros = RandomOverSampler(random_state=42)
X_resampled, y_resampled = ros.fit_resample(X, y)

# Split dataset into training & test sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# Standardize feature values for better model performance
scaler = StandardScaler()
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=X.columns)  # Ensure feature names are retained
X_test = pd.DataFrame(scaler.transform(X_test), columns=X.columns)

# ------------------------------------------
# Step 2: Hyperparameter Tuning
# ------------------------------------------

# Define hyperparameter grids
param_grid_rf = {"n_estimators": [50, 100, 200], "max_depth": [None, 10, 20]}
param_grid_svm = {"C": [0.1, 1, 10], "kernel": ["linear", "rbf"]}

# Perform GridSearchCV for Random Forest
grid_rf = GridSearchCV(RandomForestClassifier(random_state=42), param_grid_rf, cv=5)
grid_rf.fit(X_train, y_train)

# Perform GridSearchCV for SVM
grid_svm = GridSearchCV(SVC(probability=True, random_state=42), param_grid_svm, cv=5)  # FIX: `probability=True`
grid_svm.fit(X_train, y_train)

# Best hyperparameters
rf_model = grid_rf.best_estimator_
svm_model = grid_svm.best_estimator_
nb_model = GaussianNB()

# Train optimized models
rf_model.fit(X_train, y_train)
svm_model.fit(X_train, y_train)
nb_model.fit(X_train, y_train)

# ------------------------------------------
# Step 3: Model Evaluation
# ------------------------------------------
rf_preds = rf_model.predict(X_test)
nb_preds = nb_model.predict(X_test)
svm_preds = svm_model.predict(X_test)

rf_accuracy = accuracy_score(y_test, rf_preds)
nb_accuracy = accuracy_score(y_test, nb_preds)
svm_accuracy = accuracy_score(y_test, svm_preds)

# Calculate F1-score and ROC AUC for deeper evaluation
rf_f1 = f1_score(y_test, rf_preds, average="weighted")
nb_f1 = f1_score(y_test, nb_preds, average="weighted")
svm_f1 = f1_score(y_test, svm_preds, average="weighted")

rf_auc = roc_auc_score(y_test, rf_model.predict_proba(X_test), multi_class="ovr")
nb_auc = roc_auc_score(y_test, nb_model.predict_proba(X_test), multi_class="ovr")
svm_auc = roc_auc_score(y_test, svm_model.predict_proba(X_test), multi_class="ovr")  # FIX: Using `predict_proba()`

print(f"Random Forest - Accuracy: {rf_accuracy:.2f}, F1-score: {rf_f1:.2f}, AUC: {rf_auc:.2f}")
print(f"Naive Bayes - Accuracy: {nb_accuracy:.2f}, F1-score: {nb_f1:.2f}, AUC: {nb_auc:.2f}")
print(f"SVM - Accuracy: {svm_accuracy:.2f}, F1-score: {svm_f1:.2f}, AUC: {svm_auc:.2f}")

# Normalize model weights
total_accuracy = rf_accuracy + nb_accuracy + svm_accuracy
rf_weight = rf_accuracy / total_accuracy
nb_weight = nb_accuracy / total_accuracy
svm_weight = svm_accuracy / total_accuracy

print(f"Model Weights - RF: {rf_weight:.2f}, NB: {nb_weight:.2f}, SVM: {svm_weight:.2f}")

# ------------------------------------------
# Step 4: Feature Importance Analysis
# ------------------------------------------
feature_importance = rf_model.feature_importances_
sorted_idx = np.argsort(feature_importance)[::-1]

print("\nTop 10 Most Important Symptoms:")
for i in sorted_idx[:10]:  # Display top 10 features
    print(f"{X.columns[i]}: {feature_importance[i]:.4f}")

# ------------------------------------------
# Step 5: Weighted Voting for Final Prediction
# ------------------------------------------
def predict_disease(input_symptoms):
    input_symptoms = input_symptoms.split(",")
    input_data = [0] * len(X.columns)

    for symptom in input_symptoms:
        symptom = symptom.strip()
        if symptom in X.columns:
            input_data[X.columns.get_loc(symptom)] = 1

    input_data = pd.DataFrame([input_data], columns=X.columns)
    input_data = pd.DataFrame(scaler.transform(input_data), columns=X.columns)  # Apply scaling

    rf_pred = encoder.classes_[rf_model.predict(input_data)[0]]
    nb_pred = encoder.classes_[nb_model.predict(input_data)[0]]
    svm_pred = encoder.classes_[svm_model.predict(input_data)[0]]

    weighted_preds = {rf_pred: rf_weight, nb_pred: nb_weight, svm_pred: svm_weight}
    final_pred = max(weighted_preds, key=weighted_preds.get)

    return {
        "Random Forest Prediction": rf_pred,
        "Naive Bayes Prediction": nb_pred,
        "SVM Prediction": svm_pred,
        "Final Prediction": final_pred
    }

# ------------------------------------------
# Step 6: Save Trained Models
# ------------------------------------------
joblib.dump(rf_model, "random_forest_model.pkl")
joblib.dump(nb_model, "naive_bayes_model.pkl")
joblib.dump(svm_model, "svm_model.pkl")
joblib.dump(encoder, "label_encoder.pkl")
joblib.dump(scaler, "scaler.pkl")

print("Models saved successfully!")

# ------------------------------------------
# Step 7: Test the Function
# ------------------------------------------
user_input = "Fever,Headache,Nausea,Vomiting,Fatigue"
result = predict_disease(user_input)

print("Test Results:")
for model, prediction in result.items():
    print(f"{model}: {prediction}")


Random Forest - Accuracy: 0.53, F1-score: 0.52, AUC: 0.92
Naive Bayes - Accuracy: 0.16, F1-score: 0.17, AUC: 0.81
SVM - Accuracy: 0.54, F1-score: 0.54, AUC: 0.95
Model Weights - RF: 0.43, NB: 0.13, SVM: 0.44

Top 10 Most Important Symptoms:
fever: 0.1212
skin_rash: 0.1210
nausea: 0.1129
fatigue: 0.1096
headache: 0.1052
vomiting: 0.0994
weight_loss: 0.0881
joint_pain: 0.0846
yellow_eyes: 0.0835
cough: 0.0745
Models saved successfully!
Test Results:
Random Forest Prediction: Heart attack
Naive Bayes Prediction: Urinary tract infection
SVM Prediction: Heart attack
Final Prediction: Heart attack
