In [2]:
import pandas as pd
import numpy as np

def preprocess(data, min_occurrence_percentage=5):
    """One-hot encode symptoms and drop the rare ones.

    Every column of ``data`` other than ``'Disease'`` is treated as a symptom
    slot holding one symptom value per row (or a missing value). Each distinct
    symptom value becomes a 0/1 indicator column that is 1 when the row lists
    that symptom in any slot.

    Parameters
    ----------
    data : pandas.DataFrame
        Input table; must contain a ``'Disease'`` column. Not modified.
    min_occurrence_percentage : float, default 5
        A symptom is kept only if it is present in at least this percentage
        of rows.

    Returns
    -------
    pandas.DataFrame
        ``'Disease'`` followed by the retained indicator columns, in sorted
        symptom order.

    Raises
    ------
    KeyError
        If ``data`` has no ``'Disease'`` column.
    """
    df = data.copy()

    # Select symptom columns by name rather than by position, so the result
    # does not depend on 'Disease' being the first column of the input.
    symptom_cols = [col for col in df.columns if col != 'Disease']

    all_symptoms = set()
    for col in symptom_cols:
        all_symptoms.update(df[col].dropna().unique())

    symptom_values = df[symptom_cols]
    symptom_dict = {'Disease': df['Disease']}
    for symptom in sorted(all_symptoms):
        # Row-wise "any slot equals this symptom"; missing values compare False.
        symptom_dict[symptom] = symptom_values.eq(symptom).any(axis=1).astype(int)

    symptom_df = pd.DataFrame(symptom_dict)

    # The mean of a 0/1 column is the fraction of rows showing the symptom.
    symptom_occurrences = symptom_df.drop('Disease', axis=1).mean() * 100
    frequent_symptoms = symptom_occurrences[symptom_occurrences >= min_occurrence_percentage].index
    reduced_df = symptom_df[['Disease'] + list(frequent_symptoms)]

    print("\nDimensionality Reduction Summary:")
    print(f"Original number of symptoms: {len(all_symptoms)}")
    print(f"Number of symptoms after reduction: {len(frequent_symptoms)}")
    print(f"Symptoms removed: {len(all_symptoms) - len(frequent_symptoms)}")

    return reduced_df


# Read the raw symptom table, reduce it with preprocess(), and persist the
# result so the later cells can load it from data/final_data.csv.
data = pd.read_csv("data/dataset.csv")
processed_df = preprocess(data, min_occurrence_percentage=5)
processed_df.to_csv("data/final_data.csv", index=False)

# Every column except 'Disease' is a feature, hence the "- 1".
print(
    "\nFinal Dataset Info:",
    f"Number of samples: {processed_df.shape[0]}",
    f"Number of features: {processed_df.shape[1] - 1}",
    sep="\n",
)


Dimensionality Reduction Summary:
Original number of symptoms: 131
Number of symptoms after reduction: 31
Symptoms removed: 100

Final Dataset Info:
Number of samples: 4920
Number of features: 31


In [3]:
import pandas as pd
import numpy as np
import os
import joblib
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier,AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

In [4]:
def load_data():
    """Load the preprocessed dataset and split it into features and target.

    Reads ``data/final_data.csv``. If there is no column named exactly
    ``'Disease'``, a column whose name matches it case-insensitively is
    renamed to ``'Disease'``; ``'disease'`` and then ``'DISEASE'`` are
    preferred when several columns match.

    Returns
    -------
    tuple
        ``(x, y)`` where ``x`` is the DataFrame without the ``'Disease'``
        column and ``y`` is the ``'Disease'`` Series.

    Raises
    ------
    KeyError
        If no column matches ``'Disease'`` in any letter case.
    """
    data = pd.read_csv("data/final_data.csv")

    if 'Disease' not in data.columns:
        candidates = [col for col in data.columns if str(col).casefold() == 'disease']
        if not candidates:
            raise KeyError("'Disease' column not found in any variation")
        # Keep the historical lookup order first, then accept any other casing.
        chosen = next(
            (name for name in ('disease', 'DISEASE') if name in candidates),
            candidates[0],
        )
        data = data.rename(columns={chosen: 'Disease'})

    x = data.drop('Disease', axis=1)
    y = data['Disease']
    return x, y

In [5]:
from sklearn.exceptions import UndefinedMetricWarning
import warnings

def evaluate_model(model, x_train, x_test, y_train, y_test):
    """Fit ``model`` on the training split and score it on the test split.

    Returns a dict with keys ``'accuracy'``, ``'precision'``, ``'recall'``
    and ``'f1'``. Precision, recall and F1 are weighted averages computed
    with ``zero_division=0``; ``UndefinedMetricWarning`` is silenced while
    the scores are computed.
    """
    model.fit(x_train, y_train)
    predicted = model.predict(x_test)

    # Scorers that share the same averaging / zero-division settings.
    weighted_scorers = {
        'precision': precision_score,
        'recall': recall_score,
        'f1': f1_score,
    }

    with warnings.catch_warnings():
        warnings.simplefilter("ignore", UndefinedMetricWarning)

        scores = {'accuracy': accuracy_score(y_test, predicted)}
        for key, scorer in weighted_scorers.items():
            scores[key] = scorer(y_test, predicted, average='weighted', zero_division=0)

    return scores

In [6]:
def get_models(random_state=42):
    """Build the classifiers to compare, keyed by display name.

    Parameters
    ----------
    random_state : int or None, default 42
        Seed passed to the estimators constructed here that accept one
        (random forest, gradient boosting, AdaBoost, decision tree) so that
        repeated runs give the same models. Pass ``None`` to leave them
        unseeded.

    Returns
    -------
    dict
        Maps model name to a new, unfitted estimator instance.
    """
    return {
        'SVM': SVC(kernel='rbf'),
        'Naive Bayes': GaussianNB(),
        'Logistic Regression': LogisticRegression(),
        'Random Forest': RandomForestClassifier(random_state=random_state),
        'Gradient Boosting': GradientBoostingClassifier(random_state=random_state),
        'AdaBoost': AdaBoostClassifier(random_state=random_state),
        'Decision Tree': DecisionTreeClassifier(random_state=random_state),
        'KNN': KNeighborsClassifier()
    }

In [7]:
def _save_confusion_matrix(cm, labels, title, path):
    """Draw ``cm`` as an annotated heatmap with ``labels`` on both axes and save it to ``path``."""
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=labels, yticklabels=labels, ax=ax)
    ax.set_title(title)
    ax.set_xlabel('Predicted')
    ax.set_ylabel('Actual')
    fig.savefig(path)
    plt.close(fig)


def _save_feature_importance(model, feature_names, title, path):
    """Plot ``model.feature_importances_`` in descending order and save the chart to ``path``."""
    importance_df = pd.DataFrame({
        'Feature': feature_names,
        'Importance': model.feature_importances_
    }).sort_values(by='Importance', ascending=False)

    fig, ax = plt.subplots(figsize=(10, 6))
    sns.barplot(x='Importance', y='Feature', data=importance_df, ax=ax)
    ax.set_title(title)
    fig.savefig(path)
    plt.close(fig)


def evaluate_all_models():
    """Train, evaluate and persist every model for several training-set sizes.

    For each training fraction the data from ``load_data()`` is split with a
    fixed seed and standardized (scaler fitted on the training split only).
    Each model from ``get_models()`` is then fitted and scored through
    ``evaluate_model``, pickled with joblib, and its confusion matrix (plus a
    feature-importance chart when the model exposes ``feature_importances_``)
    is saved under ``public/models/<Model_Name>/``. All metrics are written
    to ``public/model_evaluation_results.csv``.
    """
    x, y = load_data()
    # Full, sorted class list: used for the confusion-matrix rows/columns and
    # the tick labels, so both stay aligned even when a class is absent from
    # a particular test split.
    class_labels = np.unique(y)
    scaler = StandardScaler()
    train_sizes = [0.5, 0.6, 0.7, 0.8, 0.9]
    models = get_models()
    results = []

    os.makedirs('public/models', exist_ok=True)

    for train_size in train_sizes:
        size_pct = train_size * 100

        x_train, x_test, y_train, y_test = train_test_split(
            x, y, train_size=train_size, random_state=42
        )

        x_train_scaled = scaler.fit_transform(x_train)
        x_test_scaled = scaler.transform(x_test)
        # NOTE(review): the models are trained on standardized features, but
        # the fitted scaler itself is not saved anywhere in this function.

        for model_name, model in models.items():
            print(f"Evaluating {model_name} with {size_pct}% training data")

            file_stem = model_name.replace(" ", "_")
            model_folder = os.path.join('public', 'models', file_stem)
            os.makedirs(model_folder, exist_ok=True)

            # evaluate_model() fits the model itself. Fitting only here means
            # the pickled model below is exactly the one the metrics describe.
            metrics = evaluate_model(model, x_train_scaled, x_test_scaled, y_train, y_test)

            # NOTE(review): size_pct is a float, so the literal ".0" yields
            # names such as "SVM_model_50.0.0.pkl". Kept unchanged because
            # existing consumers may depend on these paths.
            model_filename = os.path.join(
                model_folder,
                f"{file_stem}_model_{size_pct}.0.pkl"
            )
            joblib.dump(model, model_filename)
            print(f"Saved {model_name} model as {model_filename}")

            results.append({
                'Model': model_name,
                'Training Size': f"{size_pct}%",
                'Accuracy': metrics['accuracy'],
                'Precision': metrics['precision'],
                'Recall': metrics['recall'],
                'F1 Score': metrics['f1']
            })

            y_pred = model.predict(x_test_scaled)
            cm = confusion_matrix(y_test, y_pred, labels=class_labels)
            _save_confusion_matrix(
                cm,
                class_labels,
                f'Confusion Matrix for {model_name} (Train Size: {size_pct}%)',
                os.path.join(
                    model_folder,
                    f"{file_stem}_confusion_matrix_{size_pct}.png"
                ),
            )

            if hasattr(model, 'feature_importances_'):
                _save_feature_importance(
                    model,
                    x.columns,
                    f'Feature Importance for {model_name} (Train Size: {size_pct}%)',
                    os.path.join(
                        model_folder,
                        f"{file_stem}_feature_importance_{size_pct}.png"
                    ),
                )

    results_df = pd.DataFrame(results).sort_values(by=['Model', 'Training Size'])
    results_df.to_csv('public/model_evaluation_results.csv', index=False)

    print("Model evaluation results saved to public/model_evaluation_results.csv")
    print("All models evaluated and saved successfully.")



In [8]:
# Run the full pipeline: train, evaluate and save every model for each training-set size.
evaluate_all_models()

Evaluating SVM with 50.0% training data
Saved SVM model as public\models\SVM\SVM_model_50.0.0.pkl
Evaluating Naive Bayes with 50.0% training data
Saved Naive Bayes model as public\models\Naive_Bayes\Naive_Bayes_model_50.0.0.pkl
Evaluating Logistic Regression with 50.0% training data
Saved Logistic Regression model as public\models\Logistic_Regression\Logistic_Regression_model_50.0.0.pkl
Evaluating Random Forest with 50.0% training data
Saved Random Forest model as public\models\Random_Forest\Random_Forest_model_50.0.0.pkl
Evaluating Gradient Boosting with 50.0% training data
Saved Gradient Boosting model as public\models\Gradient_Boosting\Gradient_Boosting_model_50.0.0.pkl
Evaluating AdaBoost with 50.0% training data
Saved AdaBoost model as public\models\AdaBoost\AdaBoost_model_50.0.0.pkl
Evaluating Decision Tree with 50.0% training data
Saved Decision Tree model as public\models\Decision_Tree\Decision_Tree_model_50.0.0.pkl
Evaluating KNN with 50.0% training data
Saved KNN model as pub