In [2]:
import pandas as pd
import numpy as np

def preprocess(data, min_occurrence_percentage=5):
    """One-hot encode symptoms and keep only the sufficiently frequent ones.

    Every column of ``data`` other than ``'Disease'`` is treated as a symptom
    slot whose cells hold a symptom label (or NaN when the slot is unused).
    The result has one 0/1 indicator column per distinct symptom, set to 1
    when that symptom appears in any slot of the row.

    Parameters
    ----------
    data : pandas.DataFrame
        Input table containing a ``'Disease'`` column plus symptom-slot
        columns. It is not modified.
    min_occurrence_percentage : float, default 5
        A symptom is kept only if it is present in at least this percentage
        of rows.

    Returns
    -------
    pandas.DataFrame
        ``'Disease'`` followed by the retained indicator columns, in sorted
        symptom order.

    Raises
    ------
    KeyError
        If ``data`` has no ``'Disease'`` column.
    """
    # Pick symptom columns by name rather than by position, so the label
    # column is never encoded as a symptom (and no symptom column is skipped)
    # when 'Disease' is not the first column.
    symptom_cols = [col for col in data.columns if col != 'Disease']
    symptoms = data[symptom_cols]

    all_symptoms = set()
    for col in symptom_cols:
        all_symptoms.update(symptoms[col].dropna().unique())

    symptom_dict = {'Disease': data['Disease']}

    for symptom in sorted(all_symptoms):
        # One vectorised comparison across all slots; NaN cells compare False.
        symptom_dict[symptom] = symptoms.eq(symptom).any(axis=1).astype(int)

    symptom_df = pd.DataFrame(symptom_dict)

    # The mean of a 0/1 column is the fraction of rows with the symptom.
    symptom_occurrences = symptom_df.drop('Disease', axis=1).mean() * 100
    frequent_symptoms = symptom_occurrences[symptom_occurrences >= min_occurrence_percentage].index
    reduced_df = symptom_df[['Disease'] + list(frequent_symptoms)]

    print("\nDimensionality Reduction Summary:")
    print(f"Original number of symptoms: {len(all_symptoms)}")
    print(f"Number of symptoms after reduction: {len(frequent_symptoms)}")
    print(f"Symptoms removed: {len(all_symptoms) - len(frequent_symptoms)}")

    return reduced_df


# Raw symptom table -> reduced 0/1 indicator table, saved for the later cells.
data = pd.read_csv("data/dataset.csv")
processed_df = preprocess(data, min_occurrence_percentage=5)
processed_df.to_csv("data/final_data.csv", index=False)

# Every column except 'Disease' is a feature, hence the "- 1".
print(
    "\nFinal Dataset Info:",
    f"Number of samples: {len(processed_df)}",
    f"Number of features: {len(processed_df.columns) - 1}",
    sep="\n",
)


Dimensionality Reduction Summary:
Original number of symptoms: 131
Number of symptoms after reduction: 31
Symptoms removed: 100

Final Dataset Info:
Number of samples: 4920
Number of features: 31


In [3]:
import pandas as pd
import numpy as np
import os
import joblib
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier,AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

In [4]:
def load_data(path="data/final_data.csv"):
    """Load the processed dataset and split it into features and labels.

    Parameters
    ----------
    path : str or path-like, default "data/final_data.csv"
        CSV file to read. It must contain a label column named ``Disease``;
        the name is matched ignoring case and surrounding whitespace.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.Series)
        ``x`` holds every column except ``'Disease'``; ``y`` is the
        ``'Disease'`` column.

    Raises
    ------
    KeyError
        If no column matches ``Disease`` under the relaxed comparison.
    """
    data = pd.read_csv(path)
    if 'Disease' not in data.columns:
        # Accept any casing / stray whitespace of the label column name.
        candidates = [
            col for col in data.columns
            if isinstance(col, str) and col.strip().casefold() == 'disease'
        ]
        if not candidates:
            raise KeyError("'Disease' column not found in any variation")
        # Prefer 'disease', then 'DISEASE', when several spellings coexist
        # (the order in which these two spellings have always been tried).
        label_col = next(
            (name for name in ('disease', 'DISEASE') if name in candidates),
            candidates[0],
        )
        data = data.rename(columns={label_col: 'Disease'})

    x = data.drop('Disease', axis=1)
    y = data['Disease']
    return x, y