In [1]:
import pandas as pd
import numpy as np
import pickle
import dill  # for saving function
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

# Load the training data and prepare features/labels for model selection.
df = pd.read_csv('training_data.csv')

# Drop the 'Unnamed: 133' column. pandas assigns "Unnamed: N" names to
# header-less columns; presumably this one comes from a trailing delimiter
# in the CSV -- TODO confirm against the file.
df = df.drop(columns=['Unnamed: 133'])

# Shuffle the rows. The shuffle is seeded so that the seeded split below
# selects the same rows on every run; an unseeded shuffle would make the
# train/test membership (and therefore the saved model) differ per run.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# Features are every column except the 'prognosis' target.
X = df.drop(columns=['prognosis'])
y = df['prognosis']

# Encode the string target labels as integers; `le` is kept so predictions
# can be mapped back to label names with `le.inverse_transform`.
le = LabelEncoder()
y = le.fit_transform(y)

# NOTE(review): the recorded output of this cell shows 100% accuracy for
# every model on both CV and the held-out split. That can indicate identical
# rows appearing on both sides of the split -- TODO check
# `df.duplicated().sum()` before trusting these scores.

# Hold out 20% of the rows for the final evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Candidate classifiers to compare, keyed by the name shown in the report.
models = {}
models['Random Forest'] = RandomForestClassifier(n_estimators=100, random_state=42)
models['SVM'] = SVC(kernel='linear', random_state=42)
models['KNN'] = KNeighborsClassifier(n_neighbors=5)
models['Logistic Regression'] = LogisticRegression(max_iter=200, random_state=42)
models['Naive Bayes'] = MultinomialNB()

# Mean cross-validated accuracy (cv=5) on the training split, per model name.
result = {}
for name, model in models.items():
    scores = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy')
    mean_accuracy = scores.mean()
    result[name] = mean_accuracy
    print(f"{name} CV Accuracy: {mean_accuracy:.4f}")

# Fit the final classifier on the training split. The linear SVM is chosen
# by hand here; this block does not read the CV scores collected in `result`.
best_model = SVC(kernel='linear', random_state=42).fit(X_train, y_train)

# Predict on the held-out split and report accuracy as a percentage.
y_pred = best_model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
print("Test Accuracy:", test_accuracy * 100)

# Feature column names, in the column order of X used for training.
symptoms = list(X.columns)

# Persist the trained model, the label encoder and the symptoms list with
# pickle, one file per object, written in this order.
artifacts = (
    ('diseases_model.pkl', best_model),
    ('label_encoder.pkl', le),
    ('symptoms_list.pkl', symptoms),
)
for path, payload in artifacts:
    with open(path, 'wb') as f:
        pickle.dump(payload, f)

# # Define and save predict_disease function using dill

# # Create a disease ID -> name mapping
# disease_mapping = dict(zip(range(len(le.classes_)), le.classes_))

# def predict_disease(user_symptoms: list[str]) -> str:
#     """Predict disease from user symptoms (binary encoded input)."""
#     with open('diseases_model.pkl', 'rb') as model_file:
#         model = pickle.load(model_file)

#     with open('label_encoder.pkl', 'rb') as le_file:
#         le = pickle.load(le_file)

#     with open('symptoms_list.pkl', 'rb') as sym_file:
#         symptoms = pickle.load(sym_file)

#     input_data = np.zeros(len(symptoms))
#     for symptom in user_symptoms:
#         if symptom in symptoms:
#             index = symptoms.index(symptom)
#             input_data[index] = 1

#     input_data = input_data.reshape(1, -1)
#     prediction = model.predict(input_data)
#     predicted_disease = le.inverse_transform(prediction)[0]
#     return predicted_disease

# # Save the function using dill (for FastAPI import)
# with open('predict_function.dill', 'wb') as f:
#     dill.dump(predict_disease, f)

Random Forest CV Accuracy: 1.0000
SVM CV Accuracy: 1.0000
KNN CV Accuracy: 1.0000
Logistic Regression CV Accuracy: 1.0000
Naive Bayes CV Accuracy: 1.0000
Test Accuracy: 100.0
