In [6]:
import pandas as pd
import numpy as np
import os
import joblib
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

print("‚úÖ Libraries imported.")
print("üîÅ Loading data...")

main_df = pd.read_csv("../data/dataset.csv")
severity_df = pd.read_csv("../data/Symptom-severity.csv")
desc_df = pd.read_csv("../data/symptom_Description.csv")
precaution_df = pd.read_csv("../data/symptom_precaution.csv")

print("‚úÖ Data loaded.")
print("Main dataset shape:", main_df.shape)

# Replace NaNs with 'None'
main_df.fillna('None', inplace=True)

# Combine symptoms into a list
symptom_cols = main_df.columns[1:]
main_df["Symptoms"] = main_df[symptom_cols].values.tolist()
main_df["Symptoms"] = main_df["Symptoms"].apply(lambda x: [i.strip() for i in x if i != 'None'])

# Features and labels
X_raw = main_df["Symptoms"]
y = main_df["Disease"]

# Encode symptoms using MultiLabelBinarizer
mlb = MultiLabelBinarizer()
X = mlb.fit_transform(X_raw)

print("üß¨ Total unique symptoms:", len(mlb.classes_))
print("üìè Input shape (X):", X.shape)
print("üìè Output shape (y):", y.shape)

print("üîß Training model...")

model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X, y)

y_pred = model.predict(X)
accuracy = accuracy_score(y, y_pred)

print("‚úÖ Accuracy:", accuracy)
print("üìä Classification Report:\n")
print(classification_report(y, y_pred))

# Create models folder if it doesn't exist
os.makedirs("../models", exist_ok=True)

joblib.dump(model, "../models/model.pkl")
joblib.dump(mlb, "../models/mlb.pkl")

print("‚úÖ Model and encoder saved in models/")


‚úÖ Libraries imported.
üîÅ Loading data...
‚úÖ Data loaded.
Main dataset shape: (4920, 18)
üß¨ Total unique symptoms: 131
üìè Input shape (X): (4920, 131)
üìè Output shape (y): (4920,)
üîß Training model...
‚úÖ Accuracy: 1.0
üìä Classification Report:

                                         precision    recall  f1-score   support

(vertigo) Paroymsal  Positional Vertigo       1.00      1.00      1.00       120
                                   AIDS       1.00      1.00      1.00       120
                                   Acne       1.00      1.00      1.00       120
                    Alcoholic hepatitis       1.00      1.00      1.00       120
                                Allergy       1.00      1.00      1.00       120
                              Arthritis       1.00      1.00      1.00       120
                       Bronchial Asthma       1.00      1.00      1.00       120
                   Cervical spondylosis       1.00      1.00      1.00       120
          