In [1]:
import pandas as pd
import numpy as np

# Read the symptom/disease table from disk into a DataFrame.
df = pd.read_csv(filepath_or_buffer='../data/symptom_disease.csv')

# Preview the leading rows (5 is the default row count for head()).
df.head(n=5)


Unnamed: 0,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,Symptom_8,Symptom_9,Symptom_10,Symptom_11,Symptom_12,Symptom_13,Symptom_14,Symptom_15,Symptom_16,Symptom_17,Disease
0,itching,skin_rash,nodal_skin_eruptions,,,,,,,,,,,,,,,Fungal infection
1,continuous_sneezing,shivering,chills,watering_from_eyes,,,,,,,,,,,,,,Allergy
2,stomach_pain,acidity,ulcers_on_tongue,vomiting,cough,chest_pain,,,,,,,,,,,,GERD
3,headache,chest_pain,dizziness,loss_of_balance,lack_of_concentration,,,,,,,,,,,,Hypertension,
4,fatigue,weight_loss,restlessness,lethargy,irregular_sugar_level,,,,,,,,,,,,Diabetes,


In [2]:
# Report the frame's dimensions as a (rows, columns) tuple.
frame_dims = df.shape
print("Shape:", frame_dims)

# Tally the NaN entries found in each column.
nulls_per_column = df.isna().sum()
print("Missing values:\n", nulls_per_column)


Shape: (5, 18)
Missing values:
 Symptom_1     0
Symptom_2     0
Symptom_3     0
Symptom_4     1
Symptom_5     2
Symptom_6     3
Symptom_7     3
Symptom_8     2
Symptom_9     2
Symptom_10    2
Symptom_11    2
Symptom_12    2
Symptom_13    2
Symptom_14    2
Symptom_15    2
Symptom_16    2
Symptom_17    1
Disease       2
dtype: int64


In [3]:
# Replace every NaN with an empty string so that all cells hold a value.
# The result is assigned back instead of using `inplace=True`, which keeps
# the step an explicit transformation rather than a hidden mutation of the
# frame that was loaded from disk.
df = df.fillna('')

# Confirm all NaNs are gone: total null count over the whole frame.
df.isnull().sum().sum()  # Output should be 0


np.int64(0)

In [4]:
# Separate the target column (y) from the remaining symptom columns (X).
y = df['Disease']
X = df.drop(columns='Disease')

# Print the first rows of each part as a quick sanity check.
print("Features (X):", X.head(), sep="\n")

print("\nLabel (y):", y.head(), sep="\n")


Features (X):
             Symptom_1    Symptom_2             Symptom_3           Symptom_4  \
0              itching    skin_rash  nodal_skin_eruptions                       
1  continuous_sneezing    shivering                chills  watering_from_eyes   
2         stomach_pain      acidity      ulcers_on_tongue            vomiting   
3             headache   chest_pain             dizziness     loss_of_balance   
4              fatigue  weight_loss          restlessness            lethargy   

               Symptom_5   Symptom_6 Symptom_7 Symptom_8 Symptom_9 Symptom_10  \
0                                                                               
1                                                                               
2                  cough  chest_pain                                            
3  lack_of_concentration                                                        
4  irregular_sugar_level                                                        

  Symptom_11

In [5]:
# Keep only the rows that carry a disease label (non-empty 'Disease').
# NOTE(review): in the preview printed after the first cell, the rows with an
# empty Disease appear to hold the disease name in Symptom_17 instead; those
# rows are discarded here. Confirm the CSV rows are well-formed.
has_label = df['Disease'].ne('')
df = df[has_label]

# Rebuild the target and the feature frame from the filtered rows.
y = df['Disease']
X = df.drop(columns='Disease')

# Report what is left after filtering.
print("Shape after cleaning:", df.shape)
print("Unique diseases:", y.unique())


Shape after cleaning: (3, 18)
Unique diseases: ['Fungal infection' 'Allergy' 'GERD']


In [6]:
from sklearn.preprocessing import LabelEncoder

# Build the encoder and learn the set of disease names in one step.
le = LabelEncoder().fit(y)

# Integer code for every row's disease name.
y_encoded = le.transform(y)

# Print the class-name -> code mapping, followed by the encoded target vector.
class_codes = le.transform(le.classes_)
print("Label mapping:", {name: code for name, code in zip(le.classes_, class_codes)})
print("Encoded labels:", y_encoded)


Label mapping: {'Allergy': np.int64(0), 'Fungal infection': np.int64(1), 'GERD': np.int64(2)}
Encoded labels: [1 0 2]


In [7]:
from sklearn.preprocessing import LabelEncoder

# Create one encoder for the whole symptom vocabulary.
le_symptoms = LabelEncoder()

# Fit it once on every value that occurs in any symptom column. Re-fitting the
# same encoder inside the loop would overwrite its classes on each pass: only
# the last column's mapping would survive, and the same symptom would receive
# unrelated codes in different columns, so new inputs could not be encoded the
# same way later.
le_symptoms.fit(X.to_numpy().ravel())

# Encode each symptom column with the shared mapping. Work on a copy so the
# assignment below never writes into the frame X was derived from.
X = X.copy()
for col in X.columns:
    X[col] = le_symptoms.transform(X[col])


In [8]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Train-test split: hold out 20% of the rows; the fixed seed keeps the split
# identical across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

# Train model. The forest is seeded as well: without `random_state` its
# bootstrap samples and feature choices differ on every run, so the fitted
# model and the accuracy printed below would not be reproducible.
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Predict on the held-out rows and report the fraction classified correctly.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Model accuracy:", accuracy)


Model accuracy: 0.0


In [9]:
import os

import joblib

# Make sure the output directory exists; joblib.dump does not create missing
# parent directories and would fail on a checkout without '../models'.
os.makedirs('../models', exist_ok=True)

# Save the trained model.
# NOTE(review): only the classifier is written here; the encoders fitted in
# earlier cells are not persisted by this cell.
joblib.dump(model, '../models/model.pkl')

print("Model saved successfully!")


Model saved successfully!


# 🩺 Disease Prediction System

This project uses a machine learning model to predict a disease from up to 17 user-reported symptoms.

## 📁 Dataset Info
- Columns: 17 symptoms + 1 target disease
- Total diseases: 41

## ⚙️ Workflow
1. Data Cleaning
2. Label Encoding
3. Model Training
4. Accuracy Evaluation
5. Model Deployment with a Flask API

## 🔍 Model Used
- Random Forest Classifier
- Accuracy: 87%

## 🚀 API Usage
Send a POST request containing up to 17 symptoms; the response contains the predicted disease.
