In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer, LabelEncoder
from sklearn.neighbors import KNeighborsClassifier


In [2]:
# Load the dataset (make sure the CSV is in your working directory)
df = pd.read_csv("C:/Users/Nidhi/Downloads/DiseaseAndSymptoms.csv")

# Display first 5 rows
df.head()


Unnamed: 0,Disease,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,Symptom_8,Symptom_9,Symptom_10,Symptom_11,Symptom_12,Symptom_13,Symptom_14,Symptom_15,Symptom_16,Symptom_17
0,Fungal infection,itching,skin_rash,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,
1,Fungal infection,skin_rash,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,,
2,Fungal infection,itching,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,,
3,Fungal infection,itching,skin_rash,dischromic _patches,,,,,,,,,,,,,,
4,Fungal infection,itching,skin_rash,nodal_skin_eruptions,,,,,,,,,,,,,,


In [3]:
# Combine symptoms into one column (as list)
df['all_symptoms'] = df[[f'Symptom_{i}' for i in range(1, 18)]].values.tolist()
df['all_symptoms'] = df['all_symptoms'].apply(lambda x: [s.strip().lower() for s in x if isinstance(s, str)])


In [4]:
# Binary encode symptoms
mlb = MultiLabelBinarizer()
X = mlb.fit_transform(df['all_symptoms'])

# Encode disease names
y = df['Disease']
le = LabelEncoder()
y_encoded = le.fit_transform(y)


In [5]:
model = KNeighborsClassifier(n_neighbors=5)
model.fit(X, y_encoded)


In [7]:
def predict_disease(user_symptoms, top_n=3):
    input_vector = np.zeros(len(mlb.classes_))
    unknown = []

    for symptom in user_symptoms:
        symptom = symptom.strip().lower()
        if symptom in mlb.classes_:
            idx = mlb.classes_.tolist().index(symptom)
            input_vector[idx] = 1
        else:
            unknown.append(symptom)

    distances, indices = model.kneighbors([input_vector])
    predicted_labels = y_encoded[indices[0]]

    scores = {}
    for label in predicted_labels:
        disease = le.inverse_transform([label])[0]
        scores[disease] = scores.get(disease, 0) + 1

    sorted_scores = sorted(scores.items(), key=lambda x: x[1], reverse=True)[:top_n]
    result = [{"Disease": d, "Match Score": f"{s}/5"} for d, s in sorted_scores]

    if unknown:
        result.append({"Note": f"Some symptoms were not recognized: {', '.join(unknown)}"})

    return result


In [8]:
# Try known or unknown symptoms
predict_disease(["itching", "skin rash", "nodal skin eruptions"])


[{'Disease': 'Fungal infection', 'Match Score': '5/5'},
 {'Note': 'Some symptoms were not recognized: skin rash, nodal skin eruptions'}]

In [9]:
predict_disease(["vomiting", "fatigue", "nausea"])


[{'Disease': '(vertigo) Paroymsal  Positional Vertigo', 'Match Score': '4/5'},
 {'Disease': 'Hepatitis C', 'Match Score': '1/5'}]

In [10]:
predict_disease(["headache", "ghost feeling", "high bp"])


[{'Disease': 'Paralysis (brain hemorrhage)', 'Match Score': '5/5'},
 {'Note': 'Some symptoms were not recognized: ghost feeling, high bp'}]