In [110]:
# Importing Libraries
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.ensemble import RandomForestClassifier

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [155]:
# Load CSV for disease data. 
disease_df = pd.read_csv('./disease_and_symptoms_data/DiseaseAndSymptoms.csv')
disease_df.head()

Unnamed: 0,Disease,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,Symptom_8,Symptom_9,Symptom_10,Symptom_11,Symptom_12,Symptom_13,Symptom_14,Symptom_15,Symptom_16,Symptom_17
0,Fungal infection,itching,skin_rash,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,
1,Fungal infection,skin_rash,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,,
2,Fungal infection,itching,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,,
3,Fungal infection,itching,skin_rash,dischromic _patches,,,,,,,,,,,,,,
4,Fungal infection,itching,skin_rash,nodal_skin_eruptions,,,,,,,,,,,,,,


In [156]:
disease_df.drop(columns=['Symptom_11', 'Symptom_12','Symptom_13','Symptom_14','Symptom_15','Symptom_16','Symptom_17'], inplace=True)

In [113]:
# Determine the number of null values in each symptom column
disease_df.isna().sum()

Disease          0
Symptom_1        0
Symptom_2        0
Symptom_3        0
Symptom_4      348
Symptom_5     1206
Symptom_6     1986
Symptom_7     2652
Symptom_8     2976
Symptom_9     3228
Symptom_10    3408
dtype: int64

In [114]:
# Determine the number of unique values in each column.
unique_counts = disease_df.nunique()
unique_counts

Disease       41
Symptom_1     34
Symptom_2     48
Symptom_3     54
Symptom_4     50
Symptom_5     38
Symptom_6     32
Symptom_7     26
Symptom_8     21
Symptom_9     22
Symptom_10    21
dtype: int64

In [169]:
# Split our preprocessed data into our features and target arrays
y = disease_df.Disease.values.reshape(-1, 1)
X = disease_df.drop("Disease", axis=1)
feature_names = X.columns

# Replace NaN values with "Unknown"
X = X.fillna("Unknown")

# Helper function to strip whitespace from dataset
def strip_whitespace(x):
    if isinstance(x, str):
        return x.strip()
    else:
        return x

# Strip the whitespace from each symptom value in each column - most values had whitespace e.g. " skin_rash"
X = X.apply(lambda col: col.map(strip_whitespace) if col.dtype == 'object' else col)


In [170]:
# Review the y variable Series
y[:8]

array([['Fungal infection'],
       ['Fungal infection'],
       ['Fungal infection'],
       ['Fungal infection'],
       ['Fungal infection'],
       ['Fungal infection'],
       ['Fungal infection'],
       ['Fungal infection']], dtype=object)

In [171]:
# Review the X variable DataFrame
X.head()

Unnamed: 0,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,Symptom_8,Symptom_9,Symptom_10
0,itching,skin_rash,nodal_skin_eruptions,dischromic _patches,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown
1,skin_rash,nodal_skin_eruptions,dischromic _patches,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown
2,itching,nodal_skin_eruptions,dischromic _patches,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown
3,itching,skin_rash,dischromic _patches,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown
4,itching,skin_rash,nodal_skin_eruptions,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown


In [172]:
# Create a dictionary to store mappings of encoded labels to original values
label_mappings = {}
X_encoded = X_original.copy()

# Initialize a LabelEncoder object
label_encoder = LabelEncoder()

# Concatenate all Symptom_x columns into a single series
symptoms_series = X.stack().reset_index(drop=True)

# Drop NaN values
symptoms_series.dropna(inplace=True)

# Fit LabelEncoder on the concatenated series
label_encoder.fit(symptoms_series)

# Encode all values in the DataFrame
for column in X.columns:
    X_encoded[column] = X[column].apply(lambda x: label_encoder.transform([x])[0] if pd.notnull(x) else x)

    label_mappings[column] = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))

print(X)

# # Create a dictionary to store mappings of encoded labels to original values
# label_mappings = {}

# display(X.columns)

# # Iterate over each column in the DataFrame
# for column in X.columns:
#     if X[column].dtype == 'object':  # Check if the column contains string values
#         # Replace NaN values with a placeholder, for example, 'Unknown'
#         X[column].fillna('Unknown', inplace=True)
#         # Fit label encoder and transform the labels in the column
#         X[column] = label_encoder.fit_transform(X[column])

#         print(X[column])
#         # Display the original labels associated with the encoded values
#         # Store the mappings of encoded labels to original values
#         label_mappings[column] = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))
#     else:
#         print(X[column].dtype)



                Symptom_1             Symptom_2             Symptom_3  \
0                 itching             skin_rash  nodal_skin_eruptions   
1               skin_rash  nodal_skin_eruptions   dischromic _patches   
2                 itching  nodal_skin_eruptions   dischromic _patches   
3                 itching             skin_rash   dischromic _patches   
4                 itching             skin_rash  nodal_skin_eruptions   
...                   ...                   ...                   ...   
4915             vomiting              headache                nausea   
4916            skin_rash    pus_filled_pimples            blackheads   
4917  burning_micturition    bladder_discomfort   foul_smell_of urine   
4918            skin_rash            joint_pain          skin_peeling   
4919            skin_rash            high_fever               blister   

                     Symptom_4             Symptom_5           Symptom_6  \
0          dischromic _patches               Un

In [173]:
# Print label mappings
for column, mappings in label_mappings.items():
    print(f"Column: {column}")
    for original_value, encoded_value in mappings.items():
        print(f"Encoded: {encoded_value} -> Original: {original_value}")
    print()

Column: Symptom_1
Encoded: 0 -> Original: Unknown
Encoded: 1 -> Original: abdominal_pain
Encoded: 2 -> Original: abnormal_menstruation
Encoded: 3 -> Original: acidity
Encoded: 4 -> Original: acute_liver_failure
Encoded: 5 -> Original: altered_sensorium
Encoded: 6 -> Original: anxiety
Encoded: 7 -> Original: back_pain
Encoded: 8 -> Original: belly_pain
Encoded: 9 -> Original: blackheads
Encoded: 10 -> Original: bladder_discomfort
Encoded: 11 -> Original: blister
Encoded: 12 -> Original: bloody_stool
Encoded: 13 -> Original: blurred_and_distorted_vision
Encoded: 14 -> Original: breathlessness
Encoded: 15 -> Original: brittle_nails
Encoded: 16 -> Original: bruising
Encoded: 17 -> Original: burning_micturition
Encoded: 18 -> Original: chest_pain
Encoded: 19 -> Original: chills
Encoded: 20 -> Original: cold_hands_and_feets
Encoded: 21 -> Original: constipation
Encoded: 22 -> Original: continuous_feel_of_urine
Encoded: 23 -> Original: continuous_sneezing
Encoded: 24 -> Original: cough
Encode

In [174]:
# Initialize a LabelEncoder object for target variable y
label_encoder_y = LabelEncoder()

# Fit label encoder and transform the target variable
y_encoded = label_encoder_y.fit_transform(y)

# Create a dictionary to store mappings of encoded labels to original values for target variable y
label_mappings_y = dict(zip(label_encoder_y.classes_, label_encoder_y.transform(label_encoder_y.classes_)))

# Print label mappings for target variable y
print("Target variable (y) label mappings:")
for original_value, encoded_value in label_mappings_y.items():
    print(f"Encoded: {encoded_value} -> Original: {original_value}")

Target variable (y) label mappings:
Encoded: 0 -> Original: (vertigo) Paroymsal  Positional Vertigo
Encoded: 1 -> Original: AIDS
Encoded: 2 -> Original: Acne
Encoded: 3 -> Original: Alcoholic hepatitis
Encoded: 4 -> Original: Allergy
Encoded: 5 -> Original: Arthritis
Encoded: 6 -> Original: Bronchial Asthma
Encoded: 7 -> Original: Cervical spondylosis
Encoded: 8 -> Original: Chicken pox
Encoded: 9 -> Original: Chronic cholestasis
Encoded: 10 -> Original: Common Cold
Encoded: 11 -> Original: Dengue
Encoded: 12 -> Original: Diabetes 
Encoded: 13 -> Original: Dimorphic hemmorhoids(piles)
Encoded: 14 -> Original: Drug Reaction
Encoded: 15 -> Original: Fungal infection
Encoded: 16 -> Original: GERD
Encoded: 17 -> Original: Gastroenteritis
Encoded: 18 -> Original: Heart attack
Encoded: 19 -> Original: Hepatitis B
Encoded: 20 -> Original: Hepatitis C
Encoded: 21 -> Original: Hepatitis D
Encoded: 22 -> Original: Hepatitis E
Encoded: 23 -> Original: Hypertension 
Encoded: 24 -> Original: Hypert

  y = column_or_1d(y, warn=True)


In [175]:
X_encoded.isna().sum()

Symptom_1     0
Symptom_2     0
Symptom_3     0
Symptom_4     0
Symptom_5     0
Symptom_6     0
Symptom_7     0
Symptom_8     0
Symptom_9     0
Symptom_10    0
dtype: int64

In [176]:
# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y_encoded, random_state=1, stratify=y_encoded)

In [193]:
len(X_encoded)

4920

In [177]:
X_train.head()

Unnamed: 0,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,Symptom_8,Symptom_9,Symptom_10
2023,17,41,22,0,0,0,0,0,0,0
4559,113,100,27,29,0,0,0,0,0,0
559,53,113,39,118,43,122,26,0,0,0
976,19,113,43,42,68,29,65,0,0,0
3761,39,25,16,71,107,105,80,0,0,0


In [178]:
X_encoded.describe()

Unnamed: 0,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,Symptom_8,Symptom_9,Symptom_10
count,4920.0,4920.0,4920.0,4920.0,4920.0,4920.0,4920.0,4920.0,4920.0,4920.0
mean,58.607317,70.257317,56.190244,57.726829,44.371951,32.80122,22.134146,21.159756,23.289024,22.386585
std,34.510621,38.256653,34.890652,38.293112,36.739766,35.920333,32.287787,34.238679,38.407989,38.796236
min,3.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,39.0,39.0,32.0,26.0,1.0,0.0,0.0,0.0,0.0,0.0
50%,53.0,73.0,43.0,57.0,42.0,26.0,0.0,0.0,0.0,0.0
75%,93.0,113.0,87.0,92.0,64.0,58.0,38.5,35.0,46.0,38.0
max,122.0,122.0,122.0,122.0,122.0,122.0,121.0,121.0,121.0,121.0


In [179]:
# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [180]:
X_train_scaled

array([[-1.20850497, -0.75995656, -0.9841854 , ..., -0.61616305,
        -0.60602483, -0.57839536],
       [ 1.57471383,  0.78214135, -0.84067051, ..., -0.61616305,
        -0.60602483, -0.57839536],
       [-0.16479792,  1.12192563, -0.49623479, ..., -0.61616305,
        -0.60602483, -0.57839536],
       ...,
       [ 0.99487658, -0.42017227,  1.62778547, ...,  1.0784041 ,
         1.27412694, -0.3982288 ],
       [ 1.57471383, -0.60313304,  0.04912176, ..., -0.61616305,
        -0.60602483, -0.57839536],
       [ 0.99487658, -0.70768205, -1.29991814, ..., -0.61616305,
        -0.60602483, -0.57839536]])

In [181]:
 # Create a random forest classifier
rf_model = RandomForestClassifier(n_estimators=500, random_state=78)

In [182]:
 # Fitting the model
rf_model = rf_model.fit(X_train_scaled, y_train)

In [183]:
y_train

array([38, 17, 28, ..., 11, 33, 27])

In [184]:
 # Making predictions using the testing data
predictions = rf_model.predict(X_test_scaled)

In [185]:
# Calculating the accuracy score
acc_score = accuracy_score(y_test, predictions)

In [186]:
 # Calculating the confusion matrix
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix
cm_df = pd.DataFrame(cm)

# Display the confusion matrix DataFrame
print("Confusion matrix DataFrame:\n", cm_df)

Confusion matrix DataFrame:
     0   1   2   3   4   5   6   7   8   9   ...  31  32  33  34  35  36  37  \
0   30   0   0   0   0   0   0   0   0   0  ...   0   0   0   0   0   0   0   
1    0  30   0   0   0   0   0   0   0   0  ...   0   0   0   0   0   0   0   
2    0   0  30   0   0   0   0   0   0   0  ...   0   0   0   0   0   0   0   
3    0   0   0  30   0   0   0   0   0   0  ...   0   0   0   0   0   0   0   
4    0   0   0   0  30   0   0   0   0   0  ...   0   0   0   0   0   0   0   
5    0   0   0   0   0  30   0   0   0   0  ...   0   0   0   0   0   0   0   
6    0   0   0   0   0   0  30   0   0   0  ...   0   0   0   0   0   0   0   
7    0   0   0   0   0   0   0  30   0   0  ...   0   0   0   0   0   0   0   
8    0   0   0   0   0   0   0   0  30   0  ...   0   0   0   0   0   0   0   
9    0   0   0   0   0   0   0   0   0  30  ...   0   0   0   0   0   0   0   
10   0   0   0   0   0   0   0   0   0   0  ...   0   0   0   0   0   0   0   
11   0   0   0   0   0 

In [187]:
# Displaying results
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Accuracy Score : 1.0
Classification Report
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        30
           1       1.00      1.00      1.00        30
           2       1.00      1.00      1.00        30
           3       1.00      1.00      1.00        30
           4       1.00      1.00      1.00        30
           5       1.00      1.00      1.00        30
           6       1.00      1.00      1.00        30
           7       1.00      1.00      1.00        30
           8       1.00      1.00      1.00        30
           9       1.00      1.00      1.00        30
          10       1.00      1.00      1.00        30
          11       1.00      1.00      1.00        30
          12       1.00      1.00      1.00        30
          13       1.00      1.00      1.00        30
          14       1.00      1.00      1.00        30
          15       1.00      1.00      1.00        30
          16       1.00      1.00     

In [188]:
# Define a function for predicting disease based on symptoms
def predict_disease_with_input(symptoms, label_encoder, rf_model):
    # Create a DataFrame with the user input symptoms
    # data = pd.DataFrame({'Symptom_1': [Symptom_1], 'Symptom_2': [Symptom_2], 'Symptom_3': [Symptom_3]})

    # display(data)

    
    # Fill the list with "Unknown" up to 10 elements
    filled_symptoms = symptoms + ["Unknown"] * (10 - len(symptoms))
    display(filled_symptoms)
    
    # Ensure consistency with label encoding
    encoded_symptoms = label_encoder.transform(filled_symptoms)
    encoded_symptoms = encoded_symptoms.reshape(1, -1)

    # data_encoded = data.apply(lambda x: label_encoder.transform(x))

    display(encoded_symptoms)
    
    # Make the prediction using the trained model
    prediction = rf_model.predict(encoded_symptoms)

    print(f"Prediction: {prediction}")
    
    # Convert the encoded prediction back to the original label
    predicted_disease = label_encoder_y.inverse_transform(prediction)
    
    # Return the predicted disease
    return predicted_disease  # Assuming it's a single prediction


In [191]:
# Get user input for symptoms
symptoms = ["itching", "skin_rash", "dischromic _patches"]

predicted_disease = predict_disease_with_input(symptoms, label_encoder, rf_model)
print(f"Predicted disease: {predicted_disease}")

['itching',
 'skin_rash',
 'dischromic _patches',
 'Unknown',
 'Unknown',
 'Unknown',
 'Unknown',
 'Unknown',
 'Unknown',
 'Unknown']

array([[53, 93, 30,  0,  0,  0,  0,  0,  0,  0]])

Prediction: [40]
Predicted disease: ['hepatitis A']


In [152]:
type(X.iloc[0]["Symptom_5"])

float

In [None]:
STARTING OVER...

In [230]:
X_new = disease_df.drop("Disease", axis=1)

# Helper function to strip whitespace from dataset
def strip_whitespace(x):
    if isinstance(x, str):
        return x.strip()
    else:
        return x

# Strip the whitespace from each symptom value in each column - most values had whitespace e.g. " skin_rash"
X_new = X_new.apply(lambda col: col.map(strip_whitespace) if col.dtype == 'object' else col)

X_new.fillna("Unknown", inplace=True)

# # Concatenate all Symptom_x columns into a single series
# symptoms_series = X_new.stack().reset_index(drop=True)

# # Drop NaN values
# symptoms_series.dropna(inplace=True)

# # Create separate columns for each symptom
# for symptom in symptoms_series:
#     X_new[symptom] = X_new['Symptom_1'].apply(lambda x: 1 if symptom in x else 0)

# X_new

# Get list of unique symptoms
unique_symptoms = list(set(symptom for col in X_new.columns for symptom in X_new[col]))
filtered_symptoms = [symptom for symptom in unique_symptoms if symptom != "Unknown"]


# Create separate columns for each symptom
for symptom in filtered_symptoms:
    X_new[symptom] = X_new.apply(lambda row: 1 if symptom in row.values else 0, axis=1)

# Drop the original symptom columns
X_new.drop(X_new.columns[X_new.columns.str.startswith('Symptom_')], axis=1, inplace=True)
X_new

  X_new[symptom] = X_new.apply(lambda row: 1 if symptom in row.values else 0, axis=1)
  X_new[symptom] = X_new.apply(lambda row: 1 if symptom in row.values else 0, axis=1)
  X_new[symptom] = X_new.apply(lambda row: 1 if symptom in row.values else 0, axis=1)
  X_new[symptom] = X_new.apply(lambda row: 1 if symptom in row.values else 0, axis=1)
  X_new[symptom] = X_new.apply(lambda row: 1 if symptom in row.values else 0, axis=1)
  X_new[symptom] = X_new.apply(lambda row: 1 if symptom in row.values else 0, axis=1)
  X_new[symptom] = X_new.apply(lambda row: 1 if symptom in row.values else 0, axis=1)
  X_new[symptom] = X_new.apply(lambda row: 1 if symptom in row.values else 0, axis=1)
  X_new[symptom] = X_new.apply(lambda row: 1 if symptom in row.values else 0, axis=1)
  X_new[symptom] = X_new.apply(lambda row: 1 if symptom in row.values else 0, axis=1)
  X_new[symptom] = X_new.apply(lambda row: 1 if symptom in row.values else 0, axis=1)
  X_new[symptom] = X_new.apply(lambda row: 1 if sympto

Unnamed: 0,bladder_discomfort,indigestion,fast_heart_rate,dehydration,dizziness,spinning_movements,yellowing_of_eyes,cramps,restlessness,patches_in_throat,...,prominent_veins_on_calf,cold_hands_and_feets,dischromic _patches,pain_behind_the_eyes,muscle_wasting,acidity,muscle_weakness,swollen_legs,visual_disturbances,excessive_hunger
0,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4915,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4916,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4917,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4918,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [231]:
unique_symptoms = list(set(symptom for col in X_new.columns for symptom in X_new[col]))
unique_symptoms

[0, 1]

In [232]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_new, y_encoded, test_size=0.2, random_state=42)

# Train the random forest model
rf_model = RandomForestClassifier()
rf_model.fit(X_train, y_train)

# Make predictions
predictions = rf_model.predict(X_test)

In [233]:
# Calculating the accuracy score
acc_score = accuracy_score(y_test, predictions)

In [234]:
 # Calculating the confusion matrix
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix
cm_df = pd.DataFrame(cm)

# Display the confusion matrix DataFrame
print("Confusion matrix DataFrame:\n", cm_df)

Confusion matrix DataFrame:
     0   1   2   3   4   5   6   7   8   9   ...  31  32  33  34  35  36  37  \
0   18   0   0   0   0   0   0   0   0   0  ...   0   0   0   0   0   0   0   
1    0  30   0   0   0   0   0   0   0   0  ...   0   0   0   0   0   0   0   
2    0   0  24   0   0   0   0   0   0   0  ...   0   0   0   0   0   0   0   
3    0   0   0  25   0   0   0   0   0   0  ...   0   0   0   0   0   0   0   
4    0   0   0   0  24   0   0   0   0   0  ...   0   0   0   0   0   0   0   
5    0   0   0   0   0  23   0   0   0   0  ...   0   0   0   0   0   0   0   
6    0   0   0   0   0   0  33   0   0   0  ...   0   0   0   0   0   0   0   
7    0   0   0   0   0   0   0  23   0   0  ...   0   0   0   0   0   0   0   
8    0   0   0   0   0   0   0   0  21   0  ...   0   0   0   0   0   0   0   
9    0   0   0   0   0   0   0   0   0  15  ...   0   0   0   0   0   0   0   
10   0   0   0   0   0   0   0   0   0   0  ...   0   0   0   0   0   0   0   
11   0   0   0   0   0 

In [235]:
# Displaying results
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Accuracy Score : 1.0
Classification Report
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        18
           1       1.00      1.00      1.00        30
           2       1.00      1.00      1.00        24
           3       1.00      1.00      1.00        25
           4       1.00      1.00      1.00        24
           5       1.00      1.00      1.00        23
           6       1.00      1.00      1.00        33
           7       1.00      1.00      1.00        23
           8       1.00      1.00      1.00        21
           9       1.00      1.00      1.00        15
          10       1.00      1.00      1.00        23
          11       1.00      1.00      1.00        26
          12       1.00      1.00      1.00        21
          13       1.00      1.00      1.00        29
          14       1.00      1.00      1.00        24
          15       1.00      1.00      1.00        19
          16       1.00      1.00     

In [236]:
# Creating a function to predict disease based off user-provided symptoms
def predict_disease(input_symptoms, rf_model, unique_symptoms, label_encoder):
    # Create an empty DataFrame with columns for each symptom
    input_data = pd.DataFrame(columns=unique_symptoms, dtype=int)
    
    # Populate the DataFrame with symptom values
    for symptom in unique_symptoms:
        if symptom in input_symptoms:
            input_data.loc[0, symptom] = 1  # Set the value to 1 if the symptom is present
        else:
            input_data.loc[0, symptom] = 0  # Set the value to 0 if the symptom is not present
    
    # Make predictions using the random forest model
    predictions = rf_model.predict(input_data)

    # Convert encoded disease to string form
    return label_encoder.inverse_transform(predictions)[0]


In [237]:
# Taking user input (symptoms) to predict disease - example 1
predict_disease(["itching", "skin_rash"], rf_model, filtered_symptoms, label_encoder_y)

'Fungal infection'

In [239]:
# Taking user input (symptoms) to predict disease - example 2
predict_disease(["shivering", "chills"], rf_model, filtered_symptoms, label_encoder_y)

'Allergy'

In [244]:
# Taking user input (symptoms) to predict disease - example 3
predict_disease(["itching","vomiting","fatigue"], rf_model, filtered_symptoms, label_encoder_y)

'Jaundice'

In [238]:
# Exporting symptoms .csv
X_new.to_csv('disease_w_dummies_df.csv', index=False)