In [22]:
# Importing Libraries
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [48]:
# Load CSV for disease data. 
disease_df = pd.read_csv('./disease_and_symptoms_data/DiseaseAndSymptoms.csv')
disease_df.head()

Unnamed: 0,Disease,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,Symptom_8,Symptom_9,Symptom_10,Symptom_11,Symptom_12,Symptom_13,Symptom_14,Symptom_15,Symptom_16,Symptom_17
0,Fungal infection,itching,skin_rash,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,
1,Fungal infection,skin_rash,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,,
2,Fungal infection,itching,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,,
3,Fungal infection,itching,skin_rash,dischromic _patches,,,,,,,,,,,,,,
4,Fungal infection,itching,skin_rash,nodal_skin_eruptions,,,,,,,,,,,,,,


In [49]:
# Determine the number of null values in each symptom column
disease_df.isna().sum()

Disease          0
Symptom_1        0
Symptom_2        0
Symptom_3        0
Symptom_4      348
Symptom_5     1206
Symptom_6     1986
Symptom_7     2652
Symptom_8     2976
Symptom_9     3228
Symptom_10    3408
Symptom_11    3726
Symptom_12    4176
Symptom_13    4416
Symptom_14    4614
Symptom_15    4680
Symptom_16    4728
Symptom_17    4848
dtype: int64

In [50]:
# Determine the number of unique values in each column.
unique_counts = disease_df.nunique()
unique_counts

Disease       41
Symptom_1     34
Symptom_2     48
Symptom_3     54
Symptom_4     50
Symptom_5     38
Symptom_6     32
Symptom_7     26
Symptom_8     21
Symptom_9     22
Symptom_10    21
Symptom_11    18
Symptom_12    11
Symptom_13     8
Symptom_14     4
Symptom_15     3
Symptom_16     3
Symptom_17     1
dtype: int64

In [51]:
# Split our preprocessed data into our features and target arrays
y = disease_df.Disease.values.reshape(-1, 1)
X = disease_df.drop("Disease", axis=1)
feature_names = X.columns

In [52]:
# Review the y variable Series
y[:5]

array([['Fungal infection'],
       ['Fungal infection'],
       ['Fungal infection'],
       ['Fungal infection'],
       ['Fungal infection']], dtype=object)

In [53]:
# Review the X variable DataFrame
X.head()

Unnamed: 0,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,Symptom_8,Symptom_9,Symptom_10,Symptom_11,Symptom_12,Symptom_13,Symptom_14,Symptom_15,Symptom_16,Symptom_17
0,itching,skin_rash,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,
1,skin_rash,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,,
2,itching,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,,
3,itching,skin_rash,dischromic _patches,,,,,,,,,,,,,,
4,itching,skin_rash,nodal_skin_eruptions,,,,,,,,,,,,,,


In [54]:
# Remove null values
X.fillna(value=0) 

Unnamed: 0,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,Symptom_8,Symptom_9,Symptom_10,Symptom_11,Symptom_12,Symptom_13,Symptom_14,Symptom_15,Symptom_16,Symptom_17
0,itching,skin_rash,nodal_skin_eruptions,dischromic _patches,0,0,0,0,0,0,0,0,0,0,0,0,0
1,skin_rash,nodal_skin_eruptions,dischromic _patches,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,itching,nodal_skin_eruptions,dischromic _patches,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,itching,skin_rash,dischromic _patches,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,itching,skin_rash,nodal_skin_eruptions,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4915,vomiting,headache,nausea,spinning_movements,loss_of_balance,unsteadiness,0,0,0,0,0,0,0,0,0,0,0
4916,skin_rash,pus_filled_pimples,blackheads,scurring,0,0,0,0,0,0,0,0,0,0,0,0,0
4917,burning_micturition,bladder_discomfort,foul_smell_of urine,continuous_feel_of_urine,0,0,0,0,0,0,0,0,0,0,0,0,0
4918,skin_rash,joint_pain,skin_peeling,silver_like_dusting,small_dents_in_nails,inflammatory_nails,0,0,0,0,0,0,0,0,0,0,0


In [55]:
label_encoder = LabelEncoder()
for i in range(X.shape[1]):
    # Check if the column has non-numeric values (assuming string values represent categorical variables)
    if isinstance(X.iloc[:, i].iloc[0], str):
        # Apply label encoding to non-numeric columns
        X.iloc[:, i] = label_encoder.fit_transform(X.iloc[:, i])

# Encode the target variable (y)
y = label_encoder.fit_transform(y)

# Display the features DataFrame after label encoding
X[0:15]

  y = column_or_1d(y, warn=True)


Unnamed: 0,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,Symptom_8,Symptom_9,Symptom_10,Symptom_11,Symptom_12,Symptom_13,Symptom_14,Symptom_15,Symptom_16,Symptom_17
0,33,35,35,11,,,,,,,,,,,,,
1,24,27,18,50,,,,,,,,,,,,,
2,33,27,18,50,,,,,,,,,,,,,
3,33,35,18,50,,,,,,,,,,,,,
4,33,35,35,50,,,,,,,,,,,,,
5,24,27,18,50,,,,,,,,,,,,,
6,33,27,18,50,,,,,,,,,,,,,
7,33,35,18,50,,,,,,,,,,,,,
8,33,35,35,50,,,,,,,,,,,,,
9,33,35,35,11,,,,,,,,,,,,,


In [61]:
# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)