In [109]:
# Importing Libraries
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [110]:
# Read the disease/symptom table from disk and preview its first rows.
csv_path = './disease_and_symptoms_data/DiseaseAndSymptoms.csv'
disease_df = pd.read_csv(csv_path)
disease_df.head()

Unnamed: 0,Disease,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,Symptom_8,Symptom_9,Symptom_10,Symptom_11,Symptom_12,Symptom_13,Symptom_14,Symptom_15,Symptom_16,Symptom_17
0,Fungal infection,itching,skin_rash,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,
1,Fungal infection,skin_rash,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,,
2,Fungal infection,itching,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,,
3,Fungal infection,itching,skin_rash,dischromic _patches,,,,,,,,,,,,,,
4,Fungal infection,itching,skin_rash,nodal_skin_eruptions,,,,,,,,,,,,,,


In [111]:
# Count the missing (NaN) entries in every column of disease_df.
# The result is a Series indexed by column name.
disease_df.isna().sum()

Disease          0
Symptom_1        0
Symptom_2        0
Symptom_3        0
Symptom_4      348
Symptom_5     1206
Symptom_6     1986
Symptom_7     2652
Symptom_8     2976
Symptom_9     3228
Symptom_10    3408
Symptom_11    3726
Symptom_12    4176
Symptom_13    4416
Symptom_14    4614
Symptom_15    4680
Symptom_16    4728
Symptom_17    4848
dtype: int64

In [112]:
# Determine the number of distinct values in each column.
# nunique() ignores NaN by default, so missing symptoms are not counted
# as a category here.
unique_counts = disease_df.nunique()
unique_counts

Disease       41
Symptom_1     34
Symptom_2     48
Symptom_3     54
Symptom_4     50
Symptom_5     38
Symptom_6     32
Symptom_7     26
Symptom_8     21
Symptom_9     22
Symptom_10    21
Symptom_11    18
Symptom_12    11
Symptom_13     8
Symptom_14     4
Symptom_15     3
Symptom_16     3
Symptom_17     1
dtype: int64

In [113]:
# Separate the table into the target (Disease) and the feature columns.
# y is shaped as a single-column 2-D array; X keeps every remaining column.
X = disease_df.drop(columns="Disease")
y = disease_df["Disease"].to_numpy().reshape(-1, 1)
feature_names = X.columns

In [6]:
# Review the first five rows of the y target array.
# y is a 2-D NumPy column vector here (it was reshaped to (-1, 1)), not a Series.
y[:5]

array([['Fungal infection'],
       ['Fungal infection'],
       ['Fungal infection'],
       ['Fungal infection'],
       ['Fungal infection']], dtype=object)

In [7]:
# Review the first rows of the X feature DataFrame
# (every column of disease_df except "Disease").
X.head()

Unnamed: 0,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,Symptom_8,Symptom_9,Symptom_10,Symptom_11,Symptom_12,Symptom_13,Symptom_14,Symptom_15,Symptom_16,Symptom_17
0,itching,skin_rash,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,
1,skin_rash,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,,
2,itching,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,,
3,itching,skin_rash,dischromic _patches,,,,,,,,,,,,,,
4,itching,skin_rash,nodal_skin_eruptions,,,,,,,,,,,,,,


In [8]:
# Encode the categorical symptom columns and the disease target as integers.
#
# Every text column gets its own fitted LabelEncoder, stored in
# `symptom_encoders` under the column name, so encoded values can be mapped
# back with `symptom_encoders[col].inverse_transform(...)`. Re-fitting one
# shared encoder would discard every mapping except the last one fitted.
#
# NOTE(review): codes are assigned per column, so the same symptom string can
# receive different integers in different Symptom_N columns.
symptom_encoders = {}

# Iterate over each column in the DataFrame
for column in X.columns:
    # Treat both object-dtype and pandas string-dtype columns as text.
    holds_text = (
        pd.api.types.is_object_dtype(X[column])
        or pd.api.types.is_string_dtype(X[column])
    )
    if not holds_text:
        continue

    # Replace NaN values with the placeholder 'Unknown'. The result is
    # assigned back instead of calling fillna(inplace=True) on a column
    # selection, which is a chained assignment: deprecated in pandas 2.x and
    # a silent no-op under Copy-on-Write.
    filled_column = X[column].fillna('Unknown')

    # Fit a dedicated encoder and store the integer codes in the column.
    symptom_encoders[column] = LabelEncoder()
    X[column] = symptom_encoders[column].fit_transform(filled_column)

# `label_encoder` is fitted on the target only, so
# `label_encoder.inverse_transform(...)` recovers disease names.
# np.ravel flattens the (n, 1) column vector to the 1-D shape LabelEncoder
# expects, which avoids scikit-learn's DataConversionWarning.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(np.ravel(y))


  y = column_or_1d(y, warn=True)


In [9]:
# Inspect the target after label encoding (integer class codes).
y

array([15, 15, 15, ..., 38, 35, 27])

In [10]:
# Confirm that no missing values remain in X after the 'Unknown' fill.
X.isna().sum()

Symptom_1     0
Symptom_2     0
Symptom_3     0
Symptom_4     0
Symptom_5     0
Symptom_6     0
Symptom_7     0
Symptom_8     0
Symptom_9     0
Symptom_10    0
Symptom_11    0
Symptom_12    0
Symptom_13    0
Symptom_14    0
Symptom_15    0
Symptom_16    0
Symptom_17    0
dtype: int64

In [11]:
# Partition the encoded features and labels into train and test subsets.
# stratify=y keeps the class proportions of y in both subsets, and the fixed
# random_state makes the shuffle reproducible. No test_size is given, so
# scikit-learn's default split ratio applies.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=1,
)

In [12]:
# Preview the first rows of the training features.
X_train.head()

Unnamed: 0,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,Symptom_8,Symptom_9,Symptom_10,Symptom_11,Symptom_12,Symptom_13,Symptom_14,Symptom_15,Symptom_16,Symptom_17
2023,4,16,13,50,38,32,26,21,22,21,18,11,8,4,3,3,1
4559,28,38,16,10,38,32,26,21,22,21,18,11,8,4,3,3,1
559,33,42,21,46,13,31,4,21,22,21,18,11,8,4,3,3,1
976,6,42,24,18,24,7,16,21,22,21,18,11,8,4,3,3,1
3761,11,12,8,30,33,28,20,21,22,21,18,11,8,4,3,3,1


In [122]:
# Summary statistics for each feature column of X.
# NOTE(review): assumes the label-encoding cell has already run, so the
# columns are numeric codes rather than symptom strings.
X.describe()

Unnamed: 0,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,Symptom_8,Symptom_9,Symptom_10,Symptom_11,Symptom_12,Symptom_13,Symptom_14,Symptom_15,Symptom_16,Symptom_17
count,4920.0,4920.0,4920.0,4920.0,4920.0,4920.0,4920.0,4920.0,4920.0,4920.0,4920.0,4920.0,4920.0,4920.0,4920.0,4920.0,4920.0
mean,18.095122,26.941463,27.067073,26.69878,22.830488,21.917073,19.010976,16.219512,18.135366,17.981707,15.6,10.071951,7.541463,3.835366,2.902439,2.917073,0.985366
std,10.234807,13.968681,14.795179,15.523655,12.333713,10.61069,9.141491,7.067688,6.617353,5.668923,4.955275,2.502145,1.553906,0.699177,0.452943,0.439502,0.120095
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,11.0,15.0,19.0,13.0,12.0,13.0,12.0,11.0,16.0,18.0,18.0,11.0,8.0,4.0,3.0,3.0,1.0
50%,15.0,28.0,24.0,26.0,23.0,25.0,26.0,21.0,22.0,21.0,18.0,11.0,8.0,4.0,3.0,3.0,1.0
75%,28.0,42.0,39.0,40.0,37.0,32.0,26.0,21.0,22.0,21.0,18.0,11.0,8.0,4.0,3.0,3.0,1.0
max,33.0,47.0,53.0,50.0,38.0,32.0,26.0,21.0,22.0,21.0,18.0,11.0,8.0,4.0,3.0,3.0,1.0


In [15]:
# Standardize the features: learn the scaling parameters from the training
# split only, then apply that same fitted scaler to both splits.
scaler = StandardScaler()

# fit() hands back the fitted scaler, which is kept under the name X_scaler.
X_scaler = scaler.fit(X_train)

# Transform the training split first, then the test split.
X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)

In [16]:
# Inspect the scaled training features (the array returned by the scaler's transform).
X_train_scaled

array([[-1.38074079, -0.7777184 , -0.95603163, ...,  0.21499975,
         0.18942627,  0.11838429],
       [ 0.96485788,  0.7963717 , -0.75279252, ...,  0.21499975,
         0.18942627,  0.11838429],
       [ 1.45352427,  1.0825699 , -0.41406067, ...,  0.21499975,
         0.18942627,  0.11838429],
       ...,
       [ 0.57392477, -0.4915202 ,  1.48283769, ...,  0.21499975,
         0.18942627,  0.11838429],
       [ 0.96485788, -0.56306975,  0.12791029, ...,  0.21499975,
         0.18942627,  0.11838429],
       [ 0.57392477, -0.6346193 , -1.56574896, ...,  0.21499975,
         0.18942627,  0.11838429]])

ValueError: could not convert string to float: ' high_fever'