In [2]:
# Importing Libraries
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [3]:
import pandas as pd
from sqlalchemy import create_engine
from pathlib import Path

# Connect to the SQLite database
database_path = Path(".../../disease_and_symptoms_data/DiseasesSymptoms.db")
engine = create_engine(f"sqlite:///{database_path}")

# Query All Records in the the Database
query = "SELECT * FROM DiseaseAndSymptomstable"

disease_df = pd.read_sql(query, engine)

In [4]:
disease_df

Unnamed: 0,Disease,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,Symptom_8,Symptom_9,Symptom_10,Symptom_11,Symptom_12,Symptom_13,Symptom_14,Symptom_15,Symptom_16,Symptom_17
0,Fungal infection,itching,skin_rash,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,
1,Fungal infection,skin_rash,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,,
2,Fungal infection,itching,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,,
3,Fungal infection,itching,skin_rash,dischromic _patches,,,,,,,,,,,,,,
4,Fungal infection,itching,skin_rash,nodal_skin_eruptions,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9836,(vertigo) Paroymsal Positional Vertigo,vomiting,headache,nausea,spinning_movements,loss_of_balance,unsteadiness,,,,,,,,,,,
9837,Acne,skin_rash,pus_filled_pimples,blackheads,scurring,,,,,,,,,,,,,
9838,Urinary tract infection,burning_micturition,bladder_discomfort,foul_smell_of urine,continuous_feel_of_urine,,,,,,,,,,,,,
9839,Psoriasis,skin_rash,joint_pain,skin_peeling,silver_like_dusting,small_dents_in_nails,inflammatory_nails,,,,,,,,,,,


In [5]:
# Determine the number of null values in each symptom column
disease_df.isna().sum()

Disease       0
Symptom_1     0
Symptom_2     0
Symptom_3     0
Symptom_4     0
Symptom_5     0
Symptom_6     0
Symptom_7     0
Symptom_8     0
Symptom_9     0
Symptom_10    0
Symptom_11    0
Symptom_12    0
Symptom_13    0
Symptom_14    0
Symptom_15    0
Symptom_16    0
Symptom_17    0
dtype: int64

In [6]:
# Determine the number of unique values in each column.
unique_counts = disease_df.nunique()
unique_counts

Disease       42
Symptom_1     35
Symptom_2     49
Symptom_3     55
Symptom_4     52
Symptom_5     40
Symptom_6     34
Symptom_7     28
Symptom_8     23
Symptom_9     24
Symptom_10    23
Symptom_11    20
Symptom_12    13
Symptom_13    10
Symptom_14     6
Symptom_15     5
Symptom_16     5
Symptom_17     3
dtype: int64

In [7]:
# Split our preprocessed data into our features and target arrays
y = disease_df.Disease.values.reshape(-1, 1)
X = disease_df.drop("Disease", axis=1)
feature_names = X.columns

In [8]:
# Review the y variable Series
y[:5]

array([['Fungal infection'],
       ['Fungal infection'],
       ['Fungal infection'],
       ['Fungal infection'],
       ['Fungal infection']], dtype=object)

In [9]:
# Review the X variable DataFrame
X.head()

Unnamed: 0,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,Symptom_8,Symptom_9,Symptom_10,Symptom_11,Symptom_12,Symptom_13,Symptom_14,Symptom_15,Symptom_16,Symptom_17
0,itching,skin_rash,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,
1,skin_rash,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,,
2,itching,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,,
3,itching,skin_rash,dischromic _patches,,,,,,,,,,,,,,
4,itching,skin_rash,nodal_skin_eruptions,,,,,,,,,,,,,,


In [10]:
label_encoder = LabelEncoder()

# Iterate over each column in the DataFrame
for column in X.columns:
    if X[column].dtype == 'object':  # Check if the column contains string values
        # Replace NaN values with a placeholder, for example, 'Unknown'
        X[column].fillna('Unknown', inplace=True)
        # Fit label encoder and transform the labels in the column
        X[column] = label_encoder.fit_transform(X[column])

y = label_encoder.fit_transform(y)


  y = column_or_1d(y, warn=True)


In [11]:
y

array([16, 16, 16, ..., 39, 36, 28])

In [12]:
X.isna().sum()

Symptom_1     0
Symptom_2     0
Symptom_3     0
Symptom_4     0
Symptom_5     0
Symptom_6     0
Symptom_7     0
Symptom_8     0
Symptom_9     0
Symptom_10    0
Symptom_11    0
Symptom_12    0
Symptom_13    0
Symptom_14    0
Symptom_15    0
Symptom_16    0
Symptom_17    0
dtype: int64

In [13]:
# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=None)

In [14]:
X_train.head()

Unnamed: 0,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,Symptom_8,Symptom_9,Symptom_10,Symptom_11,Symptom_12,Symptom_13,Symptom_14,Symptom_15,Symptom_16,Symptom_17
8269,6,15,14,20,3,27,15,16,3,6,15,0,0,0,0,0,0
3812,24,18,4,34,36,0,0,0,0,0,0,0,0,0,0,0,0
9358,11,11,24,5,11,20,0,0,0,0,0,0,0,0,0,0,0
3455,0,19,23,4,10,26,6,9,21,0,0,0,0,0,0,0,0
5275,18,36,32,32,0,0,0,0,0,0,0,0,0,0,0,0,0


In [15]:
X.describe()

Unnamed: 0,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,Symptom_8,Symptom_9,Symptom_10,Symptom_11,Symptom_12,Symptom_13,Symptom_14,Symptom_15,Symptom_16,Symptom_17
count,9841.0,9841.0,9841.0,9841.0,9841.0,9841.0,9841.0,9841.0,9841.0,9841.0,9841.0,9841.0,9841.0,9841.0,9841.0,9841.0,9841.0
mean,18.234427,26.943603,27.06981,24.094198,14.273245,9.59872,5.459506,3.914033,4.047048,3.744538,2.212682,0.887715,0.464282,0.146835,0.097958,0.07357,0.014836
std,10.439295,13.968874,14.796166,15.615989,11.981635,10.302322,7.829757,6.19844,6.822065,6.572901,4.67531,2.418613,1.56874,0.636685,0.454603,0.39647,0.12174
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,11.0,15.0,19.0,10.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,15.0,28.0,24.0,25.0,13.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,28.0,42.0,39.0,38.0,24.0,18.0,11.0,7.0,7.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,34.0,48.0,54.0,51.0,39.0,33.0,27.0,22.0,23.0,22.0,19.0,12.0,9.0,5.0,4.0,4.0,2.0


In [16]:
# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [17]:
X_train_scaled

array([[-1.17284235, -0.85445258, -0.88405792, ..., -0.21780091,
        -0.18918582, -0.12471777],
       [ 0.55210149, -0.63970946, -1.55913474, ..., -0.21780091,
        -0.18918582, -0.12471777],
       [-0.69369128, -1.14077673, -0.2089811 , ..., -0.21780091,
        -0.18918582, -0.12471777],
       ...,
       [ 0.93542235,  0.79191131, -0.74904256, ..., -0.21780091,
        -0.18918582, -0.12471777],
       [-1.17284235, -0.85445258, -0.88405792, ..., -0.21780091,
        -0.18918582, -0.12471777],
       [-0.31037043,  1.07823547, -0.2089811 , ..., -0.21780091,
        -0.18918582, -0.12471777]])

In [18]:
from sklearn.ensemble import RandomForestClassifier

In [20]:
#Using random forest model
# Create a random forest classifier
rf_model = RandomForestClassifier(n_estimators=50, random_state=78)
rf_model

In [21]:
#Fitting model
rf_model = rf_model.fit(X_train_scaled, y_train)


In [22]:
#Making predictions using test data
predictions = rf_model.predict(X_test_scaled)

In [23]:
# Displaying results
# Calculate the confusion matrix
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame to display the confusion matrix
cm_df = pd.DataFrame(cm, index=range(len(cm)), columns=range(len(cm)))
cm_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,31,32,33,34,35,36,37,38,39,40
0,54,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,54,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,68,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,62,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,55,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,51,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,59,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,53,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,68,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,57,...,0,0,0,0,0,0,0,0,0,0


In [24]:
# Calculating the accuracy score
acc_score = accuracy_score(y_test, predictions)
acc_score

1.0

In [25]:
 # Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,31,32,33,34,35,36,37,38,39,40
0,54,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,54,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,68,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,62,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,55,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,51,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,59,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,53,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,68,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,57,...,0,0,0,0,0,0,0,0,0,0


Accuracy Score : 1.0
Classification Report
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        54
           1       1.00      1.00      1.00        54
           2       1.00      1.00      1.00        68
           3       1.00      1.00      1.00        62
           4       1.00      1.00      1.00        55
           5       1.00      1.00      1.00        51
           6       1.00      1.00      1.00        59
           7       1.00      1.00      1.00        53
           8       1.00      1.00      1.00        68
           9       1.00      1.00      1.00        57
          10       1.00      1.00      1.00        47
          11       1.00      1.00      1.00        58
          12       1.00      1.00      1.00        63
          13       1.00      1.00      1.00        60
          15       1.00      1.00      1.00        57
          16       1.00      1.00      1.00        50
          17       1.00      1.00     

In [26]:
from sklearn import tree

In [27]:
#decision tree model
model = tree.DecisionTreeClassifier()

In [28]:
#Fit model
model = model.fit(X_train_scaled, y_train)

In [29]:
#Make predictions
predictions = model.predict(X_test_scaled)

In [30]:
#Accuracy score
acc_score = accuracy_score(y_test, predictions)
acc_score

1.0

In [32]:
 # Calculating the confusion matrix
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(cm, index=range(len(cm)), columns=range(len(cm)))

# Calculating the accuracy score
acc_score = accuracy_score(y_test, predictions)

In [33]:
 # Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,31,32,33,34,35,36,37,38,39,40
0,54,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,54,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,68,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,62,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,55,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,51,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,59,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,53,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,68,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,57,...,0,0,0,0,0,0,0,0,0,0


Accuracy Score : 1.0
Classification Report
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        54
           1       1.00      1.00      1.00        54
           2       1.00      1.00      1.00        68
           3       1.00      1.00      1.00        62
           4       1.00      1.00      1.00        55
           5       1.00      1.00      1.00        51
           6       1.00      1.00      1.00        59
           7       1.00      1.00      1.00        53
           8       1.00      1.00      1.00        68
           9       1.00      1.00      1.00        57
          10       1.00      1.00      1.00        47
          11       1.00      1.00      1.00        58
          12       1.00      1.00      1.00        63
          13       1.00      1.00      1.00        60
          15       1.00      1.00      1.00        57
          16       1.00      1.00      1.00        50
          17       1.00      1.00     