# Symptom-Based Disease Prediction and Its Treatments
A Bangkit Final Project proposed by B21-CAP0170

Many diseases need to be detected at an early stage so that a treatment plan can be identified early and the patient can maintain a good quality of life. Early detection and a precise diagnosis allow for quicker action, saving precious time and preventing complications and rapid worsening. Health practitioners have conducted surveys and collected data on patients, their diseases, and their symptoms, which allow a patient's disease to be distinguished by its common symptoms. Therefore, the dataset can be used to train a model that predicts the disease based on the symptoms.

In [1]:
# Importing Libraries
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import tensorflow_decision_forests as tfdf
import tensorflow as tf

# `sys_pipes` is used further down as a context manager around model training.
# Prefer the wurlitzer implementation and fall back to the Colab one when
# wurlitzer cannot be imported.
try:
    from wurlitzer import sys_pipes
except ImportError:
    # Only an import failure triggers the fallback; unrelated exceptions
    # (e.g. KeyboardInterrupt) are no longer silently swallowed.
    from colabtools.googlelog import CaptureLog as sys_pipes

In [2]:
# Read the dataset CSV file (path is relative to the working directory)
df = pd.read_csv("Dataset/dataset.csv")
# Preview the first rows
df.head()

Unnamed: 0,Disease,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,Symptom_8,Symptom_9,Symptom_10,Symptom_11,Symptom_12,Symptom_13,Symptom_14,Symptom_15,Symptom_16,Symptom_17
0,Fungal infection,itching,skin_rash,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,
1,Fungal infection,skin_rash,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,,
2,Fungal infection,itching,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,,
3,Fungal infection,itching,skin_rash,dischromic _patches,,,,,,,,,,,,,,
4,Fungal infection,itching,skin_rash,nodal_skin_eruptions,,,,,,,,,,,,,,


In [3]:
# Filter DataFrame to disease only: drop every column after the first one
# (the symptom columns) in a single call, instead of dropping them one by one
# with a hard-coded column count.
disease_only = df.drop(columns=df.columns[1:])

disease_only.head()

Unnamed: 0,Disease
0,Fungal infection
1,Fungal infection
2,Fungal infection
3,Fungal infection
4,Fungal infection


In [4]:
# Put all unique diseases into a list, in order of first appearance.
# Series.unique() preserves that order and avoids re-scanning a growing
# Python list for every row.
disease = disease_only['Disease'].unique().tolist()

print(disease)

['Fungal infection', 'Allergy', 'GERD', 'Chronic cholestasis', 'Drug Reaction', 'Peptic ulcer diseae', 'AIDS', 'Diabetes ', 'Gastroenteritis', 'Bronchial Asthma', 'Hypertension ', 'Migraine', 'Cervical spondylosis', 'Paralysis (brain hemorrhage)', 'Jaundice', 'Malaria', 'Chicken pox', 'Dengue', 'Typhoid', 'hepatitis A', 'Hepatitis B', 'Hepatitis C', 'Hepatitis D', 'Hepatitis E', 'Alcoholic hepatitis', 'Tuberculosis', 'Common Cold', 'Pneumonia', 'Dimorphic hemmorhoids(piles)', 'Heart attack', 'Varicose veins', 'Hypothyroidism', 'Hyperthyroidism', 'Hypoglycemia', 'Osteoarthristis', 'Arthritis', '(vertigo) Paroymsal  Positional Vertigo', 'Acne', 'Urinary tract infection', 'Psoriasis', 'Impetigo']


In [5]:
# Print the total number of distinct diseases collected in the `disease` list
print(len(disease))

41


In [6]:
# Build a symptoms-only frame by removing the first column of `df`
# (the disease label); `df` itself is left unchanged.
first_column = df.columns[0]
symptoms_only = df.drop(columns=[first_column])

# Write a CSV copy to disk without the row index.
# NOTE(review): this writes the full `df`, not `symptoms_only` — confirm which
# frame was meant to be saved.
df.to_csv('file.csv', index=False)

symptoms_only.head()

Unnamed: 0,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,Symptom_8,Symptom_9,Symptom_10,Symptom_11,Symptom_12,Symptom_13,Symptom_14,Symptom_15,Symptom_16,Symptom_17
0,itching,skin_rash,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,
1,skin_rash,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,,
2,itching,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,,
3,itching,skin_rash,dischromic _patches,,,,,,,,,,,,,,
4,itching,skin_rash,nodal_skin_eruptions,,,,,,,,,,,,,,


In [7]:
# Put all unique symptoms into a list, in order of first appearance when the
# columns are read one after another (all of Symptom_1, then Symptom_2, ...).
symptom_columns = [f'Symptom_{i}' for i in range(1, 18)]

# Stack the symptom columns end to end so a single pass can de-duplicate them.
stacked_symptoms = pd.concat(
    [symptoms_only[column] for column in symptom_columns],
    ignore_index=True,
)

# dropna() skips the empty cells; unique() keeps first-appearance order and
# replaces the per-value scan of a growing Python list.
symptoms = stacked_symptoms.dropna().unique().tolist()

# NOTE(review): values are kept exactly as read from the file, including any
# leading whitespace — confirm whether they should be stripped.
print(symptoms)

['itching', ' skin_rash', ' continuous_sneezing', ' shivering', ' stomach_pain', ' acidity', ' vomiting', ' indigestion', ' muscle_wasting', ' patches_in_throat', ' fatigue', ' weight_loss', ' sunken_eyes', ' cough', ' headache', ' chest_pain', ' back_pain', ' weakness_in_limbs', ' chills', ' joint_pain', ' yellowish_skin', ' constipation', ' pain_during_bowel_movements', ' breathlessness', ' cramps', ' weight_gain', ' mood_swings', ' neck_pain', ' muscle_weakness', ' stiff_neck', ' pus_filled_pimples', ' burning_micturition', ' bladder_discomfort', ' high_fever', ' nodal_skin_eruptions', ' ulcers_on_tongue', ' loss_of_appetite', ' restlessness', ' dehydration', ' dizziness', ' weakness_of_one_body_side', ' lethargy', ' nausea', ' abdominal_pain', ' pain_in_anal_region', ' sweating', ' bruising', ' cold_hands_and_feets', ' anxiety', ' knee_pain', ' swelling_joints', ' blackheads', ' foul_smell_of urine', ' skin_peeling', ' blister', ' dischromic _patches', ' watering_from_eyes', ' extr

In [None]:
# Print the total number of distinct symptoms collected in the `symptoms` list
print(len(symptoms))

In [None]:
# Names of every column after the first one,
# i.e. a list like ['Symptom_1', 'Symptom_2', ..., 'Symptom_17']
cols = list(df.columns[1:])
cols

In [None]:
# Reshape from wide to long: the original row number is kept in 'index' and
# every column listed in `cols` is unpivoted, so `tmp` has one row per
# (original row, column in `cols`) pair.
indexed = df.reset_index()
tmp = indexed.melt(id_vars=['index'], value_vars=cols)

# Constant marker column: each (row, symptom) occurrence counts as 1
tmp['add_value'] = 1
tmp.head()

In [None]:
# Pivot the long table back to one row per original record and one column per
# distinct symptom value, filled from the constant 'add_value' marker.
diseases = tmp.pivot_table(index='index', columns='value', values='add_value')

# Put the disease name in front as the 'label' column (aligned on the row index)
diseases.insert(0, 'label', df['Disease'])

# Symptoms that do not occur in a row come out as NaN; encode them as 0
diseases = diseases.fillna(0)

# Allow up to 150 columns to be shown when displaying a frame
pd.set_option('display.max_columns', 150)
diseases.head()

# TensorFlow Decision Forests Algorithm

In [None]:
label = "label"

# Distinct label values in order of first appearance; a value's position in
# this list is its integer class id.
classes = diseases[label].unique().tolist()
print(f"Label classes: {classes}")

# Replace every label value by its position in `classes`
class_ids = {class_name: position for position, class_name in enumerate(classes)}
diseases[label] = diseases[label].map(class_ids)

In [None]:
def _split_by_fraction(frame, train_fraction, seed=1):
    """Return (train, test): a seeded random sample of `frame` and the remaining rows."""
    train_part = frame.sample(frac=train_fraction, random_state=seed)
    test_part = frame.drop(index=train_part.index)
    return train_part, test_part


# 70% training / 30% test
train_7, test_7 = _split_by_fraction(diseases, 0.7)

# 80% training / 20% test
train_8, test_8 = _split_by_fraction(diseases, 0.8)

# 90% training / 10% test
train_9, test_9 = _split_by_fraction(diseases, 0.9)

# The 70/30 split is the one used by the following cells
train, test = train_7, test_7

# Alternative kept for reference: separate feature and label frames
# x_train, y_train, x_test, y_test =  train.drop('label', axis = 1),\
#                                     train['label'],\
#                                     test.drop('label', axis = 1),\
#                                     test['label']

In [None]:
# Convert both pandas frames into TensorFlow datasets, with the `label`
# column as the prediction target. `train` and `test` are rebound, so this
# cell expects them to still be pandas DataFrames when it runs.
to_tf_dataset = tfdf.keras.pd_dataframe_to_tf_dataset
train = to_tf_dataset(train, label=label)
test = to_tf_dataset(test, label=label)

In [None]:
# Create a random forest model with the library's default settings
model = tfdf.keras.RandomForestModel()

# Register accuracy as the metric reported on evaluation
model.compile(metrics=["accuracy"])

# Fit on the training dataset inside sys_pipes()
# (presumably so the native training logs appear in the notebook output)
with sys_pipes():
    model.fit(train)

In [None]:
# Score the trained model on the test dataset; return_dict=True gives a
# mapping of metric name to value.
evaluation = model.evaluate(test, return_dict=True)
print()

# One "name: value" line per metric, rounded to four decimal places
for name, value in evaluation.items():
    print(f"{name}: {value:.4f}")

In [None]:
# Print a summary of the trained model
model.summary()

In [None]:
# Save the trained model under the relative path 'saved_model/my_model'
model.save('saved_model/my_model')