<a href="https://colab.research.google.com/github/faithNassiwa/predictive-diagnosis-assistant/blob/main/models/GradientBoosting.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [27]:
# Import required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import preprocessing
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report

In [2]:
# Attach Google Drive to the Colab runtime so the data folder can be read
from google.colab import drive

drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Mounted at /content/drive


In [7]:
# Locations of the training and testing CSV files on the mounted drive
training_path = '/content/drive/MyDrive/DS5500/Data/kaggle_KAUSHIL268/Training.csv'
test_path = '/content/drive/MyDrive/DS5500/Data/kaggle_KAUSHIL268/Testing.csv'

# The training file is read without its "Unnamed: 133" column;
# the testing file is read as-is.
training_df = pd.read_csv(
    training_path,
    usecols=lambda column_name: column_name != "Unnamed: 133",
)
test_df = pd.read_csv(test_path)

In [8]:
# Show the first five rows of the training frame as a quick sanity check
training_df.head(n=5)

Unnamed: 0,itching,skin_rash,nodal_skin_eruptions,continuous_sneezing,shivering,chills,joint_pain,stomach_pain,acidity,ulcers_on_tongue,...,blackheads,scurring,skin_peeling,silver_like_dusting,small_dents_in_nails,inflammatory_nails,blister,red_sore_around_nose,yellow_crust_ooze,prognosis
0,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Fungal infection
1,0,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Fungal infection
2,1,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Fungal infection
3,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Fungal infection
4,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Fungal infection


In [10]:
# Separate features from target in both frames:
# every column except the last is a feature, the last column is the label.
X_train, y_train = training_df.iloc[:, :-1], training_df.iloc[:, -1]
X_test, y_test = test_df.iloc[:, :-1], test_df.iloc[:, -1]

In [18]:
# Disease class names, one list element per class (41 in total).
# Every entry must be followed by a comma: Python silently concatenates
# adjacent string literals, which would collapse the list into one long string.
# Spelling and trailing spaces (e.g. 'Diabetes ', 'Hypertension ') are kept
# exactly as written so the names stay comparable with the dataset's labels.
disease_labels = [
    '(vertigo) Paroymsal  Positional Vertigo', 'AIDS', 'Acne',
    'Alcoholic hepatitis', 'Allergy', 'Arthritis', 'Bronchial Asthma',
    'Cervical spondylosis', 'Chicken pox', 'Chronic cholestasis', 'Common Cold',
    'Dengue', 'Diabetes ', 'Dimorphic hemmorhoids(piles)', 'Drug Reaction',
    'Fungal infection', 'GERD', 'Gastroenteritis', 'Heart attack', 'Hepatitis B',
    'Hepatitis C', 'Hepatitis D', 'Hepatitis E', 'Hypertension ',
    'Hyperthyroidism', 'Hypoglycemia', 'Hypothyroidism', 'Impetigo', 'Jaundice',
    'Malaria', 'Migraine', 'Osteoarthristis', 'Paralysis (brain hemorrhage)',
    'Peptic ulcer diseae', 'Pneumonia', 'Psoriasis', 'Tuberculosis', 'Typhoid',
    'Urinary tract infection', 'Varicose veins', 'hepatitis A',
]

In [21]:
# Initialize label encoder
label_encoder = preprocessing.LabelEncoder()

# Learn the string -> integer mapping from the training labels only.
y_train = label_encoder.fit_transform(y_train)

# Encode the test labels with the mapping learned above. Calling
# fit_transform here would refit the encoder on the test labels, so the
# integer codes (and any later inverse_transform) would follow the test set's
# classes rather than the ones the model is trained on. transform() instead
# raises ValueError if the test set contains a label unseen during training.
y_test = label_encoder.transform(y_test)

In [24]:
# XGBoost
# Build the classifier with a fixed random_state and train it on the encoded labels
xgb_cls = XGBClassifier(random_state=42)
xgb_cls.fit(X_train, y_train)

# Predict encoded classes for the test features
y_pred = xgb_cls.predict(X_test)

# Share of test rows whose predicted class equals the true class
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f'The accuracy score is {accuracy * 100:.2f}%')

(array([15,  4, 16,  9, 14, 33,  1, 12, 17,  6, 23, 30,  7, 32, 28, 29,  8,
       11, 37, 40, 19, 20, 21, 22,  3, 36, 10, 34, 13, 18, 39, 26, 24, 25,
       31,  5,  0,  2, 38, 35, 27, 15]), array([15,  4, 16,  9, 14, 33,  1, 12, 17,  6, 23, 30,  7, 32, 28, 29,  8,
       11, 37, 40, 19, 20, 21, 22,  3, 36, 10, 34, 13, 18, 39, 26, 24, 25,
       31,  5,  0,  2, 38, 35, 27,  8]))
The accuracy score is 97.62%


In [25]:
# Decode the integer class ids (predicted and true) back into their string labels
y_pred_labels, y_test_labels = (
    label_encoder.inverse_transform(encoded_ids)
    for encoded_ids in (y_pred, y_test)
)

In [28]:
# Per-class precision / recall / f1 computed on the decoded string labels
print('Classification Report')
print(
    classification_report(
        y_true=y_test_labels,
        y_pred=y_pred_labels,
    )
)

Classification Report
                                         precision    recall  f1-score   support

(vertigo) Paroymsal  Positional Vertigo       1.00      1.00      1.00         1
                                   AIDS       1.00      1.00      1.00         1
                                   Acne       1.00      1.00      1.00         1
                    Alcoholic hepatitis       1.00      1.00      1.00         1
                                Allergy       1.00      1.00      1.00         1
                              Arthritis       1.00      1.00      1.00         1
                       Bronchial Asthma       1.00      1.00      1.00         1
                   Cervical spondylosis       1.00      1.00      1.00         1
                            Chicken pox       0.50      1.00      0.67         1
                    Chronic cholestasis       1.00      1.00      1.00         1
                            Common Cold       1.00      1.00      1.00         1
     

In [31]:
# Predicted probability of every class for each row of X_test
class_probs = xgb_cls.predict_proba(X_test)

# Sort each row's classes by descending probability (argsort on the negated
# values) and keep the first three column indices
top3_indices = np.argsort(-class_probs, axis=1)[:, :3]

# Map the indices to class labels, one array of three names per row
top3_labels = list(map(label_encoder.inverse_transform, top3_indices))

# Print the top 3 predicted classes for each instance in X_test
print(top3_labels)


[array(['Fungal infection', 'Drug Reaction', 'Acne'], dtype=object), array(['Allergy', 'Typhoid', 'Malaria'], dtype=object), array(['GERD', 'Drug Reaction', 'Migraine'], dtype=object), array(['Chronic cholestasis', 'Hepatitis C', 'Fungal infection'],
      dtype=object), array(['Drug Reaction', 'Urinary tract infection', 'GERD'], dtype=object), array(['Peptic ulcer diseae', 'Migraine', 'Hepatitis C'], dtype=object), array(['AIDS', 'Jaundice', 'Hepatitis E'], dtype=object), array(['Diabetes ', 'Varicose veins', 'Migraine'], dtype=object), array(['Gastroenteritis', 'Typhoid', 'Malaria'], dtype=object), array(['Bronchial Asthma', 'GERD', 'Jaundice'], dtype=object), array(['Hypertension ', '(vertigo) Paroymsal  Positional Vertigo',
       'Cervical spondylosis'], dtype=object), array(['Migraine', 'Arthritis', 'Peptic ulcer diseae'], dtype=object), array(['Cervical spondylosis', 'Hypertension ',
       '(vertigo) Paroymsal  Positional Vertigo'], dtype=object), array(['Paralysis (brain hemor

In [37]:
# For every test row, pair each of its top-3 class names with that class's
# predicted probability: one list of (label, probability) tuples per row.
top3_labels_with_probs = [
    list(
        zip(
            label_encoder.inverse_transform(row_indices),  # names of the 3 classes
            class_probs[row_number, row_indices],          # their probabilities
        )
    )
    for row_number, row_indices in enumerate(top3_indices)
]

# Report: the true label of each instance, then its three ranked candidates
for i, ranked_candidates in enumerate(top3_labels_with_probs):
    print(f"Instance {i+1}: {y_test_labels[i]}")
    for label, prob in ranked_candidates:
        print(f"   {label}: {prob*100:.4f}%")
    print("\n")

Instance 1: Fungal infection
   Fungal infection: 99.8312%
   Drug Reaction: 0.0097%
   Acne: 0.0074%


Instance 2: Allergy
   Allergy: 99.8860%
   Typhoid: 0.0040%
   Malaria: 0.0038%


Instance 3: GERD
   GERD: 99.8393%
   Drug Reaction: 0.0149%
   Migraine: 0.0116%


Instance 4: Chronic cholestasis
   Chronic cholestasis: 99.9713%
   Hepatitis C: 0.0023%
   Fungal infection: 0.0014%


Instance 5: Drug Reaction
   Drug Reaction: 99.8505%
   Urinary tract infection: 0.0155%
   GERD: 0.0130%


Instance 6: Peptic ulcer diseae
   Peptic ulcer diseae: 99.8922%
   Migraine: 0.0094%
   Hepatitis C: 0.0038%


Instance 7: AIDS
   AIDS: 99.8488%
   Jaundice: 0.0061%
   Hepatitis E: 0.0049%


Instance 8: Diabetes 
   Diabetes : 99.6709%
   Varicose veins: 0.0226%
   Migraine: 0.0212%


Instance 9: Gastroenteritis
   Gastroenteritis: 99.8266%
   Typhoid: 0.0066%
   Malaria: 0.0064%


Instance 10: Bronchial Asthma
   Bronchial Asthma: 99.8933%
   GERD: 0.0049%
   Jaundice: 0.0040%


Instance 11: 