<a href="https://colab.research.google.com/github/faithNassiwa/predictive-diagnosis-assistant/blob/main/notebooks/DDXPlus_GradientBoosting.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [77]:
# Import required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
from sklearn import preprocessing
from sklearn.metrics import accuracy_score, classification_report, auc, roc_auc_score, roc_curve, confusion_matrix
from pprint import pprint
from xgboost import XGBClassifier, plot_importance
from sklearn.utils import class_weight

In [78]:
# Mount google drive to access folder with data
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [80]:
# Read the training and test dataset -- Took a minute

train_df = pd.read_csv('/content/drive/MyDrive/DS5500/Data/Dataset2/processed_train.csv', low_memory=False)
test_df = pd.read_csv('/content/drive/MyDrive/DS5500/Data/Dataset2/processed_test.csv', low_memory=False)
pprint(f'Train dataset shape: {train_df.shape}')
pprint(f'Test dataset shape: {test_df.shape}')

'Train dataset shape: (1025602, 221)'
'Test dataset shape: (134529, 221)'


In [81]:
# View values of categorical columns
unique_values = {col: train_df[col].unique() for col in train_df.columns if train_df[col].dtype == 'O'}
pprint(unique_values)


{'Characterize your pain:': array(['heavy', 'exhausting', 'sharp', 'burning', 'a cramp', '0',
       'heartbreaking', 'sickening', 'sensitive', 'scary',
       'a knife stroke', 'tugging', 'violent', 'a pulse', 'haunting',
       'tedious', nan], dtype=object),
 'Do you feel pain somewhere?': array(['temple(L)', 'posterior chest wall(L)', 'eye(R)', 'pharynx', '0',
       'temple(R)', 'epigastric', 'breast(L)', 'iliac fossa(L)', 'pubis',
       'occiput', 'testicle(L)', 'posterior chest wall(R)', 'shoulder(L)',
       'under the jaw', 'hypochondrium(R)', 'forehead', 'trachea',
       'sole(L)', 'finger (middle)(R)', 'shoulder(R)', 'breast(R)',
       'dorsal aspect of the foot(L)', 'belly', 'eye(L)', 'nose',
       'hip(L)', 'top of the head', 'triceps(R)', 'finger (middle)(L)',
       'ear(L)', 'ear(R)', 'nowhere', 'sole(R)', 'upper chest',
       'cheek(L)', 'hypochondrium(L)', 'knee(R)', 'back of head',
       'palace', 'testicle(R)', 'lower chest',
       'dorsal aspect of the foot(

In [82]:
# Check if there are any nans values
pprint(train_df.columns[train_df.isna().any()].tolist())
pprint(test_df.columns[test_df.isna().any()].tolist())

['Characterize your pain:', 'What color is the rash?']
['Characterize your pain:', 'What color is the rash?']


In [83]:
# Count of NAN in the columns -- drop these rows

print(train_df['Characterize your pain:'].isna().sum())
print(train_df['What color is the rash?'].isna().sum())
print(test_df['Characterize your pain:'].isna().sum())
print(test_df['What color is the rash?'].isna().sum())

10859
32609
1379
4436


In [84]:
# Drop rows with NAN values

train_df_nonan = train_df.dropna(subset=['Characterize your pain:', 'What color is the rash?'])
test_df_nonan = test_df.dropna(subset=['Characterize your pain:', 'What color is the rash?'])

In [85]:
# Count of NAN in the columns after drop

print(train_df_nonan['Characterize your pain:'].isna().sum())
print(train_df_nonan['What color is the rash?'].isna().sum())
print(test_df_nonan['Characterize your pain:'].isna().sum())
print(test_df_nonan['What color is the rash?'].isna().sum())

0
0
0
0


In [86]:
pprint(f'Train dataset shape: {train_df_nonan.shape}')
pprint(f'Test dataset shape: {test_df_nonan.shape}')

'Train dataset shape: (982224, 221)'
'Test dataset shape: (128726, 221)'


In [87]:
# Remove columns with one value
train_df_nonan.drop(columns=['Have you lost consciousness associated with violent and sustained muscle contractions or had an absence episode?',
                             'Have you noticed a diffuse (widespread) redness in one or both eyes?',
                             'Have you had any vaginal discharge?'], inplace=True)
test_df_nonan.drop(columns=['Have you lost consciousness associated with violent and sustained muscle contractions or had an absence episode?',
                             'Have you noticed a diffuse (widespread) redness in one or both eyes?',
                             'Have you had any vaginal discharge?'], inplace=True)

In [88]:
from sklearn.preprocessing import LabelEncoder

categorical_columns = list(unique_values.keys())

# Reset indices of both X_train and X_test
train_df_reset = train_df_nonan.reset_index(drop=True)
test_df_reset = test_df_nonan.reset_index(drop=True)

# Combine the train and test data
combined = pd.concat([train_df_reset, test_df_reset], axis=0)

# Create a dictionary to store encodings
label_encoders = {}


# Label encode the combined dataframe for each categorical column
for col in categorical_columns:
  le = LabelEncoder()
  combined[col] = le.fit_transform(combined[col])
  label_encoders[col] = le

combined_encoded = combined

# Split the combined_encoded dataframe back into train and test
train_encoded = combined_encoded.iloc[:len(train_df_nonan)]
test_encoded = combined_encoded.iloc[len(train_df_nonan):]

In [89]:
# Prepare training and test variables
X_columns = [col for col in train_df_nonan.columns if col not in ['Unnamed: 0', 'PATHOLOGY']]
X_train = train_encoded[X_columns]
y_train = train_encoded['PATHOLOGY']
X_test = test_encoded[X_columns]
y_test = test_encoded['PATHOLOGY']
pprint(f'X_train shape: {X_train.shape}')
pprint(f'y_train shape: {y_train.shape}')
pprint(f'X_test shape: {X_test.shape}')
pprint(f'y_test shape: {y_test.shape}')


'X_train shape: (982224, 216)'
'y_train shape: (982224,)'
'X_test shape: (128726, 216)'
'y_test shape: (128726,)'


In [90]:
# Check dimension after encoding
pprint(f'X_train shape: {X_train.shape}')
pprint(f'X_test shape: {X_test.shape}')

'X_train shape: (982224, 216)'
'X_test shape: (128726, 216)'


In [91]:
# XGBoost Model -- Took about 8 minutes

# Initialize and fit the model
clf = XGBClassifier(random_state=420, class_weight='balanced')
clf.fit(X_train, y_train)


In [92]:
# Extract feature importances
importances = clf.feature_importances_

# Get the feature names
feature_names = X_train.columns

# Create a DataFrame for the importances
importance_df = pd.DataFrame({
    'Feature': feature_names,
    'Importance': importances
})

# Sort the DataFrame by the importances
importance_df = importance_df.sort_values(by='Importance', ascending=False)

# Display the top features
print(importance_df.head(50))

                                               Feature  Importance
26   Have you had sexual intercourse with an HIV-po...    0.158182
166                      Do you have an active cancer?    0.141260
193  Are you more irritable or has your mood been v...    0.107329
124         Do you have painful mouth ulcers or sores?    0.058418
188  Are you currently being treated or have you re...    0.049284
167  Have you been unable to move or get up for mor...    0.042474
139  Do you have a known kidney problem resulting i...    0.035874
76   Have you been in contact with or ate something...    0.032378
28                     Have you had chills or shivers?    0.026329
75            Do you have a known severe food allergy?    0.024318
123  Are you consulting because you have high blood...    0.017996
106                 Did your cheeks suddenly turn red?    0.015727
134  Have you been unintentionally losing weight or...    0.013395
52   Are you taking any new oral anticoagulants ((N...    0.01

In [93]:
# Make predictions
y_pred = clf.predict(X_test)

In [94]:
# Evaluate the Model

# Encode y_values for auc_score
from sklearn.preprocessing import LabelBinarizer
lb = LabelBinarizer()
y_test_2d = lb.fit_transform(y_test)
y_pred_2d = lb.fit_transform(y_pred)

# Compute scores
accuracy = accuracy_score(y_test, y_pred)
auc_score_ovo = roc_auc_score(y_test_2d, y_pred_2d, multi_class='ovo')
auc_score_ovr = roc_auc_score(y_test_2d, y_pred_2d, multi_class='ovr')
report = classification_report(y_test, y_pred)

# Print scores
print(f'The accuracy score is {accuracy * 100:.2f}%')
print(f'One-vs-One AUC score: {auc_score_ovo * 100:.2f}%')
print(f'One-vs-Rest AUC score: {auc_score_ovr * 100:.2f}%')
print("Classification Report")
print(report)

The accuracy score is 99.48%
One-vs-One AUC score: 99.64%
One-vs-Rest AUC score: 99.64%
Classification Report
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      2153
           1       1.00      1.00      1.00      3302
           2       0.99      0.99      0.99      3214
           3       1.00      1.00      1.00      3509
           4       1.00      1.00      1.00      2598
           5       0.98      0.70      0.82      1829
           6       1.00      1.00      1.00      2411
           7       1.00      1.00      1.00      3798
           8       1.00      1.00      1.00      6770
           9       1.00      1.00      1.00      2831
          10       1.00      1.00      1.00      2083
          11       1.00      1.00      1.00      2454
          12       1.00      1.00      1.00        36
          13       1.00      1.00      1.00      3577
          14       1.00      1.00      1.00      2222
          15       1.00  

In [None]:
# Convert the report to a DataFrame
report = classification_report(y_test, y_pred, output_dict=True)
report_df = pd.DataFrame(dict(report)).transpose()
class_labels_int = [int(i) for i in report_df.index[0:48].tolist() ]
original_labels = label_encoders['PATHOLOGY'].inverse_transform(class_labels_int)
new_index = original_labels.tolist() + report_df.index[48:].tolist()
report_df.index = new_index
report_df.head()

# Export report
report_df.to_csv('XGBoost_classification_report.csv')

In [95]:
# Get test dataset true values
y_test_labels = label_encoders['PATHOLOGY'].inverse_transform(y_test)

In [97]:
# Get class probability predictions
class_probs = clf.predict_proba(X_test)

# Get the indices of the top 3 predictions for each row
top3_indices = np.argsort(-class_probs, axis=1)[:, :3]

# Map the idices to class labels
top3_labels = [label_encoders['PATHOLOGY'].inverse_transform(i) for i in top3_indices]

# print top3_labels containing the top 3 predicted classes for each instance in X_test
print(top3_labels[:100])

[array(['GERD', 'Acute rhinosinusitis', 'Pericarditis'], dtype=object), array(['Bronchitis', 'Viral pharyngitis',
       'Acute COPD exacerbation / infection'], dtype=object), array(['Acute dystonic reactions', 'Myasthenia gravis',
       'Atrial fibrillation'], dtype=object), array(['Acute laryngitis', 'Viral pharyngitis', 'Epiglottitis'],
      dtype=object), array(['URTI', 'Bronchitis', 'Viral pharyngitis'], dtype=object), array(['URTI', 'Acute laryngitis', 'Viral pharyngitis'], dtype=object), array(['Inguinal hernia', 'HIV (initial infection)', 'Acute laryngitis'],
      dtype=object), array(['Spontaneous pneumothorax', 'Pericarditis', 'Unstable angina'],
      dtype=object), array(['Bronchitis', 'Acute COPD exacerbation / infection',
       'Pulmonary neoplasm'], dtype=object), array(['GERD', 'Acute rhinosinusitis', 'Pericarditis'], dtype=object), array(['Bronchitis', 'Viral pharyngitis', 'Acute laryngitis'],
      dtype=object), array(['Bronchitis', 'Pulmonary neoplasm',
       '

In [100]:
# Initialize list to store top 3 labels and their probabilities
top3_labels_with_probs = []
# Loop through each instance
for i in range(len(top3_indices[:500])):
    # Get the top 3 indices for current instance
    indices = top3_indices[i]

    # Get the top 3 labels
    labels = label_encoders['PATHOLOGY'].inverse_transform(indices)

    # Get the probabilities of the top 3 labels
    probabilities = class_probs[i, indices]

    # Combine labels and their probabilities, and append to the list
    labels_with_probs = list(zip(labels, probabilities))
    top3_labels_with_probs.append(labels_with_probs)

# Print the result
for i, labels_with_probs in enumerate(top3_labels_with_probs):
    print(f"Patient {i+1}: {y_test_labels[i]}")
    for label, prob in labels_with_probs:
        print(f"   {label}: {prob*100:.4f}%")
    print("\n")

Patient 1: GERD
   GERD: 99.9991%
   Acute rhinosinusitis: 0.0003%
   Pericarditis: 0.0002%


Patient 2: Bronchitis
   Bronchitis: 99.9995%
   Viral pharyngitis: 0.0004%
   Acute COPD exacerbation / infection: 0.0000%


Patient 3: Acute dystonic reactions
   Acute dystonic reactions: 99.9987%
   Myasthenia gravis: 0.0009%
   Atrial fibrillation: 0.0000%


Patient 4: Acute laryngitis
   Acute laryngitis: 99.9985%
   Viral pharyngitis: 0.0014%
   Epiglottitis: 0.0000%


Patient 5: URTI
   URTI: 99.9999%
   Bronchitis: 0.0001%
   Viral pharyngitis: 0.0000%


Patient 6: URTI
   URTI: 99.9987%
   Acute laryngitis: 0.0009%
   Viral pharyngitis: 0.0002%


Patient 7: Inguinal hernia
   Inguinal hernia: 99.9993%
   HIV (initial infection): 0.0003%
   Acute laryngitis: 0.0001%


Patient 8: Spontaneous pneumothorax
   Spontaneous pneumothorax: 99.9997%
   Pericarditis: 0.0001%
   Unstable angina: 0.0001%


Patient 9: Bronchitis
   Bronchitis: 100.0000%
   Acute COPD exacerbation / infection: 0.00

In [None]:
# Perform Cross Validation  - Took 34 Minutes

from sklearn.model_selection import cross_validate

# Initialize the model
clf = XGBClassifier(random_state=420, class_weight='balanced')

# Perform cross validation
cv_results = cross_validate(clf, X_train, y_train, cv=5, scoring=['accuracy', 'precision_macro', 'recall_macro', 'f1_macro', 'roc_auc_ovr', 'roc_auc_ovo'], return_estimator=True)

# Print the mean scores
print(f'Mean Accuracy: {cv_results["test_accuracy"].mean() * 100:.2f}%')
print(f'Mean ROC_AUC OVR: {cv_results["test_roc_auc_ovr"].mean() * 100:.2f}%')
print(f'Mean ROC_AUC OVO: {cv_results["test_roc_auc_ovo"].mean() * 100:.2f}%')
print(f'Mean Precision: {cv_results["test_precision_macro"].mean() * 100:.2f}%')
print(f'Mean Recall: {cv_results["test_recall_macro"].mean() * 100:.2f}%')
print(f'Mean F1 Score: {cv_results["test_f1_macro"].mean() * 100:.2f}%')

Mean Accuracy: 99.49%
Mean ROC_AUC OVR: 99.99%
Mean ROC_AUC OVO: 99.99%
Mean Precision: 99.54%
Mean Recall: 99.30%
Mean F1 Score: 99.36%


In [None]:
# Generate feature importances

for idx, model in enumerate(cv_results['estimator']):
    # Extract feature importances
    importances = model.feature_importances_

    # Get the feature names
    print(f'Model {idx}')
    feature_names = X_train.columns

    # Create a DataFrame for the importances
    feature_importance = pd.DataFrame({
        'Feature': feature_names,
        'Importance': importances
    })

# Sort the DataFrame by the importances
feature_importance = feature_importance.sort_values(by='Importance', ascending=False)

# Display the top features
print(feature_importance.head(50))



Model 0
Model 1
Model 2
Model 3
Model 4
                                               Feature  Importance
166                      Do you have an active cancer?    0.162417
188  Are you currently being treated or have you re...    0.123145
124         Do you have painful mouth ulcers or sores?    0.067082
193  Are you more irritable or has your mood been v...    0.056563
167  Have you been unable to move or get up for mor...    0.048299
134  Have you been unintentionally losing weight or...    0.045875
75            Do you have a known severe food allergy?    0.032464
28                     Have you had chills or shivers?    0.030045
123  Are you consulting because you have high blood...    0.020431
26   Have you had sexual intercourse with an HIV-po...    0.020166
76   Have you been in contact with or ate something...    0.018535
106                 Did your cheeks suddenly turn red?    0.018010
16   Have you ever had a sexually transmitted infec...    0.015092
186  Do you have famil

In [None]:
feature_importance.describe

<bound method NDFrame.describe of                                                Feature  Importance
166                      Do you have an active cancer?    0.162417
188  Are you currently being treated or have you re...    0.123145
124         Do you have painful mouth ulcers or sores?    0.067082
193  Are you more irritable or has your mood been v...    0.056563
167  Have you been unable to move or get up for mor...    0.048299
..                                                 ...         ...
147  Were you diagnosed with endocrine disease or a...    0.000000
206  Have you breastfed one of your children for mo...    0.000000
118                  Do you have chronic pancreatitis?    0.000000
120            Do you have pale stools and dark urine?    0.000000
172  Have you been in contact with someone who has ...    0.000000

[216 rows x 2 columns]>

In [None]:
feature_importance.to_csv('XGBoost_feature_importance.csv')
