<a href="https://colab.research.google.com/github/faithNassiwa/predictive-diagnosis-assistant/blob/main/notebooks/TrainingXGBoostModel_49Diseases.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Import required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
from sklearn import preprocessing
from sklearn.metrics import accuracy_score, classification_report, auc, roc_auc_score, roc_curve, confusion_matrix
from pprint import pprint
from xgboost import XGBClassifier, plot_importance, DMatrix
from sklearn.utils import class_weight
from sklearn.model_selection import GridSearchCV, KFold, ParameterGrid
from sklearn.preprocessing import LabelBinarizer

In [2]:
# Mount google drive to access folder with data
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Load Data & Preprocessing

In [3]:
# Read the training and test dataset -- Took a minute

train_df = pd.read_csv('/content/drive/MyDrive/DS5500/Data/Dataset2/processed_train.csv', low_memory=False)
test_df = pd.read_csv('/content/drive/MyDrive/DS5500/Data/Dataset2/processed_test.csv', low_memory=False)
pprint(f'Train dataset shape: {train_df.shape}')
pprint(f'Test dataset shape: {test_df.shape}')

'Train dataset shape: (1025602, 221)'
'Test dataset shape: (134529, 221)'


In [4]:
# Drop rows with NAN values

train_df_nonan = train_df.dropna(subset=['Characterize your pain:', 'What color is the rash?'])
test_df_nonan = test_df.dropna(subset=['Characterize your pain:', 'What color is the rash?'])

In [5]:
# Remove columns with only one value
train_df_nonan.drop(columns=['Have you lost consciousness associated with violent and sustained muscle contractions or had an absence episode?',
                             'Have you noticed a diffuse (widespread) redness in one or both eyes?',
                             'Have you had any vaginal discharge?'], inplace=True)
test_df_nonan.drop(columns=['Have you lost consciousness associated with violent and sustained muscle contractions or had an absence episode?',
                             'Have you noticed a diffuse (widespread) redness in one or both eyes?',
                             'Have you had any vaginal discharge?'], inplace=True)

In [6]:
# View values of categorical columns
unique_values = {col: train_df[col].unique() for col in train_df.columns if train_df[col].dtype == 'O'}

In [7]:
# Handling Categorical Variables  - Label Encoding to avoid creating new features that increase the already many featureset
from sklearn.preprocessing import LabelEncoder

categorical_columns = list(unique_values.keys())

# Reset indices of both X_train and X_test
train_df_reset = train_df_nonan.reset_index(drop=True)
test_df_reset = test_df_nonan.reset_index(drop=True)

# Combine the train and test data
combined = pd.concat([train_df_reset, test_df_reset], axis=0)

# Create a dictionary to store encodings
label_encoders = {}


# Label encode the combined dataframe for each categorical column
for col in categorical_columns:
  le = LabelEncoder()
  combined[col] = le.fit_transform(combined[col])
  label_encoders[col] = [le.classes_, le.transform(le.classes_)]

combined_encoded = combined

# Split the combined_encoded dataframe back into train and test
train_encoded = combined_encoded.iloc[:len(train_df_nonan)]
test_encoded = combined_encoded.iloc[len(train_df_nonan):]

pprint(f'{label_encoders}')
pprint(f'Train dataset shape: {train_encoded.shape}')
pprint(f'Test dataset shape: {test_encoded.shape}')

("{'SEX': [array(['F', 'M'], dtype=object), array([0, 1])], 'PATHOLOGY': "
 "[array(['Acute COPD exacerbation / infection', 'Acute dystonic reactions',\n"
 "       'Acute laryngitis', 'Acute otitis media', 'Acute pulmonary edema',\n"
 "       'Acute rhinosinusitis', 'Allergic sinusitis', 'Anaphylaxis',\n"
 "       'Anemia', 'Atrial fibrillation', 'Boerhaave', 'Bronchiectasis',\n"
 "       'Bronchiolitis', 'Bronchitis',\n"
 "       'Bronchospasm / acute asthma exacerbation', 'Chagas',\n"
 "       'Chronic rhinosinusitis', 'Cluster headache', 'Croup', 'Ebola',\n"
 "       'Epiglottitis', 'GERD', 'Guillain-Barré syndrome',\n"
 "       'HIV (initial infection)', 'Influenza', 'Inguinal hernia',\n"
 "       'Larygospasm', 'Localized edema', 'Myasthenia gravis',\n"
 "       'Myocarditis', 'PSVT', 'Pancreatic neoplasm', 'Panic attack',\n"
 "       'Pericarditis', 'Pneumonia', 'Possible NSTEMI / STEMI',\n"
 "       'Pulmonary embolism', 'Pulmonary neoplasm', 'SLE',\n"
 "       'Scombroid food p

# Extract 102 Most Importance Features

In [8]:
# Prepare training and test variables
X_columns = [col for col in train_encoded.columns if col not in ['Unnamed: 0', 'PATHOLOGY']]
X_train = train_encoded[X_columns]
y_train = train_encoded['PATHOLOGY']
X_test = test_encoded[X_columns]
y_test = test_encoded['PATHOLOGY']
pprint(f'X_train shape: {X_train.shape}')
pprint(f'y_train shape: {y_train.shape}')
pprint(f'X_test shape: {X_test.shape}')
pprint(f'y_test shape: {y_test.shape}')

'X_train shape: (982224, 216)'
'y_train shape: (982224,)'
'X_test shape: (128726, 216)'
'y_test shape: (128726,)'


In [9]:
selected_102_features = ['Have you had sexual intercourse with an HIV-positive partner in the past 12 '
 'months?',
 'Do you have an active cancer?',
 'Are you more irritable or has your mood been very unstable recently?',
 'Do you have painful mouth ulcers or sores?',
 'Are you currently being treated or have you recently been treated with an '
 'oral antibiotic for an ear infection?',
 'Have you been unable to move or get up for more than 3 consecutive days '
 'within the last 4 weeks?',
 'Do you have a known kidney problem resulting in an inability to retain '
 'proteins?',
 'Have you been in contact with or ate something that you have an allergy to?',
 'Have you had chills or shivers?',
 'Do you have a known severe food allergy?',
 'Are you consulting because you have high blood pressure?',
 'Did your cheeks suddenly turn red?',
 'Have you been unintentionally losing weight or have you lost your appetite?',
 'Are you taking any new oral anticoagulants ((NOACs)?',
 'Have you had unprotected sex with more than one partner in the last 6 '
 'months?',
 'Have you ever had surgery to remove lymph nodes?',
 'Have you noticed weakness in your facial muscles and/or eyes?',
 'Have you had 2 or more asthma attacks in the past year?',
 'Have you started or taken any antipsychotic medication within the last 7 '
 'days?',
 'Do you have family members who have had lung cancer?',
 'Do you currently take hormones?',
 'Do you feel like you are dying or were you afraid that you were about do '
 'die?',
 'Do you feel your abdomen is bloated or distended (swollen due to pressure '
 'from inside)?',
 'Do you have a burning sensation that starts in your stomach then goes up '
 'into your throat, and can be associated with a bitter taste in your mouth?',
 'Have any of your family members been diagnosed with cluster headaches?',
 'Do your lesions peel off?',
 'Do you have numbness, loss of sensation or tingling in the feet?',
 'Have you been treated in hospital recently for nausea, agitation, '
 'intoxication or aggressive behavior and received medication via an '
 'intravenous or intramuscular route?',
 'Do you consume energy drinks regularly?',
 'Have you been able to pass stools or gas since your symptoms increased?',
 'Do you feel your heart is beating very irregularly or in a disorganized '
 'pattern?',
 'Do you have any close family members who suffer from allergies (any type), '
 'hay fever or eczema?',
 'Have you had weakness or paralysis on one side of the face, which may still '
 'be present or completely resolved?',
 'Have you recently taken decongestants or other substances that may have '
 'stimulant effects?',
 'Is your nose or the back of your throat itchy?',
 'Do you have the perception of seeing two images of a single object seen '
 'overlapping or adjacent to each other (double vision)?',
 'Did you have your first menstrual period before the age of 12?',
 'Do you regularly drink coffee or tea?',
 'Were you born prematurely or did you suffer any complication at birth?',
 'Have you been diagnosed with hyperthyroidism?',
 'Do you have cystic fibrosis?',
 'Have you vomited several times or have you made several efforts to vomit?',
 'Have you had one or several flare ups of chronic obstructive pulmonary '
 'disease (COPD) in the past year?',
 'Do you have a problem with poor circulation?',
 'Have you ever had a spontaneous pneumothorax?',
 'Do you have severe itching in one or both eyes?',
 'Do you find that your symptoms have worsened over the last 2 weeks and that '
 'progressively less effort is required to cause the symptoms?',
 'Have you ever had a diagnosis of anemia?',
 'Do you suffer from chronic anxiety?',
 'Do you work in agriculture?',
 'Do you take medication that dilates your blood vessels?',
 'Do you have a known heart defect?',
 'Do you feel out of breath with minimal physical effort?',
 'Have you been diagnosed with chronic sinusitis?',
 'Do you have pain that improves when you lean forward?',
 'Do you have Rheumatoid Arthritis?',
 'Have you had diarrhea or an increase in stool frequency?',
 'Have you or any member of your family ever had croup?',
 'Have you noticed that you produce more saliva than usual?',
 'Have you ever had fluid in your lungs?',
 'Have you ever had a pericarditis?',
 'Did you vomit after coughing?',
 'Do you have pain that is increased with movement?',
 'Do you have a sore throat?',
 'Have you ever had a sexually transmitted infection?',
 'Do you have symptoms that get worse after eating?',
 'Do you have chronic kidney failure?',
 'Are you infected with the human immunodeficiency virus (HIV)?',
 'Do you have annoying muscle spasms in your face, neck or any other part of '
 'your body?',
 'Have you ever been diagnosed with obstructive sleep apnea (OSA)?',
 'Do you work in the mining sector?',
 'Have you been in contact with a person with similar symptoms in the past 2 '
 'weeks?',
 'Do you have liver cirrhosis?',
 'Are your symptoms more prominent at night?',
 'Are the symptoms or pain increased with coughing, with an effort like '
 'lifting a weight or from forcing a bowel movement?',
 'Do you have any lesions, redness or problems on your skin that you believe '
 'are related to the condition you are consulting for?',
 'Have you noticed that the tone of your voice has become deeper, softer or '
 'hoarse?',
 'Do you have difficulty articulating words/speaking?',
 'In the last month, have you been in contact with anyone infected with the '
 'Ebola virus?',
 'Do you take a calcium channel blockers (medication)?',
 'Is the lesion (or are the lesions) larger than 1cm?',
 'Are you currently using intravenous drugs?',
 'Are you immunosuppressed?',
 'Have any of your family members ever had a pneumothorax?',
 'Do you feel that your eyes produce excessive tears?',
 'Does the person have a whooping cough?',
 'Do you feel so tired that you are unable to do your usual activities or are '
 'you stuck in your bed all day long?',
 'What color is the rash?',
 'Do you have pain or weakness in your jaw?',
 'Have you ever been diagnosed with depression?',
 'Where is the swelling located?',
 'Have you been hospitalized for an asthma attack in the past year?',
 'Do you have polyps in your nose?',
 'Do you suffer from Crohn’s disease or ulcerative colitis (UC)?',
 'Do you work in construction?',
 'Do you have pain somewhere, related to your reason for consulting?',
 'Do you have chest pain even at rest?',
 'Do you currently undergo dialysis?',
 'Do you have a poor diet?',
 'Are your vaccinations up to date?',
 'Are you exposed to secondhand cigarette smoke on a daily basis?',
 'Do you have a decrease in appetite?']

print(f'{len(selected_102_features)}')

102


In [10]:
# Get top features from the dataset
X_train_selected = X_train[selected_102_features]
X_test_selected = X_test[selected_102_features]
pprint(f'X_train_selected shape: {X_train_selected.shape}')
pprint(f'X_test_selected shape: {X_test_selected.shape}')

'X_train_selected shape: (982224, 102)'
'X_test_selected shape: (128726, 102)'


In [14]:
# Train model -- took 5 minutes
cls = XGBClassifier(class_weight='balanced', random_state=43)
cls.fit(X_train_selected, y_train)

# Save trained model
!pip install joblib
from joblib import dump, load
dump(cls, 'xgboost_49.joblib')



['xgboost_49.joblib']

In [32]:
# Disease Labels
disease_labels = [
    'Acute COPD exacerbation / infection', 'Acute dystonic reactions',
    'Acute laryngitis', 'Acute otitis media', 'Acute pulmonary edema',
    'Acute rhinosinusitis', 'Allergic sinusitis', 'Anaphylaxis',
    'Anemia', 'Atrial fibrillation', 'Boerhaave', 'Bronchiectasis',
    'Bronchiolitis', 'Bronchitis',
    'Bronchospasm / acute asthma exacerbation', 'Chagas',
    'Chronic rhinosinusitis', 'Cluster headache', 'Croup', 'Ebola',
    'Epiglottitis', 'GERD', 'Guillain-Barré syndrome',
    'HIV (initial infection)', 'Influenza', 'Inguinal hernia',
    'Larygospasm', 'Localized edema', 'Myasthenia gravis',
    'Myocarditis', 'PSVT', 'Pancreatic neoplasm', 'Panic attack',
    'Pericarditis', 'Pneumonia', 'Possible NSTEMI / STEMI',
    'Pulmonary embolism', 'Pulmonary neoplasm', 'SLE',
    'Scombroid food poisoning', 'Spontaneous pneumothorax',
    'Spontaneous rib fracture', 'Stable angina', 'Tuberculosis',
    'URTI', 'Unstable angina', 'Viral pharyngitis', 'Whooping cough'
]

In [28]:
y_test_disease_labels = y_test.unique()
pprint(y_test_disease_labels)
print(len(y_test_disease_labels))

array([21, 13,  1,  2, 44, 25, 40,  8, 26, 16, 32, 36, 24,  3, 41,  9, 34,
       46, 27, 20, 23, 33, 31,  5, 17, 42,  6, 30, 35, 11,  0, 29, 43,  4,
       39, 14, 28, 38, 37,  7, 10, 22, 18, 45, 15, 19, 47, 12])
48


In [33]:
trained_model = load('xgboost_49.joblib')
y_pred = trained_model.predict(X_test_selected)

# Get classification report
print(classification_report(y_test, y_pred, target_names=disease_labels))

                                          precision    recall  f1-score   support

     Acute COPD exacerbation / infection       1.00      0.98      0.99      2153
                Acute dystonic reactions       1.00      0.97      0.98      3302
                        Acute laryngitis       0.97      0.86      0.91      3214
                      Acute otitis media       0.88      0.90      0.89      3509
                   Acute pulmonary edema       1.00      1.00      1.00      2598
                    Acute rhinosinusitis       1.00      0.45      0.63      1829
                      Allergic sinusitis       1.00      0.99      1.00      2411
                             Anaphylaxis       1.00      1.00      1.00      3798
                                  Anemia       1.00      1.00      1.00      6770
                     Atrial fibrillation       1.00      0.97      0.99      2831
                               Boerhaave       1.00      0.81      0.89      2083
               

In [34]:
# Get class probability predictions
y_pred_proba = trained_model.predict_proba(X_test_selected)

# Get the indices of the top 3 predictions for each row
top3_indices = np.argsort(-y_pred_proba, axis=1)[:, :3]