<a href="https://colab.research.google.com/github/faithNassiwa/predictive-diagnosis-assistant/blob/main/notebooks/TrainingXGBoostModel_Top10Diseases.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Import required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
from sklearn import preprocessing
from sklearn.metrics import accuracy_score, classification_report, auc, roc_auc_score, roc_curve, confusion_matrix
from pprint import pprint
from xgboost import XGBClassifier, plot_importance, DMatrix
from sklearn.utils import class_weight
from sklearn.model_selection import GridSearchCV, KFold, ParameterGrid
from sklearn.preprocessing import LabelBinarizer

In [6]:
# Mount google drive to access folder with data
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Load Data & Preprocessing

In [7]:
# Read the training and test dataset -- Took a minute

train_df = pd.read_csv('/content/drive/MyDrive/DS5500/Data/Dataset2/processed_train.csv', low_memory=False)
test_df = pd.read_csv('/content/drive/MyDrive/DS5500/Data/Dataset2/processed_test.csv', low_memory=False)
pprint(f'Train dataset shape: {train_df.shape}')
pprint(f'Test dataset shape: {test_df.shape}')

'Train dataset shape: (1025602, 221)'
'Test dataset shape: (134529, 221)'


In [8]:
# Drop rows with NAN values

train_df_nonan = train_df.dropna(subset=['Characterize your pain:', 'What color is the rash?'])
test_df_nonan = test_df.dropna(subset=['Characterize your pain:', 'What color is the rash?'])

In [9]:
# Remove columns with only one value
train_df_nonan.drop(columns=['Have you lost consciousness associated with violent and sustained muscle contractions or had an absence episode?',
                             'Have you noticed a diffuse (widespread) redness in one or both eyes?',
                             'Have you had any vaginal discharge?'], inplace=True)
test_df_nonan.drop(columns=['Have you lost consciousness associated with violent and sustained muscle contractions or had an absence episode?',
                             'Have you noticed a diffuse (widespread) redness in one or both eyes?',
                             'Have you had any vaginal discharge?'], inplace=True)

In [10]:
# View values of categorical columns
unique_values = {col: train_df[col].unique() for col in train_df.columns if train_df[col].dtype == 'O'}

In [11]:
# Handling Categorical Variables  - Label Encoding to avoid creating new features that increase the already many featureset
from sklearn.preprocessing import LabelEncoder

categorical_columns = list(unique_values.keys())

# Reset indices of both X_train and X_test
train_df_reset = train_df_nonan.reset_index(drop=True)
test_df_reset = test_df_nonan.reset_index(drop=True)

# Combine the train and test data
combined = pd.concat([train_df_reset, test_df_reset], axis=0)

# Create a dictionary to store encodings
label_encoders = {}


# Label encode the combined dataframe for each categorical column
for col in categorical_columns:
  le = LabelEncoder()
  combined[col] = le.fit_transform(combined[col])
  label_encoders[col] = [le.classes_, le.transform(le.classes_)]

combined_encoded = combined

# Split the combined_encoded dataframe back into train and test
train_encoded = combined_encoded.iloc[:len(train_df_nonan)]
test_encoded = combined_encoded.iloc[len(train_df_nonan):]

pprint(f'{label_encoders}')
pprint(f'Train dataset shape: {train_encoded.shape}')
pprint(f'Test dataset shape: {test_encoded.shape}')

("{'SEX': [array(['F', 'M'], dtype=object), array([0, 1])], 'PATHOLOGY': "
 "[array(['Acute COPD exacerbation / infection', 'Acute dystonic reactions',\n"
 "       'Acute laryngitis', 'Acute otitis media', 'Acute pulmonary edema',\n"
 "       'Acute rhinosinusitis', 'Allergic sinusitis', 'Anaphylaxis',\n"
 "       'Anemia', 'Atrial fibrillation', 'Boerhaave', 'Bronchiectasis',\n"
 "       'Bronchiolitis', 'Bronchitis',\n"
 "       'Bronchospasm / acute asthma exacerbation', 'Chagas',\n"
 "       'Chronic rhinosinusitis', 'Cluster headache', 'Croup', 'Ebola',\n"
 "       'Epiglottitis', 'GERD', 'Guillain-Barré syndrome',\n"
 "       'HIV (initial infection)', 'Influenza', 'Inguinal hernia',\n"
 "       'Larygospasm', 'Localized edema', 'Myasthenia gravis',\n"
 "       'Myocarditis', 'PSVT', 'Pancreatic neoplasm', 'Panic attack',\n"
 "       'Pericarditis', 'Pneumonia', 'Possible NSTEMI / STEMI',\n"
 "       'Pulmonary embolism', 'Pulmonary neoplasm', 'SLE',\n"
 "       'Scombroid food p

# Extract Top 10 Diseases and Top 20 Features

In [12]:
# Top 10 diseases
disease_labels_10 = list(train_df['PATHOLOGY'].value_counts().index[:10])
print(disease_labels_10)

['URTI', 'Viral pharyngitis', 'Anemia', 'HIV (initial infection)', 'Localized edema', 'Anaphylaxis', 'Pulmonary embolism', 'Influenza', 'Bronchitis', 'Allergic sinusitis']


In [20]:
# Get pathology distribution sorted
pathology_distribution = train_encoded['PATHOLOGY'].value_counts()
top_10_pathologies = pathology_distribution.index[:10]
disease_label_codes = list(top_10_pathologies)

In [21]:
disease_label_mappings = zip(disease_label_codes, disease_labels_10)
disease_label_mappings = dict(disease_label_mappings)
print(disease_label_mappings)

{44: 'URTI', 46: 'Viral pharyngitis', 8: 'Anemia', 23: 'HIV (initial infection)', 27: 'Localized edema', 7: 'Anaphylaxis', 36: 'Pulmonary embolism', 13: 'Influenza', 6: 'Bronchitis', 1: 'Allergic sinusitis'}


In [14]:
# Prepare training and test variables
top_10_pathologies_train_df = train_encoded[train_encoded['PATHOLOGY'].isin(top_10_pathologies)]
top_10_pathologies_test_df = test_encoded[test_encoded['PATHOLOGY'].isin(top_10_pathologies)]
X_columns = [col for col in top_10_pathologies_train_df.columns if col not in ['Unnamed: 0', 'PATHOLOGY']]
X_train = top_10_pathologies_train_df[X_columns]
y_train = top_10_pathologies_train_df['PATHOLOGY']
X_test = top_10_pathologies_test_df[X_columns]
y_test = top_10_pathologies_test_df['PATHOLOGY']
pprint(f'X_train shape: {X_train.shape}')
pprint(f'y_train shape: {y_train.shape}')
pprint(f'X_test shape: {X_test.shape}')
pprint(f'y_test shape: {y_test.shape}')

'X_train shape: (366303, 216)'
'y_train shape: (366303,)'
'X_test shape: (48222, 216)'
'y_test shape: (48222,)'


In [15]:
selected_20_features = ['Do you have swollen or painful lymph nodes?',
 'Have you had sexual intercourse with an HIV-positive partner in the past 12 '
 'months?',
 'Are you taking any new oral anticoagulants ((NOACs)?',
 'Have you had unprotected sex with more than one partner in the last 6 '
 'months?',
 'Is your nose or the back of your throat itchy?',
 'Are you immunosuppressed?',
 'Have you had surgery within the last month?',
 'Do you have a chronic obstructive pulmonary disease (COPD)?',
 'Do you regularly take stimulant drugs?',
 'Are you exposed to secondhand cigarette smoke on a daily basis?',
 'Do you have heart failure?',
 'What color is the rash?',
 'Have you ever had a diagnosis of anemia?',
 'Where is the swelling located?',
 'Have you ever had a sexually transmitted infection?',
 'Where is the affected region located?',
 'Do you have any lesions, redness or problems on your skin that you believe '
 'are related to the condition you are consulting for?',
 'Have you started or taken any antipsychotic medication within the last 7 '
 'days?',
 'Do you have pain somewhere, related to your reason for consulting?',
 'Do you have severe itching in one or both eyes?']

print(f'{len(selected_20_features)}')

20


In [16]:
# Get top features from the dataset
X_train_selected = X_train[selected_20_features]
X_test_selected = X_test[selected_20_features]
pprint(f'X_train_selected shape: {X_train_selected.shape}')
pprint(f'X_test_selected shape: {X_test_selected.shape}')

'X_train_selected shape: (366303, 20)'
'X_test_selected shape: (48222, 20)'


In [17]:
# XGBoost expects classess to start from 0, so create a new mapping to fit this.
unique_classes = np.unique(y_train)
class_mapping = {original: new for new, original in enumerate(unique_classes)}

y_train_mapped = y_train.map(class_mapping)
y_test_mapped = y_test.map(class_mapping)
y_train_mapped.unique()

array([8, 5, 9, 3, 1, 2, 4, 6, 7, 0])

In [33]:
class_mapping

{1: 0, 6: 1, 7: 2, 8: 3, 13: 4, 23: 5, 27: 6, 36: 7, 44: 8, 46: 9}

In [22]:
# Train model -- took 20s
cls = XGBClassifier(class_weight='balanced', random_state=43)
cls.fit(X_train_selected, y_train_mapped)

# Save trained model
!pip install joblib
from joblib import dump, load
dump(cls, 'xgboost_10.joblib')



['xgboost_10.joblib']

In [23]:
trained_model = load('xgboost_10.joblib')
y_pred = trained_model.predict(X_test_selected)

# Get classification report
print(classification_report(y_test_mapped, y_pred, target_names=disease_labels_10))

                         precision    recall  f1-score   support

                   URTI       0.97      1.00      0.99      3302
      Viral pharyngitis       1.00      0.96      0.98      2411
                 Anemia       1.00      1.00      1.00      3798
HIV (initial infection)       1.00      0.96      0.98      6770
        Localized edema       1.00      0.79      0.89      3577
            Anaphylaxis       1.00      1.00      1.00      3904
     Pulmonary embolism       1.00      1.00      1.00      3734
              Influenza       1.00      1.00      1.00      3679
             Bronchitis       1.00      0.63      0.77      8713
     Allergic sinusitis       0.66      1.00      0.80      8334

               accuracy                           0.91     48222
              macro avg       0.96      0.93      0.94     48222
           weighted avg       0.94      0.91      0.91     48222



In [40]:
# Get class probability predictions
y_pred_proba = trained_model.predict_proba(X_test_selected)

# Get the indices of the top 3 predictions for each row
top3_indices = np.argsort(-y_pred_proba, axis=1)[:, :3]