<a href="https://colab.research.google.com/github/faithNassiwa/predictive-diagnosis-assistant/blob/main/notebooks/Refinement_DDXPlus_GradientBoosting.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [101]:
# Import required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
from sklearn import preprocessing
from sklearn.metrics import accuracy_score, classification_report, auc, roc_auc_score, roc_curve, confusion_matrix
from pprint import pprint
from xgboost import XGBClassifier, plot_importance, DMatrix
from sklearn.utils import class_weight
from sklearn.model_selection import GridSearchCV, KFold, ParameterGrid
from sklearn.preprocessing import LabelBinarizer

In [None]:
# Mount google drive to access folder with data
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Data Load and Preprocessing

In [None]:
# Read the training and test dataset -- Took a minute

train_df = pd.read_csv('/content/drive/MyDrive/DS5500/Data/Dataset2/processed_train.csv', low_memory=False)
test_df = pd.read_csv('/content/drive/MyDrive/DS5500/Data/Dataset2/processed_test.csv', low_memory=False)
pprint(f'Train dataset shape: {train_df.shape}')
pprint(f'Test dataset shape: {test_df.shape}')

'Train dataset shape: (1025602, 221)'
'Test dataset shape: (134529, 221)'


In [None]:
# View values of categorical columns
unique_values = {col: train_df[col].unique() for col in train_df.columns if train_df[col].dtype == 'O'}
pprint(unique_values)


{'Characterize your pain:': array(['heavy', 'exhausting', 'sharp', 'burning', 'a cramp', '0',
       'heartbreaking', 'sickening', 'sensitive', 'scary',
       'a knife stroke', 'tugging', 'violent', 'a pulse', 'haunting',
       'tedious', nan], dtype=object),
 'Do you feel pain somewhere?': array(['temple(L)', 'posterior chest wall(L)', 'eye(R)', 'pharynx', '0',
       'temple(R)', 'epigastric', 'breast(L)', 'iliac fossa(L)', 'pubis',
       'occiput', 'testicle(L)', 'posterior chest wall(R)', 'shoulder(L)',
       'under the jaw', 'hypochondrium(R)', 'forehead', 'trachea',
       'sole(L)', 'finger (middle)(R)', 'shoulder(R)', 'breast(R)',
       'dorsal aspect of the foot(L)', 'belly', 'eye(L)', 'nose',
       'hip(L)', 'top of the head', 'triceps(R)', 'finger (middle)(L)',
       'ear(L)', 'ear(R)', 'nowhere', 'sole(R)', 'upper chest',
       'cheek(L)', 'hypochondrium(L)', 'knee(R)', 'back of head',
       'palace', 'testicle(R)', 'lower chest',
       'dorsal aspect of the foot(

In [None]:
# Check if there are any nans values
pprint(train_df.columns[train_df.isna().any()].tolist())
pprint(test_df.columns[test_df.isna().any()].tolist())

['Characterize your pain:', 'What color is the rash?']
['Characterize your pain:', 'What color is the rash?']


In [None]:
# Count of NAN in the columns -- drop these rows

print(train_df['Characterize your pain:'].isna().sum())
print(train_df['What color is the rash?'].isna().sum())
print(test_df['Characterize your pain:'].isna().sum())
print(test_df['What color is the rash?'].isna().sum())

10859
32609
1379
4436


In [None]:
# Drop rows with NAN values

train_df_nonan = train_df.dropna(subset=['Characterize your pain:', 'What color is the rash?'])
test_df_nonan = test_df.dropna(subset=['Characterize your pain:', 'What color is the rash?'])

In [None]:
# Count of NAN in the columns after drop

print(train_df_nonan['Characterize your pain:'].isna().sum())
print(train_df_nonan['What color is the rash?'].isna().sum())
print(test_df_nonan['Characterize your pain:'].isna().sum())
print(test_df_nonan['What color is the rash?'].isna().sum())

0
0
0
0


In [None]:
pprint(f'Train dataset shape: {train_df_nonan.shape}')
pprint(f'Test dataset shape: {test_df_nonan.shape}')

'Train dataset shape: (982224, 221)'
'Test dataset shape: (128726, 221)'


In [None]:
# Remove columns with only one value
train_df_nonan.drop(columns=['Have you lost consciousness associated with violent and sustained muscle contractions or had an absence episode?',
                             'Have you noticed a diffuse (widespread) redness in one or both eyes?',
                             'Have you had any vaginal discharge?'], inplace=True)
test_df_nonan.drop(columns=['Have you lost consciousness associated with violent and sustained muscle contractions or had an absence episode?',
                             'Have you noticed a diffuse (widespread) redness in one or both eyes?',
                             'Have you had any vaginal discharge?'], inplace=True)

In [None]:
pprint(f'Train dataset shape: {train_df_nonan.shape}')
pprint(f'Test dataset shape: {test_df_nonan.shape}')

'Train dataset shape: (982224, 218)'
'Test dataset shape: (128726, 218)'


# Feature Engineering

In [None]:
# Handling Categorical Variables  - Label Encoding to avoid creating new features that increase the already many featureset
from sklearn.preprocessing import LabelEncoder

categorical_columns = list(unique_values.keys())

# Reset indices of both X_train and X_test
train_df_reset = train_df_nonan.reset_index(drop=True)
test_df_reset = test_df_nonan.reset_index(drop=True)

# Combine the train and test data
combined = pd.concat([train_df_reset, test_df_reset], axis=0)

# Create a dictionary to store encodings
label_encoders = {}


# Label encode the combined dataframe for each categorical column
for col in categorical_columns:
  le = LabelEncoder()
  combined[col] = le.fit_transform(combined[col])
  label_encoders[col] = le

combined_encoded = combined

# Split the combined_encoded dataframe back into train and test
train_encoded = combined_encoded.iloc[:len(train_df_nonan)]
test_encoded = combined_encoded.iloc[len(train_df_nonan):]

In [None]:
# Prepare training and test variables
X_columns = [col for col in train_df_nonan.columns if col not in ['Unnamed: 0', 'PATHOLOGY']]
X_train = train_encoded[X_columns]
y_train = train_encoded['PATHOLOGY']
X_test = test_encoded[X_columns]
y_test = test_encoded['PATHOLOGY']
pprint(f'X_train shape: {X_train.shape}')
pprint(f'y_train shape: {y_train.shape}')
pprint(f'X_test shape: {X_test.shape}')
pprint(f'y_test shape: {y_test.shape}')


'X_train shape: (982224, 216)'
'y_train shape: (982224,)'
'X_test shape: (128726, 216)'
'y_test shape: (128726,)'


In [None]:
# Check dimension after encoding
pprint(f'X_train shape: {X_train.shape}')
pprint(f'X_test shape: {X_test.shape}')

'X_train shape: (982224, 216)'
'X_test shape: (128726, 216)'


# Feature Selection

In [None]:
# Correlation Analysis
correlation_matrix = X_train.corr()
correlation_matrix.style.background_gradient(cmap='cool')

Unnamed: 0,AGE,SEX,Do you live with 4 or more people?,Have you had significantly increased sweating?,"Do you have pain somewhere, related to your reason for consulting?",Characterize your pain:,Do you feel pain somewhere?,Does the pain radiate to another location?,Do you have a cough that produces colored or more abundant sputum than usual?,Do you smoke cigarettes?,Do you have a fever (either felt or measured with a thermometer)?,Do you have a sore throat?,Do you have a cough?,Have you traveled out of the country in the last 4 weeks?,Are you exposed to secondhand cigarette smoke on a daily basis?,Do you have swollen or painful lymph nodes?,Have you ever had a sexually transmitted infection?,Have you had diarrhea or an increase in stool frequency?,Have you had unprotected sex with more than one partner in the last 6 months?,"Do you have any lesions, redness or problems on your skin that you believe are related to the condition you are consulting for?",What color is the rash?,Do your lesions peel off?,Where is the affected region located?,Is the lesion (or are the lesions) larger than 1cm?,Are you feeling nauseous or do you feel like vomiting?,Have you had an involuntary weight loss over the last 3 months?,Have you had sexual intercourse with an HIV-positive partner in the past 12 months?,Do you drink alcohol excessively or do you have an addiction to alcohol?,Have you had chills or shivers?,Do you have heart failure?,Have you ever had pneumonia?,Do you have a chronic obstructive pulmonary disease (COPD)?,Do you have asthma or have you ever had to use a bronchodilator in the past?,Do you have diffuse (widespread) muscle pain?,"Have you noticed any new fatigue, generalized and vague discomfort, diffuse (widespread) muscle aches or a change in your general well-being related to your consultation today?",Do you have nasal congestion or a clear runny nose?,Do you attend or work in a daycare?,Have you lost your sense of smell?,Have you had a cold in the last 2 weeks?,Do you have polyps in your nose?,Do you have a deviated nasal septum?,Have you ever been diagnosed with gastroesophageal reflux?,Do you have greenish or yellowish nasal discharge?,Are you more likely to develop common allergies than the general population?,Have you been in contact with a person with similar symptoms in the past 2 weeks?,Are you immunosuppressed?,Do you have a poor diet?,Have you ever had a diagnosis of anemia?,Do you feel lightheaded and dizzy or do you feel like you are about to faint?,Do you feel so tired that you are unable to do your usual activities or are you stuck in your bed all day long?,Do you have chronic kidney failure?,Have you recently had stools that were black (like coal)?,Are you taking any new oral anticoagulants ((NOACs)?,Is your skin much paler than usual?,"Is your BMI less than 18.5, or are you underweight?",Have you been diagnosed with hyperthyroidism?,Do you have a known issue with one of your heart valves?,Do you have severe Chronic Obstructive Pulmonary Disease (COPD)?,Are you experiencing shortness of breath or difficulty breathing in a significant way?,Do you have diabetes?,Are you significantly overweight compared to people of the same height as you?,Do you feel slightly dizzy or lightheaded?,Do you have high blood pressure or do you take medications to treat high blood pressure?,Have you ever had a heart attack or do you have angina (chest pain)?,Do you have a known heart defect?,"Do you feel your heart is beating fast (racing), irregularly (missing a beat) or do you feel palpitations?",Do you have symptoms that are increased with physical exertion but alleviated with rest?,"Do you have any close family members who suffer from allergies (any type), hay fever or eczema?",Do you have any family members who have asthma?,Is your nose or the back of your throat itchy?,Do you have severe itching in one or both eyes?,Do you live in in a big city?,Have you noticed a high pitched sound when breathing in?,Have any of your family members been diagnosed with cluster headaches?,Do you take medication that dilates your blood vessels?,Do you have a known severe food allergy?,Have you been in contact with or ate something that you have an allergy to?,Do you have swelling in one or more areas of your body?,Where is the swelling located?,Did you lose consciousness?,Have you been coughing up blood?,Have you ever had a spontaneous pneumothorax?,Do you have pain that is increased when you breathe in deeply?,Do you feel that your eyes produce excessive tears?,"Did you previously, or do you currently, have any weakness/paralysis in one or more of your limbs or in your face?",Have you ever had fluid in your lungs?,Are your symptoms worse when lying down and alleviated while sitting up?,Are you infected with the human immunodeficiency virus (HIV)?,Do you take corticosteroids?,Are you currently using intravenous drugs?,Are there any members of your family who have been diagnosed myasthenia gravis?,Do you have the perception of seeing two images of a single object seen overlapping or adjacent to each other (double vision)?,Do you feel weakness in both arms and/or both legs?,Do your symptoms of muscle weakness increase with fatigue and/or stress?,Do you have a hard time opening/raising one or both eyelids?,Do you feel anxious?,Do you suffer from chronic anxiety?,Have you ever had a migraine or is a member of your family known to have migraines?,Do you suffer from fibromyalgia?,"Have you recently had numbness, loss of sensation or tingling, in both arms and legs and around your mouth?","Do you currently, or did you ever, have numbness, loss of sensitivity or tingling anywhere on your body?",Have you ever had a head trauma?,Do you feel like you are (or were) choking or suffocating?,Have you ever been diagnosed with depression?,Do you feel like you are dying or were you afraid that you were about do die?,Do you feel like you are detached from your own body or your surroundings?,Did your cheeks suddenly turn red?,Did you eat dark-fleshed fish (such as tuna) or Swiss cheese before the reaction occurred?,Do you have pain that improves when you lean forward?,Do you regularly take stimulant drugs?,"Do you have difficulty swallowing, or have a feeling of discomfort/blockage when swallowing?",Have you noticed that you produce more saliva than usual?,Are your vaccinations up to date?,Have you been able to pass stools or gas since your symptoms increased?,Were you born prematurely or did you suffer any complication at birth?,Do you have intense coughing fits?,"Are the symptoms or pain increased with coughing, with an effort like lifting a weight or from forcing a bowel movement?",Have you vomited several times or have you made several efforts to vomit?,Do you have chronic pancreatitis?,Do you constantly feel fatigued or do you have non-restful sleep?,Do you have pale stools and dark urine?,Are there members of your family who have been diagnosed with pancreatic cancer?,Have you ever had a pericarditis?,Are you consulting because you have high blood pressure?,Do you have painful mouth ulcers or sores?,"Have you noticed that the tone of your voice has become deeper, softer or hoarse?",Do you find that your symptoms have worsened over the last 2 weeks and that progressively less effort is required to cause the symptoms?,Do you have chest pain even at rest?,Do you have close family members who had a cardiovascular disease problem before the age of 50?,Do you have Rheumatoid Arthritis?,Do you suffer from Crohn’s disease or ulcerative colitis (UC)?,Do you have a problem with poor circulation?,Have you recently had a loss of appetite or do you get full more quickly then usually?,Are you a former smoker?,Have you been unintentionally losing weight or have you lost your appetite?,Do you think you are pregnant or are you currently pregnant?,Are you currently taking or have you recently taken anti-inflammatory drugs (NSAIDs)?,Have you ever had deep vein thrombosis (DVT)?,Have you ever had surgery to remove lymph nodes?,Do you have a known kidney problem resulting in an inability to retain proteins?,Do you work in agriculture?,Are you being treated for osteoporosis?,Do you have pain that is increased with movement?,Have any of your family members ever had a pneumothorax?,Have you noticed a wheezing sound when you exhale?,Have you gained weight recently?,Do you have liver cirrhosis?,Were you diagnosed with endocrine disease or a hormone dysfunction?,Do you have any family members who have been diagnosed with anemia?,Have you noticed light red blood or blood clots in your stool?,Do you have Parkinson’s disease?,Do you have a hiatal hernia?,"Do you have a burning sensation that starts in your stomach then goes up into your throat, and can be associated with a bitter taste in your mouth?",Do you have symptoms that get worse after eating?,Do you have pain or weakness in your jaw?,Do you have difficulty articulating words/speaking?,Have you been hospitalized for an asthma attack in the past year?,Have you been diagnosed with chronic sinusitis?,Have you had one or several flare ups of chronic obstructive pulmonary disease (COPD) in the past year?,Do you work in construction?,Have you had 2 or more asthma attacks in the past year?,Have you recently had a viral infection?,Have you noticed weakness in your facial muscles and/or eyes?,"Do you have numbness, loss of sensation or tingling in the feet?",Do you have high cholesterol or do you take medications to treat high cholesterol?,Do you have very abundant or very long menstruation periods?,Do you have an active cancer?,Have you been unable to move or get up for more than 3 consecutive days within the last 4 weeks?,Have you had surgery within the last month?,Are your symptoms more prominent at night?,Have you ever been diagnosed with obstructive sleep apnea (OSA)?,Have you recently thrown up blood or something resembling coffee beans?,Have you been in contact with someone who has had pertussis (whoooping cough)?,Did you vomit after coughing?,Do you feel your heart is beating very irregularly or in a disorganized pattern?,"Do you exercise regularly, 4 times per week or more?",Have you started or taken any antipsychotic medication within the last 7 days?,Have you ever felt like you were suffocating for a very short time associated with inability to breathe or speak?,"Have you been treated in hospital recently for nausea, agitation, intoxication or aggressive behavior and received medication via an intravenous or intramuscular route?",Are you unable to control the direction of your eyes?,Do you feel that muscle spasms or soreness in your neck are keeping you from turning your head to one side?,"Do you have annoying muscle spasms in your face, neck or any other part of your body?",Do you suddenly have difficulty or an inability to open your mouth or have jaw pain when opening it?,Do you feel your abdomen is bloated or distended (swollen due to pressure from inside)?,Do you currently undergo dialysis?,Do you have bouts of choking or shortness of breath that wake you up at night?,Do you have family members who have had lung cancer?,Do you have trouble keeping your tongue in your mouth?,Are you currently being treated or have you recently been treated with an oral antibiotic for an ear infection?,"Have you had weakness or paralysis on one side of the face, which may still be present or completely resolved?",Do you regularly drink coffee or tea?,Have you recently taken decongestants or other substances that may have stimulant effects?,Do you have metastatic cancer?,Are you more irritable or has your mood been very unstable recently?,Do you take a calcium channel blockers (medication)?,Do you have cystic fibrosis?,Do you currently take hormones?,Do you wheeze while inhaling or is your breathing noisy after coughing spells?,Do you feel out of breath with minimal physical effort?,Do any members of your immediate family have a psychiatric illness?,Have you ever had a stroke?,Do you consume energy drinks regularly?,Does the person have a whooping cough?,Have you or any member of your family ever had croup?,Do you work in the mining sector?,Did you have your first menstrual period before the age of 12?,Have you breastfed one of your children for more than 9 months?,Have you felt confused or disorientated lately?,"In the last month, have you been in contact with anyone infected with the Ebola virus?",Have you noticed any unusual bleeding or bruising related to your consultation today?,Do you live in the suburbs?,Do you ever temporarily stop breathing while you’re asleep?,Do you have a decrease in appetite?,Does your mother suffer from asthma?,Do you live in a rural area?,Are you of Asian descent?
AGE,1.0,0.007493,-0.018812,0.014051,0.002725,-0.00188,-0.028034,0.028659,0.017995,0.05104,-0.063499,-0.030164,0.007679,-0.011031,0.008693,-0.01401,-0.013482,-0.008984,-0.013129,-0.03111,-0.016145,-0.03137,-0.029553,-0.029187,0.025546,0.029864,-0.013694,0.008945,-0.023608,0.006841,0.00177,0.034931,-0.001368,-0.032175,0.01328,-0.031072,-0.015742,0.005452,-0.005527,0.005241,0.003723,0.05849,0.002338,-0.007565,-0.01361,-0.002334,-0.014817,-0.015329,-0.011161,-0.025295,-0.014552,-0.014388,-0.016005,-0.013219,-0.023221,-0.010891,-0.016314,0.045167,0.024942,0.030778,0.043421,-0.013617,0.038564,0.067495,-0.010017,0.006127,0.047744,-0.000404,-0.006997,-0.002037,-0.001721,-0.006679,-0.056099,0.014241,0.013613,-0.014163,-0.013297,-0.003952,0.009454,-0.01721,0.014755,0.004037,-6e-06,0.011524,-0.007681,0.04084,0.021668,0.021418,-0.015884,-0.016022,-0.0077,-0.008844,-0.01017,-0.007542,-0.012933,0.012184,0.011042,0.010466,0.01049,0.004588,0.002587,0.011788,0.009422,0.01091,0.012095,0.011479,-0.012568,-0.012859,-0.054734,-0.034417,-0.047254,-0.079435,-0.043931,-0.006719,-0.006597,2e-05,-0.008621,0.01587,0.026595,0.055032,0.02822,0.029411,0.011116,-0.008489,-0.009049,-0.048179,0.03737,0.032403,0.062846,0.016964,0.017183,0.038438,0.024849,0.071809,-0.001534,-0.012566,-0.012494,-0.017064,-0.014705,-0.013042,0.073558,0.009463,0.010658,0.003161,0.023486,-0.012594,-0.013203,-0.011718,-0.014722,-0.012912,-0.013823,-0.010436,-0.010773,-0.01139,-0.006325,-0.008965,-0.009347,-0.010336,0.082587,0.072626,-0.009897,0.004083,-0.006808,-0.006763,0.064098,-0.037127,-0.011325,-0.010688,-0.018725,-0.033662,-0.014335,0.004735,-0.000346,-0.002602,-0.011471,0.049064,-0.010696,-0.010998,-0.01093,-0.009641,-0.009778,-0.01031,-0.010452,-0.006223,0.039223,0.03801,0.061543,-0.010369,-0.010651,-0.005118,0.004321,0.005196,0.008739,-0.009775,-0.013186,0.019203,-0.007953,-0.001127,-0.005843,0.009517,-0.013659,0.005769,-0.07216,-0.075042,0.07775,-0.008472,-0.007888,-0.001304,-0.001815,-0.002223,-0.021015,-0.021026,-0.021516,-0.021258,-0.021305,0.006362
SEX,0.007493,1.0,-0.001279,-0.000157,-0.00696,-0.004633,-0.004583,-0.004715,0.005881,0.004893,-0.003071,-0.001702,0.00412,0.000424,-0.000651,-0.005251,-0.001207,0.001326,-0.001677,0.001004,0.00585,0.00072,-0.000273,0.00113,0.002546,0.003395,-0.001074,-0.0012,0.000374,-0.001477,-0.000824,0.008014,-0.002451,-0.000117,-0.002433,-0.004322,-0.000964,-0.001948,-0.001433,-0.000925,-0.001841,0.012044,-0.002458,-0.001353,-0.001303,0.000204,-0.001446,-0.001052,-0.001148,-0.007755,-0.002129,-0.002449,-0.00205,-0.00017,-0.001125,0.000358,0.000875,0.012867,0.0032,0.0048,0.001148,-0.012854,-0.001525,-0.000979,0.000296,-0.002822,-0.005024,0.000704,0.000104,0.000575,0.001203,-0.00022,-0.000943,-0.002722,-0.003204,-0.001049,-0.000333,-0.001739,-0.001844,-0.001136,-0.000477,-0.000145,-0.000371,-0.003984,-0.001759,-0.000951,-0.001354,0.00023,0.000445,-0.000792,0.000178,-0.000622,0.000236,-0.001662,0.000708,-0.000503,-0.000352,-0.000415,-0.001075,-0.001026,9.2e-05,-0.000395,-0.000283,-0.000408,-0.001599,-0.000959,-0.001177,-0.000755,2.3e-05,0.000337,-0.001561,-0.001423,-0.002112,-0.006921,-0.004348,-0.00597,-0.006485,-0.000201,0.008548,0.005099,0.007879,0.011427,-0.000608,0.000482,0.000332,-0.000759,6.7e-05,0.001184,-0.002033,-0.000654,0.000479,-0.000109,0.000565,0.000423,-0.005726,-0.16117,0.000163,-0.00113,-0.000412,-0.00099,0.017161,0.000391,0.000172,0.000318,0.007866,-0.000268,-0.000851,0.000248,-0.001382,-0.001109,0.000178,-0.001585,-0.001537,-0.000332,-0.000944,-0.001338,-0.001813,-0.001008,0.019559,0.016887,-0.001957,0.000242,0.001407,0.00076,-0.002342,-0.103305,-0.000801,-0.001318,-0.000495,-0.000502,-0.000755,-0.001418,0.000277,0.000697,0.001594,-0.001305,0.000775,-0.000559,-3.2e-05,0.000313,0.000122,-0.000323,8.3e-05,-0.005543,-0.000791,-0.001501,0.001593,-0.000518,9.1e-05,0.001202,0.001918,0.002068,1e-06,-9.9e-05,-0.000784,-0.000114,-0.088791,0.000988,-0.001617,-0.000428,0.000605,0.001993,-0.001019,-0.001915,0.01809,-0.061889,-0.03386,0.00132,0.002265,0.002029,0.001667,-0.00052,-8.2e-05,0.000241,0.00016,-0.000992
Do you live with 4 or more people?,-0.018812,-0.001279,1.0,0.169009,0.183009,0.086222,0.378524,0.069768,0.101931,0.228624,0.310888,0.304805,0.228497,-0.037964,0.290339,-0.056181,-0.049425,-0.083355,-0.04937,-0.151942,-0.138011,-0.143787,-0.124865,-0.148617,-0.116407,-0.075069,-0.049456,-0.124158,-0.062295,-0.083153,-0.083214,-0.094713,-0.13588,0.207015,-0.070451,0.367158,0.620322,-0.056955,0.051459,-0.060796,-0.052571,-0.073206,-0.054054,-0.099927,0.602602,0.231585,-0.067089,-0.068368,-0.104711,-0.091908,-0.067185,-0.067432,-0.069345,-0.059845,-0.089354,-0.041853,-0.063349,-0.057037,-0.265766,-0.104837,-0.106532,-0.080859,-0.101051,-0.098266,-0.040457,-0.096174,-0.092361,-0.046141,-0.056253,-0.050489,-0.047345,-0.057484,-0.07201,-0.044984,-0.042725,-0.051236,-0.050145,-0.112604,-0.106677,-0.064626,0.131861,-0.03159,-0.088557,-0.038326,-0.053721,-0.042893,-0.081266,-0.053382,-0.056011,-0.056094,-0.037527,-0.040261,-0.058597,-0.032695,-0.05887,-0.05449,-0.046988,-0.03979,-0.044086,-0.065577,-0.056541,-0.045752,-0.040753,-0.046413,-0.047509,-0.0458,-0.046278,-0.046102,-0.034892,-0.066904,-0.056213,-0.040944,-0.082675,-0.033495,-0.035122,-0.048797,-0.035333,-0.037584,-0.031311,-0.103463,-0.032127,-0.033795,-0.05553,-0.027903,-0.029546,0.16594,-0.04158,-0.04821,-0.071073,-0.038585,-0.038397,-0.042674,-0.079411,-0.051386,-0.024465,-0.055651,-0.04296,-0.06358,-0.049609,-0.046656,-0.034633,-0.022117,-0.025668,-0.027961,-0.094668,-0.044241,-0.046679,-0.042168,-0.065368,-0.059775,-0.043857,-0.045504,-0.047646,-0.047527,-0.037862,-0.040214,-0.040818,-0.041457,-0.038673,-0.033967,-0.042446,-0.06997,-0.042072,-0.0404,-0.071863,-0.035671,-0.046668,-0.044312,-0.068096,-0.060928,-0.044247,-0.047014,-0.021538,-0.026189,-0.043341,-0.054437,-0.046438,-0.042517,-0.046371,-0.042494,-0.044835,-0.045015,-0.044834,-0.035998,-0.040773,-0.040579,-0.030747,-0.044863,-0.040579,-0.038897,-0.027928,-0.02923,-0.021094,-0.040619,-0.044218,-0.04186,-0.030659,-0.020111,-0.027737,-0.040236,-0.041541,-0.029307,-0.015479,-0.016073,-0.036084,-0.02137,-0.011692,-0.007664,-0.008666,-0.008653,0.015413,0.01931,0.018663,0.015997,0.018112,-0.015698
Have you had significantly increased sweating?,0.014051,-0.000157,0.169009,1.0,0.210538,0.174631,0.1367,0.188184,0.044624,0.121926,0.215427,0.4046,0.00793,-0.025691,0.20773,0.186455,0.216131,0.043698,0.214453,0.100306,0.030416,0.060853,0.110884,0.12004,0.200444,0.085934,0.216499,0.005759,0.10379,0.037078,-0.095125,-0.108269,-0.071438,0.372465,0.041622,0.134674,0.130962,-0.065107,-0.096784,-0.069498,-0.060096,-0.083684,-0.061791,-0.114229,0.160219,0.01195,-0.076691,-0.078153,-0.006291,0.177509,-0.076801,-0.077084,-0.07927,-0.068411,-0.102143,-0.047843,-0.072417,-0.0652,-0.080894,0.083084,0.06218,-0.092432,0.167655,0.204131,-0.046247,0.01614,0.090086,-0.052745,-0.064305,-0.057715,-0.054121,-0.065711,-0.082317,-0.051423,-0.04884,-0.058569,-0.057323,-0.016273,0.036411,-0.073876,-0.128429,-0.036112,-0.101233,-0.043811,-0.06141,0.174006,0.020586,0.117824,-0.064027,0.136892,-0.042899,-0.046024,-0.066984,-0.037375,-0.067296,0.133566,0.199355,0.168831,0.187722,0.122477,0.090577,0.194239,0.173139,0.197282,0.202501,0.194161,-0.052902,-0.052701,-0.039886,-0.07648,-0.064259,-0.046805,-0.095626,-0.038289,-0.040149,-0.055782,-0.04039,-0.042963,-0.035792,0.02912,-0.036726,-0.038632,-0.063478,-0.031897,-0.033775,-0.072579,0.143941,0.103506,0.169856,-0.044108,-0.043892,0.191894,0.157752,0.104872,-0.027967,-0.063617,-0.049108,-0.07268,-0.05671,-0.053334,-0.03959,-0.025283,-0.029342,-0.031964,-0.109736,-0.050573,-0.05336,-0.048204,-0.074724,-0.06833,-0.050135,-0.052017,-0.054466,-0.054329,-0.043282,-0.04597,-0.046661,-0.047391,-0.044208,-0.038829,-0.048521,-0.079985,-0.048094,-0.046182,0.187283,-0.040776,-0.053347,-0.050655,-0.077843,-0.069649,-0.05058,-0.053743,-0.02462,-0.029938,-0.049545,0.078538,-0.053084,-0.048603,-0.053008,-0.048576,-0.051253,-0.051457,-0.051251,-0.04115,0.166395,0.167451,-0.035148,-0.051285,-0.046388,-0.044464,-0.031925,-0.033413,-0.024113,-0.046433,-0.050547,-0.047851,-0.035047,-0.02299,-0.031707,0.171675,-0.047487,-0.033502,-0.017695,-0.018373,-0.041249,-0.024429,-0.013365,-0.008761,-0.009907,-0.009892,-0.004655,-0.004655,-0.004766,-0.004703,-0.004719,-0.017945
"Do you have pain somewhere, related to your reason for consulting?",0.002725,-0.00696,0.183009,0.210538,1.0,0.701297,0.67107,0.859261,-0.05167,0.235178,0.212936,0.172451,-0.02047,0.038059,-0.022542,0.091509,0.080505,0.030923,0.080416,0.128129,0.113212,0.136094,0.120805,0.115773,0.10872,0.015288,0.080556,0.130321,0.101469,0.135443,0.011555,-0.09565,-0.192288,0.170846,0.114753,0.125913,0.182452,0.092771,-0.080308,0.099027,0.08563,-0.124341,0.088046,-0.132945,0.171046,0.011649,0.109277,0.11136,0.088651,0.149703,0.109434,0.109836,0.112952,0.097479,0.045809,-0.227243,-0.11093,-0.309684,-0.190483,-0.027943,0.03725,-0.022974,0.028371,0.024276,-0.219662,-0.074772,0.035969,-0.250525,-0.30543,-0.274131,-0.257062,-0.312111,-0.036775,0.073272,0.069592,0.083455,0.081679,0.183415,0.17376,0.105266,-0.006765,0.051456,0.144246,0.062427,-0.285909,0.069866,0.132369,-0.067667,-0.039615,-0.047454,-0.203757,-0.218602,-0.318158,-0.177522,-0.319639,0.088755,0.076536,0.064811,0.071809,-0.097179,-0.103023,0.074523,0.06638,0.0756,0.077385,0.074602,-0.251269,-0.250314,0.056834,-0.12353,-0.118898,0.032958,0.061662,0.054558,0.057209,0.014143,0.057551,0.061218,0.051001,0.16543,0.05233,0.055047,0.090449,0.04545,0.048125,0.103417,0.067727,0.078526,0.115766,-0.209502,-0.208477,0.069508,0.129347,0.0837,0.03985,0.090647,0.069974,0.103561,0.080805,0.075995,-0.188044,0.036026,0.041809,0.045545,-0.239646,0.072062,0.076033,0.068685,0.106473,0.097364,0.071436,0.074118,0.077608,0.077413,-0.205576,-0.218347,-0.221625,-0.225092,-0.209976,-0.184425,-0.230463,-0.077605,-0.228433,-0.219352,0.117053,0.058102,0.076014,0.072178,0.110918,0.074146,0.072071,0.076578,-0.11694,-0.142195,-0.235324,0.08867,-0.252137,-0.23085,-0.251773,-0.230721,-0.243436,-0.244409,-0.243429,0.058635,0.066413,0.066098,0.050082,-0.243588,0.066098,-0.211192,0.04549,0.04761,0.034359,0.066162,0.072025,-0.227281,0.049939,-0.109197,0.04518,0.065539,0.067664,0.047737,-0.084046,-0.087269,-0.195922,0.034808,0.019044,-0.041614,-0.047055,-0.046983,-0.022108,-0.022108,-0.022639,-0.022337,-0.022413,-0.085234
Characterize your pain:,-0.00188,-0.004633,0.086222,0.174631,0.701297,1.0,0.387688,0.64043,-0.049835,0.114603,0.090294,0.123163,-0.053066,0.006867,0.022842,-0.017185,-0.019421,-0.071949,-0.018962,-0.01487,-0.024073,-0.00272,0.00064,-0.021851,0.050096,-0.107025,-0.019801,0.166142,0.018248,0.16838,0.024159,-0.099354,-0.0908,0.105046,0.058685,0.108775,0.067862,0.120596,-0.06153,0.12894,0.111429,-0.049524,0.114576,-0.073455,0.077041,-0.101329,-0.036509,-0.036883,-0.023932,-0.032271,-0.035763,0.006648,-0.036697,-0.032728,-0.071273,-0.159365,-0.016007,-0.21718,-0.152879,-0.047896,0.009455,-0.103333,0.045914,0.027744,-0.154048,-0.068682,0.008288,-0.175692,-0.214197,-0.192247,-0.180277,-0.218883,-0.024159,0.168592,0.159974,0.027911,0.027043,0.218314,0.200333,0.080545,-0.091132,0.123574,0.178742,0.143287,-0.198355,0.105317,0.146525,-0.007167,0.041897,-0.089806,-0.142894,-0.153305,-0.223123,-0.124495,-0.224162,0.053204,0.060601,0.051913,0.057337,-0.063021,-0.067312,0.058876,0.053297,0.059777,0.062376,0.059158,-0.176214,-0.175544,0.069359,-0.083794,-0.060417,0.054108,0.067512,0.04483,0.047008,0.041783,0.047289,0.146863,-0.055649,0.018581,-0.057454,-0.060232,0.121705,0.087171,0.092377,0.034911,0.003797,0.05778,0.058188,-0.146923,-0.146205,0.103238,0.054795,0.050823,0.003012,0.026701,0.129018,0.152466,0.149052,0.140224,-0.131874,0.075394,0.087608,0.109058,-0.207153,0.132743,0.140056,0.126558,-0.034554,-0.031697,0.02291,0.111474,0.117566,0.116358,-0.14417,-0.153126,-0.155425,-0.157857,-0.147255,-0.129337,-0.161623,-0.066735,-0.1602,-0.153831,0.070219,-0.019523,0.088787,0.08386,0.098369,-0.006492,0.1329,0.154882,-0.082009,-0.099721,-0.165032,0.007099,-0.176823,-0.161895,-0.176568,-0.161804,-0.170721,-0.171403,-0.170716,0.048179,0.100336,0.099511,-0.025185,-0.170827,0.125981,-0.148108,0.00969,0.008786,0.071498,0.12624,0.132727,-0.159392,0.059141,-0.076579,-0.035246,0.050744,0.020394,0.009527,-0.058941,-0.061202,-0.1374,0.066797,0.036683,-0.029184,-0.033,-0.032949,-0.015504,-0.015504,-0.015877,-0.015665,-0.015718,-0.059774
Do you feel pain somewhere?,-0.028034,-0.004583,0.378524,0.1367,0.67107,0.387688,1.0,0.457875,0.00838,0.169198,0.32149,0.226789,0.06447,0.06081,0.067048,0.14201,0.138395,-0.01656,0.138676,0.092504,0.024259,0.102872,0.105761,0.080611,-0.077669,0.040582,0.138773,-0.038611,0.083874,0.11239,-0.018474,-0.089346,-0.231417,0.223113,0.062054,0.209307,0.385476,0.006725,0.00572,0.005462,0.004612,-0.12221,0.002281,-0.176053,0.339981,0.150183,0.188955,0.192396,0.039515,0.220324,0.189187,0.133107,0.195167,0.168127,0.130324,-0.152496,-0.063126,-0.20782,-0.25294,-0.14956,-0.143683,0.075398,-0.078529,-0.114574,-0.147409,-0.158707,-0.110952,-0.16812,-0.204965,-0.183961,-0.172506,-0.209448,-0.058916,0.085297,0.081564,-0.05868,-0.057916,0.072344,0.08252,0.009693,0.11839,-0.07231,0.007938,0.07257,-0.191383,0.050517,-0.060814,-0.118909,-0.013621,0.030594,-0.136735,-0.146697,-0.213506,-0.119129,-0.2145,-0.013171,-0.065154,-0.055684,-0.0612,-0.155781,-0.140422,-0.063075,-0.055838,-0.063861,-0.065905,-0.062894,-0.168619,-0.167978,0.081684,-0.045022,-0.047055,0.066701,-0.00471,0.091744,0.096937,0.058714,0.097064,-0.049573,-0.045401,0.074654,-0.046655,-0.04882,-0.037748,0.056786,0.059982,0.250956,-0.047653,-0.072242,-0.080816,-0.140591,-0.139903,-0.053008,0.056372,0.002352,0.033877,0.08042,0.062291,0.087519,0.071393,0.067575,-0.12619,0.029765,0.034679,-0.064399,-0.213299,0.064805,0.067069,0.061029,0.183661,0.168956,0.06292,-0.054729,-0.05717,-0.057342,-0.137956,-0.146526,-0.148726,-0.151053,-0.140908,-0.123762,-0.154657,-0.167873,-0.153295,-0.1472,-0.082515,0.100293,0.062117,0.058854,0.092718,0.051801,0.0635,-0.059689,-0.078475,-0.095423,-0.157919,-0.058938,-0.169202,-0.154917,-0.168957,-0.15483,-0.163363,-0.164016,-0.163358,0.098712,0.04789,0.047355,0.051512,-0.163464,-0.056326,-0.141725,0.056927,0.059153,0.028333,-0.056396,0.06387,-0.152522,0.040792,-0.073279,-0.040877,-0.055559,0.058939,0.059422,-0.056401,-0.058564,-0.131478,0.043102,0.023667,-0.027926,-0.031577,-0.031529,-0.014836,-0.014836,-0.015192,-0.01499,-0.015041,-0.057198
Does the pain radiate to another location?,0.028659,-0.004715,0.069768,0.188184,0.859261,0.64043,0.457875,1.0,-0.105174,0.300476,0.06875,0.064726,0.006223,0.012329,-0.071643,0.035099,0.030878,0.004888,0.030844,0.039802,0.066786,0.04452,0.036806,0.034519,0.131674,0.003577,0.030898,0.209895,0.03892,0.129392,0.082317,-0.12814,-0.029403,0.064414,0.115561,0.036283,0.069981,0.230247,-0.005887,0.24674,0.212628,-0.00431,0.220114,-0.039792,0.063809,-0.043849,0.041914,0.042713,0.004031,0.05742,0.041974,0.120644,0.043324,0.037389,-0.018924,-0.195261,-0.120899,-0.266099,-0.166656,0.10196,0.243141,-0.065413,0.163233,0.191403,-0.188747,-0.100885,0.146387,-0.215266,-0.262445,-0.23555,-0.220883,-0.268185,-0.070483,0.028104,0.026693,0.03201,0.031329,0.147762,0.165802,0.054164,-0.063937,0.019736,0.087575,0.023944,-0.246304,0.157742,0.274048,0.007527,-0.063075,-0.069,-0.175081,-0.187837,-0.273381,-0.152538,-0.274653,0.034043,0.029356,0.024859,0.027543,-0.11192,-0.110915,0.028584,0.025461,0.028997,0.029682,0.028614,-0.215906,-0.215085,0.021799,-0.132461,-0.122618,0.000297,0.032325,0.020926,0.021943,-0.012249,0.022074,0.085673,0.108969,0.204382,0.112916,0.118297,0.065316,0.017433,0.018459,0.039667,0.146333,0.128621,0.250701,-0.180017,-0.179137,0.148471,0.109008,0.114262,0.015285,0.14199,0.026839,0.053885,0.030994,0.029149,-0.161579,0.025697,0.029856,0.017469,-0.236828,0.02764,0.029163,0.026345,0.040839,0.037345,0.0274,0.282017,0.296223,0.294603,-0.176644,-0.187617,-0.190434,-0.193413,-0.180424,-0.158469,-0.198028,-0.077819,-0.196284,-0.188481,0.252563,0.022286,0.048441,0.045837,0.06153,0.019256,0.027643,0.186126,-0.100482,-0.122183,-0.202205,0.193035,-0.216652,-0.198361,-0.216339,-0.19825,-0.209175,-0.210011,-0.209169,0.02249,0.149593,0.148772,0.019209,-0.209306,0.025352,-0.181469,0.017448,0.018261,0.024506,0.025377,0.027626,-0.195294,0.031702,-0.093828,0.017329,0.025138,0.025953,0.01831,-0.072217,-0.074987,-0.168349,0.013351,0.007304,-0.035757,-0.040433,-0.04037,-0.018997,-0.018997,-0.019453,-0.019194,-0.019259,-0.073238
Do you have a cough that produces colored or more abundant sputum than usual?,0.017995,0.005881,0.101931,0.044624,-0.05167,-0.049835,0.00838,-0.105174,1.0,0.178932,0.188133,0.253456,0.304037,-0.036534,0.133936,-0.055391,-0.04873,-0.082183,-0.048676,0.045337,0.046363,0.179044,0.074299,-0.043283,-0.114771,-0.074014,-0.048761,0.017318,0.216714,0.147103,0.319688,0.597909,0.087703,0.249394,0.151717,0.171876,0.07354,-0.056155,-0.083476,-0.059942,-0.051833,0.110334,-0.053295,-0.098522,0.096424,0.071494,-0.066146,-0.067407,-0.103239,-0.090616,-0.066241,-0.066484,-0.068371,-0.059004,0.094391,-0.041265,-0.062459,0.118385,0.061775,-0.103363,-0.105035,-0.079722,-0.099631,-0.096885,-0.039888,-0.094822,-0.091063,-0.045492,-0.055463,-0.049779,-0.046679,-0.056676,-0.070998,-0.044352,-0.042125,-0.050516,-0.049441,-0.111022,-0.105178,-0.063718,0.122723,-0.031146,0.069034,-0.037787,-0.052966,-0.04229,-0.080124,-0.052631,-0.055223,-0.055306,-0.037,-0.039696,-0.057774,-0.032236,-0.058043,-0.053724,-0.046328,-0.03923,-0.043467,-0.064655,-0.055747,-0.045109,-0.04018,-0.045761,-0.046841,-0.045157,-0.045628,-0.045454,-0.034402,-0.065964,-0.055423,-0.040369,0.177741,-0.033024,-0.034629,-0.048112,-0.034836,-0.037056,-0.030871,-0.102009,-0.031676,-0.03332,-0.05475,-0.027511,-0.029131,-0.062599,-0.040995,-0.047532,-0.070074,0.28872,0.286782,-0.042074,0.13971,-0.050664,-0.024121,-0.054869,-0.042356,-0.062686,-0.048912,-0.046,0.21521,-0.021806,-0.025307,-0.027569,0.193709,-0.043619,-0.046023,-0.041575,-0.064449,-0.058935,0.329293,-0.044864,-0.046977,-0.046859,-0.03733,-0.039649,-0.040245,-0.040874,0.239721,0.212375,-0.041849,-0.068986,-0.041481,-0.039832,-0.070853,-0.03517,-0.046012,-0.04369,0.120196,0.405453,-0.043625,-0.046353,-0.021235,-0.025821,-0.042732,-0.053672,-0.045785,-0.04192,-0.045719,-0.041896,-0.044205,-0.044382,-0.044204,-0.035492,-0.0402,-0.040009,-0.030315,-0.044233,-0.040009,-0.03835,-0.027536,-0.028819,-0.020797,-0.040048,-0.043597,0.312478,-0.030228,-0.019829,-0.027347,-0.039671,0.313351,-0.028896,-0.015262,-0.015847,0.224272,-0.02107,-0.011527,-0.007557,-0.008545,-0.008532,-0.004015,-0.004015,-0.004111,-0.004056,-0.00407,0.11886
Do you smoke cigarettes?,0.05104,0.004893,0.228624,0.121926,0.235178,0.114603,0.169198,0.300476,0.178932,1.0,0.157199,0.15058,0.309672,-0.072925,0.142698,-0.104934,-0.092315,-0.107622,-0.092212,0.017054,0.094183,0.058996,0.04064,-0.010904,-0.054947,0.013952,-0.092374,0.029335,0.16793,-0.040232,0.139463,0.173891,0.043649,0.152365,-0.02154,0.192426,0.227555,0.159868,0.105925,0.171408,0.148129,0.211828,0.151507,-0.023642,0.216809,0.129785,-0.125307,-0.127696,-0.195578,-0.051248,-0.125487,-0.063921,-0.129522,-0.111779,-0.075648,-0.078172,-0.118323,0.024535,-0.135831,0.106997,0.168873,-0.151027,0.056517,0.099386,-0.075564,-0.179632,0.054602,-0.086181,-0.105069,-0.094302,-0.08843,-0.107367,-0.069344,-0.084021,-0.079801,-0.095697,-0.093661,-0.209676,-0.199251,-0.120708,0.082737,0.1071,-0.019699,-0.071585,-0.100339,-0.080115,-0.040951,0.055017,-0.104616,-0.104772,-0.070093,-0.0752,-0.109447,-0.061068,-0.109957,-0.101775,-0.087764,-0.074319,-0.082343,-0.122484,-0.105607,-0.085455,-0.076118,-0.08669,-0.088737,-0.085545,-0.086437,-0.086109,-0.065171,-0.124962,-0.104994,-0.076475,0.017643,-0.062562,-0.065601,-0.091143,-0.065994,-0.070199,0.078837,0.129199,0.080903,0.086047,-0.038389,0.069309,0.071975,0.035302,0.116829,0.144055,0.2061,-0.072069,-0.071717,0.129553,0.240369,0.185986,-0.045696,-0.019526,-0.080239,-0.118754,-0.092659,-0.087144,0.120914,-0.04131,-0.047942,0.095171,0.002261,-0.082633,-0.087187,-0.078761,-0.122093,-0.111647,0.105325,0.115472,0.121128,0.121386,-0.070719,-0.075112,-0.07624,-0.077433,0.134741,0.118264,-0.07928,-0.130689,-0.078582,-0.075458,0.210244,-0.066625,-0.087165,-0.082766,-0.03366,0.117793,-0.082643,0.000755,-0.040228,-0.048916,-0.080952,0.154119,-0.086736,-0.079413,-0.086611,-0.079369,-0.083743,-0.084078,-0.08374,-0.067236,-0.076156,-0.075794,0.132174,-0.083795,-0.075794,-0.072651,-0.052164,-0.054595,-0.039399,-0.075867,-0.08259,-0.078185,-0.057265,-0.037564,-0.051807,-0.075153,0.098908,-0.05474,-0.028912,-0.030021,0.126049,0.051188,0.028087,-0.014315,-0.016187,-0.016162,-0.007605,-0.007605,-0.007788,-0.007684,-0.00771,-0.029321


In [None]:
## Export correlation matrix
correlation_matrix.to_csv('/content/drive/MyDrive/DS5500/Data/Dataset2/correlation_matrix.csv')

In [51]:
# Get pair where value is greater 0.6 coefficient, highly and positive correlation
correlated_features = set()
for i in range(len(correlation_matrix.columns)):
    for j in range(i):
        if abs(correlation_matrix.iloc[i, j]) > 0.6:
            print(f'Feature: {correlation_matrix.columns[i]} and {correlation_matrix.columns[j]} have correlation of {abs(correlation_matrix.iloc[i, j])}')
            correlated_features.add(correlation_matrix.columns[i])
            correlated_features.add(correlation_matrix.columns[j])

Feature: Characterize your pain: and Do you have pain somewhere, related to your reason for consulting? have correlation of 0.7012969715899394
Feature: Do you feel pain somewhere? and Do you have pain somewhere, related to your reason for consulting? have correlation of 0.6710697244517296
Feature: Does the pain radiate to another location? and Do you have pain somewhere, related to your reason for consulting? have correlation of 0.8592614941708604
Feature: Does the pain radiate to another location? and Characterize your pain: have correlation of 0.6404300039414924
Feature: Have you ever had a sexually transmitted infection? and Do you have swollen or painful lymph nodes? have correlation of 0.6555725336207763
Feature: Have you had unprotected sex with more than one partner in the last 6 months? and Do you have swollen or painful lymph nodes? have correlation of 0.6545784189561947
Feature: Have you had unprotected sex with more than one partner in the last 6 months? and Have you ever ha

In [52]:
print(f'Number of correlated features: {len(correlated_features)}')
pprint(f'Correlated features: {correlated_features}')

Number of correlated features: 125
("Correlated features: {'Have any of your family members ever had a "
 "pneumothorax?', 'Have you ever had surgery to remove lymph nodes?', 'Have "
 'you ever had a migraine or is a member of your family known to have '
 "migraines?', 'Do you have swelling in one or more areas of your body?', 'Do "
 "you work in construction?', 'Do you have trouble keeping your tongue in your "
 "mouth?', 'Do you suffer from chronic anxiety?', 'Do you have any family "
 "members who have been diagnosed with anemia?', 'Have you started or taken "
 "any antipsychotic medication within the last 7 days?', 'Are there any "
 "members of your family who have been diagnosed myasthenia gravis?', 'Do any "
 "members of your immediate family have a psychiatric illness?', 'Have you "
 "been in contact with a person with similar symptoms in the past 2 weeks?', "
 "'Do you have an active cancer?', 'In the last month, have you been in "
 "contact with anyone infected with the Ebola 

In [None]:
# Get pair where value is less than -0.6 coefficient, highly and negative correlation
for i in range(len(correlation_matrix.columns)):
    for j in range(i):
        if abs(correlation_matrix.iloc[i, j]) < -0.6:
            print(f'Feature: {correlation_matrix.columns[i]} and {correlation_matrix.columns[j]} have correlation of {abs(correlation_matrix.iloc[i, j])}')

Correlation Matrix Findings: Using a threashold of +0.5 and -0.5


1.   Age is not highly correlated to any other feature
2.   Sex is not highly correlated to any other feature
3. 'Do you live with 4 or more people?' is highly positive correlated to 'Do you attend or work in a daycare?' and 'Have you been in contact with a person with similar symptoms in the past 2 weeks?'
4. 'Do you have pain somewhere, related to your reason for consulting?' is highly positively correlated to 'Characterize your pain:', 'Do you feel pain somewhere?', 'Does the pain radiate to another location?'
5. 'Characterize your pain:' is highly positive correlated to 'Do you have pain somewhere, related to your reason for consulting?', 'Does the pain radiate to another location?'
6. 'Do you feel pain somewhere?' is highly positive correlated to 'Do you feel pain somewhere?'
7. 'Does the pain radiate to another location?' is highly postive correlated to ' Do you have pain somewhere, related to your reason for consulting?', 'Characterize your pain:'
8. 'Do you have a cough that produces colored or more abundant sputum than usual?' is highly positive correlated to 'Do you have a chronic obstructive pulmonary disease (COPD)?'
9. 'Do you smoke cigarettes?' is not highly correlated to any other feature
10. 'Do you have a fever (either felt or measured with a thermometer)?' is not highly correlated to any other feature
11. 'Do you have a sore throat?' is not highly correlated to any other feature
12. 'Do you have a cough?' is not highly correlated to any other feature
13. 'Have you traveled out of the country in the last 4 weeks?' is not highly correlated to any other feature

14. 'Are you exposed to secondhand cigarette smoke on a daily basis?' is not highly  correlated to any other feature

15. 'Do you have swollen or painful lymph nodes?' is highly positive correlated to 'Have you ever had a sexually transmitted infection?', 'Have you had unprotected sex with more than one partner in the last 6 months?', ' Have you had sexual intercourse with an HIV-positive partner in the past 12 months?'

16. 'Have you ever had a sexually transmitted infection?' is highly positive correlated to 'Do you have swollen or painful lymph nodes?',' Have you had unprotected sex with more than one partner in the last 6 months?' , ' Have you had sexual intercourse with an HIV-positive partner in the past 12 months?' , ' Are you currently using intravenous drugs?'

17. 'Have you had diarrhea or an increase in stool frequency?' is highly correlated to ' Is the lesion (or are the lesions) larger than 1cm?'

18. 'Have you had unprotected sex with more than one partner in the last 6 months?' is highly positive correlated to 'Do you have swollen or painful lymph nodes?', 'Have you ever had a sexually transmitted infection?', 'Have you had sexual intercourse with an HIV-positive partner in the past 12 months?', 'Are you currently using intravenous drugs?'

19. 'Do you have any lesions, redness or problems on your skin that you believe are related to the condition you are consulting for?' is highly positive correlated(>0.7) to 'What color is the rash?', 'Do your lesions peel off?', 'Where is the affected region located?', 'Is the lesion (or are the lesions) larger than 1cm?'. All each related to each other

20. 'Do your lesions peel off?' highly correlated to 'Do you have any lesions, redness or problems on your skin that you believe are related to the condition you are consulting for?', 'What color is the rash?', 'Where is the affected region located?', 'Is the lesion (or are the lesions) larger than 1cm?', 'Have you had chills or shivers?', 'Do you have Parkinson’s disease?'

21. 'Where is the affected region located?' highly correlated to 'Do you have any lesions, redness or problems on your skin that you believe are related to the condition you are consulting for?', 'What color is the rash?', 'Do your lesions peel off?', 'Is the lesion (or are the lesions) larger than 1cm?'

22. 'Is the lesion (or are the lesions) larger than 1cm?' highly correlated to 'Have you had diarrhea or an increase in stool frequency?','Do you have any lesions, redness or problems on your skin that you believe are related to the condition you are consulting for?', 'What color is the rash?', 'Do your lesions peel off?', 'Where is the affected region located?'

23. 'Are you feeling nauseous or do you feel like vomiting?' not highly correlated to anything

24. 'Have you had an involuntary weight loss over the last 3 months?' not highly correlated to anything

25. 'Have you had sexual intercourse with an HIV-positive partner in the past 12 months?' is highly correlated to 'Do you have swollen or painful lymph nodes?', 'Have you ever had a sexually transmitted infection?', 'Have you had unprotected sex with more than one partner in the last 6 months?', 'Are you currently using intravenous drugs?'

26. 'Do you drink alcohol excessively or do you have an addiction to alcohol?' not highly correlated to anything

27. 'Have you had chills or shivers?' highly correlated to 'Do your lesions peel off?'

28. 'Do you have heart failure?' is highly correlated to 'Where is the swelling located?'

29. 'Have you ever had pneumonia?' is highly correlated to 'Do you have polyps in your nose?'

30. 'Do you have a chronic obstructive pulmonary disease (COPD)?' is highly correlated to 'Do you have a cough that produces colored or more abundant sputum than usual?'

31. 'Do you have asthma or have you ever had to use a bronchodilator in the past?' not highly correlated to anything

32. 'Do you have diffuse (widespread) muscle pain?' not highly correlated to anything

33. 'Have you noticed any new fatigue, generalized and vague discomfort, diffuse (widespread) muscle aches or a change in your general well-being related to your consultation today?' not highly correlated to anything

34. 'Do you have nasal congestion or a clear runny nose?' not highly correlated to anything

35. 'Do you attend or work in a daycare?' highly correlated to 'Do you live with 4 or more people?', 'Have you been in contact with a person with similar symptoms in the past 2 weeks?'

36. 'Have you lost your sense of smell?' is highly correlated to 'Do you have polyps in your nose?', 'Do you have a deviated nasal septum?', 'Have you ever been diagnosed with gastroesophageal reflux?', 'Do you have greenish or yellowish nasal discharge?'

37.


Based on the above analysis:
1. Drop - 'Do you have pain somewhere, related to your reason for consulting?' since 'Characterize your pain:', 'Do you feel pain somewhere?' and 'Does the pain radiate to another location?' can determine it
2. Drop  - 'Do you have any lesions, redness or problems on your skin that you believe are related to the condition you are consulting for?' since can be inferred 'What color is the rash?', 'Do your lesions peel off?', 'Where is the affected region located?', 'Is the lesion (or are the lesions) larger than 1cm?'.
3. Drop





In [None]:
## get pair wise correlation of features to the target
corr_anaylsis = X_train.corrwith(y_train).sort_values()

## Display features and colleration
for index, value in corr_anaylsis.items():
  print(f'{index}: {value}')

Are you more likely to develop common allergies than the general population?: -0.28678732683703007
Are you taking any new oral anticoagulants ((NOACs)?: -0.22225973954720876
Have you noticed a wheezing sound when you exhale?: -0.21976651227655272
Have you had a cold in the last 2 weeks?: -0.21943877355592986
Have you ever had a diagnosis of anemia?: -0.21912708990858296
Do you have severe Chronic Obstructive Pulmonary Disease (COPD)?: -0.21551084863111197
Do you have chronic kidney failure?: -0.21533642610971634
Do you have a poor diet?: -0.215027406773647
Have you started or taken any antipsychotic medication within the last 7 days?: -0.2141174512461069
Have you been treated in hospital recently for nausea, agitation, intoxication or aggressive behavior and received medication via an intravenous or intramuscular route?: -0.21380861573272705
Have you ever been diagnosed with gastroesophageal reflux?: -0.21350509839516996
Do you have any family members who have been diagnosed with anemi

In [None]:
# Export features correlation to target
corr_anaylsis.to_csv('/content/drive/MyDrive/DS5500/correlation_features_to_pathology.csv')

In [None]:
# Perform Cross Validation  - Took 37 Minutes

from sklearn.model_selection import cross_validate

# Initialize the model
clf = XGBClassifier(random_state=0, class_weight='balanced', aplha=1.0) # include L1 regulations to prevent overfiting

# Perform cross validation
cv_results = cross_validate(clf, X_train, y_train, cv=5, scoring=['accuracy', 'precision_macro', 'recall_macro', 'f1_macro', 'roc_auc_ovr', 'roc_auc_ovo'], return_estimator=True)

# Print the mean scores
print(f'Mean Accuracy: {cv_results["test_accuracy"].mean() * 100:.2f}%')
print(f'Mean ROC_AUC OVR: {cv_results["test_roc_auc_ovr"].mean() * 100:.2f}%')
print(f'Mean ROC_AUC OVO: {cv_results["test_roc_auc_ovo"].mean() * 100:.2f}%')
print(f'Mean Precision: {cv_results["test_precision_macro"].mean() * 100:.2f}%')
print(f'Mean Recall: {cv_results["test_recall_macro"].mean() * 100:.2f}%')
print(f'Mean F1 Score: {cv_results["test_f1_macro"].mean() * 100:.2f}%')

Mean Accuracy: 99.49%
Mean ROC_AUC OVR: 99.99%
Mean ROC_AUC OVO: 99.99%
Mean Precision: 99.54%
Mean Recall: 99.30%
Mean F1 Score: 99.36%


In [None]:
# Generate feature importances

for idx, model in enumerate(cv_results['estimator']):
    # Extract feature importances
    importances = model.feature_importances_

    # Get the feature names
    print(f'Model {idx}')
    feature_names = X_train.columns

    # Create a DataFrame for the importances
    feature_importance = pd.DataFrame({
        'Feature': feature_names,
        'Importance': importances
    })

# Sort the DataFrame by the importances
feature_importance = feature_importance.sort_values(by='Importance', ascending=False)

# Display the top features
print(feature_importance.head(50))



Model 0
Model 1
Model 2
Model 3
Model 4
                                               Feature  Importance
166                      Do you have an active cancer?    0.162417
188  Are you currently being treated or have you re...    0.123145
124         Do you have painful mouth ulcers or sores?    0.067082
193  Are you more irritable or has your mood been v...    0.056563
167  Have you been unable to move or get up for mor...    0.048299
134  Have you been unintentionally losing weight or...    0.045875
75            Do you have a known severe food allergy?    0.032464
28                     Have you had chills or shivers?    0.030045
123  Are you consulting because you have high blood...    0.020431
26   Have you had sexual intercourse with an HIV-po...    0.020166
76   Have you been in contact with or ate something...    0.018535
106                 Did your cheeks suddenly turn red?    0.018010
16   Have you ever had a sexually transmitted infec...    0.015092
186  Do you have famil

In [37]:
# save feature_importance to csv file
feature_importance.to_csv('/content/drive/MyDrive/DS5500/feature_importance_cv.csv')

In [39]:
top_100_important_features = feature_importance['Feature'].to_list()[:100]
top_100_important_features

['Do you have an active cancer?',
 'Are you currently being treated or have you recently been treated with an oral antibiotic for an ear infection?',
 'Do you have painful mouth ulcers or sores?',
 'Are you more irritable or has your mood been very unstable recently?',
 'Have you been unable to move or get up for more than 3 consecutive days within the last 4 weeks?',
 'Have you been unintentionally losing weight or have you lost your appetite?',
 'Do you have a known severe food allergy?',
 'Have you had chills or shivers?',
 'Are you consulting because you have high blood pressure?',
 'Have you had sexual intercourse with an HIV-positive partner in the past 12 months?',
 'Have you been in contact with or ate something that you have an allergy to?',
 'Did your cheeks suddenly turn red?',
 'Have you ever had a sexually transmitted infection?',
 'Do you have family members who have had lung cancer?',
 'Are you taking any new oral anticoagulants ((NOACs)?',
 'Do you have a known kidney p

In [53]:
correlated_features_to_remove = set(top_100_important_features) - correlated_features
len(correlated_features_to_remove)

27

In [54]:
selected_features = set(top_100_important_features) - correlated_features_to_remove
len(selected_features)

73

In [48]:
top_100_important_features

['Do you have an active cancer?',
 'Are you currently being treated or have you recently been treated with an oral antibiotic for an ear infection?',
 'Do you have painful mouth ulcers or sores?',
 'Are you more irritable or has your mood been very unstable recently?',
 'Have you been unable to move or get up for more than 3 consecutive days within the last 4 weeks?',
 'Have you been unintentionally losing weight or have you lost your appetite?',
 'Do you have a known severe food allergy?',
 'Have you had chills or shivers?',
 'Are you consulting because you have high blood pressure?',
 'Have you had sexual intercourse with an HIV-positive partner in the past 12 months?',
 'Have you been in contact with or ate something that you have an allergy to?',
 'Did your cheeks suddenly turn red?',
 'Have you ever had a sexually transmitted infection?',
 'Do you have family members who have had lung cancer?',
 'Are you taking any new oral anticoagulants ((NOACs)?',
 'Do you have a known kidney p

In [55]:
X_train_selected_no_correlated = X_train[selected_features]
X_test_selected_no_correlated = X_test[selected_features]

In [56]:
clf = XGBClassifier(random_state=23, class_weight='balanced')
clf.fit(X_train_selected_no_correlated, y_train)
# Make predictions
y_pred = clf.predict(X_test_selected_no_correlated)
lb = LabelBinarizer()
y_test_2d = lb.fit_transform(y_test)
y_pred_2d = lb.fit_transform(y_pred)

# Compute scores
accuracy = accuracy_score(y_test, y_pred)
auc_score_ovo = roc_auc_score(y_test_2d, y_pred_2d, multi_class='ovo')
auc_score_ovr = roc_auc_score(y_test_2d, y_pred_2d, multi_class='ovr')
report = classification_report(y_test, y_pred)

# Print scores
print(f'The accuracy score is {accuracy * 100:.2f}%')
print(f'One-vs-One AUC score: {auc_score_ovo * 100:.2f}%')
print(f'One-vs-Rest AUC score: {auc_score_ovr * 100:.2f}%')
print("Classification Report")
print(report)

The accuracy score is 70.41%
One-vs-One AUC score: 51.43%
One-vs-Rest AUC score: 51.43%
Classification Report
              precision    recall  f1-score   support

           0       1.00      0.98      0.99      2153
           1       1.00      0.99      0.99      3302
           2       0.00      0.00      0.00      3214
           3       0.00      0.00      0.00      3509
           4       1.00      0.98      0.99      2598
           5       0.00      0.00      0.00      1829
           6       1.00      0.99      1.00      2411
           7       1.00      1.00      1.00      3798
           8       1.00      0.99      0.99      6770
           9       1.00      0.97      0.99      2831
          10       0.00      0.00      0.00      2083
          11       1.00      0.98      0.99      2454
          12       0.00      0.00      0.00        36
          13       0.11      1.00      0.20      3577
          14       1.00      0.99      0.99      2222
          15       0.96  

# Feature Importance Only

In [57]:
def get_important_features_list_threshold(threshold):
  important_features = []
  for i in range(len(importances)):
    if importances[i] > threshold:
      important_features.append(feature_names[i])
  return important_features

In [36]:
total = 0
for i in range(len(feature_importance)):
    total += importances[i]
print(total)

0.9999999964536497


In [61]:
importances = feature_importance['Importance'].to_list()
features_names = feature_importance['Feature'].to_list()

In [64]:
def get_important_features_list_top(importances, feature_names, threshold):
    importances = np.array(importances)
    # Normalize importances
    importances_normalized = importances / importances.sum()

    # Combine features and their importances
    feature_importances = [(feature, importance) for feature, importance in zip(feature_names, importances_normalized)]

    # Sort the features by their importance
    feature_importances.sort(key=lambda x: x[1], reverse=True)

    # Accumulate the top features until the sum of importances reaches given threshold
    important_features = []
    cumulative_importance = 0.0
    for feature, importance in feature_importances:
        important_features.append(feature)
        cumulative_importance += importance
        if cumulative_importance >= threshold:
            break

    return important_features


## Top Features that Contribute 80%

In [65]:
top_80_percent_important_features = get_important_features_list_top(importances, feature_names, 0.8)

In [66]:
len(top_80_percent_important_features)

27

In [67]:
top_80_X_train = X_train[top_80_percent_important_features]
top_80_X_test = X_test[top_80_percent_important_features]

In [68]:
clf = XGBClassifier(random_state=23, class_weight='balanced')
clf.fit(top_80_X_train, y_train)
# Make predictions
y_pred = clf.predict(top_80_X_test)
lb = LabelBinarizer()
y_test_2d = lb.fit_transform(y_test)
y_pred_2d = lb.fit_transform(y_pred)

# Compute scores
accuracy = accuracy_score(y_test, y_pred)
auc_score_ovo = roc_auc_score(y_test_2d, y_pred_2d, multi_class='ovo')
auc_score_ovr = roc_auc_score(y_test_2d, y_pred_2d, multi_class='ovr')
report = classification_report(y_test, y_pred)

# Print scores
print(f'The accuracy score is {accuracy * 100:.2f}%')
print(f'One-vs-One AUC score: {auc_score_ovo * 100:.2f}%')
print(f'One-vs-Rest AUC score: {auc_score_ovr * 100:.2f}%')
print("Classification Report")
print(report)

The accuracy score is 87.04%
One-vs-One AUC score: 90.83%
One-vs-Rest AUC score: 90.83%
Classification Report
              precision    recall  f1-score   support

           0       1.00      0.76      0.87      2153
           1       0.26      0.93      0.40      3302
           2       0.95      0.42      0.59      3214
           3       1.00      1.00      1.00      3509
           4       0.98      0.99      0.98      2598
           5       0.91      0.41      0.56      1829
           6       0.51      0.63      0.57      2411
           7       1.00      1.00      1.00      3798
           8       0.95      0.99      0.97      6770
           9       0.20      0.01      0.02      2831
          10       1.00      1.00      1.00      2083
          11       0.88      0.85      0.86      2454
          12       1.00      0.67      0.80        36
          13       1.00      0.99      1.00      3577
          14       0.91      0.56      0.69      2222
          15       1.00  

In [69]:
top_90_percent_important_features = get_important_features_list_top(importances, feature_names, 0.9)
len(top_90_percent_important_features)

44

In [70]:
top_90_X_train = X_train[top_90_percent_important_features]
top_90_X_test = X_test[top_90_percent_important_features]

In [71]:
# using top 90%
clf = XGBClassifier(random_state=23, class_weight='balanced')
clf.fit(top_90_X_train, y_train)
# Make predictions
y_pred = clf.predict(top_90_X_test)
lb = LabelBinarizer()
y_test_2d = lb.fit_transform(y_test)
y_pred_2d = lb.fit_transform(y_pred)

# Compute scores
accuracy = accuracy_score(y_test, y_pred)
auc_score_ovo = roc_auc_score(y_test_2d, y_pred_2d, multi_class='ovo')
auc_score_ovr = roc_auc_score(y_test_2d, y_pred_2d, multi_class='ovr')
report = classification_report(y_test, y_pred)

# Print scores
print(f'The accuracy score is {accuracy * 100:.2f}%')
print(f'One-vs-One AUC score: {auc_score_ovo * 100:.2f}%')
print(f'One-vs-Rest AUC score: {auc_score_ovr * 100:.2f}%')
print("Classification Report")
print(report)

The accuracy score is 91.00%
One-vs-One AUC score: 94.21%
One-vs-Rest AUC score: 94.21%
Classification Report
              precision    recall  f1-score   support

           0       1.00      0.97      0.98      2153
           1       0.30      0.91      0.45      3302
           2       0.98      0.83      0.90      3214
           3       1.00      1.00      1.00      3509
           4       1.00      1.00      1.00      2598
           5       0.82      0.51      0.63      1829
           6       0.94      0.98      0.96      2411
           7       1.00      1.00      1.00      3798
           8       0.97      0.99      0.98      6770
           9       0.23      0.02      0.03      2831
          10       1.00      1.00      1.00      2083
          11       0.98      0.98      0.98      2454
          12       1.00      0.78      0.88        36
          13       1.00      1.00      1.00      3577
          14       0.98      0.91      0.94      2222
          15       1.00  

In [None]:
top_95_percent_important_features = get_important_features_list_top(importances, feature_names, 0.95)
len(top_95_percent_important_features)

58

In [None]:
top_95_X_train = X_train[top_95_percent_important_features]
top_95_X_test = X_test[top_95_percent_important_features]

In [None]:
# using top 95%
clf = XGBClassifier(random_state=230, class_weight='balanced')
clf.fit(top_95_X_train, y_train)
# Make predictions
y_pred = clf.predict(top_95_X_test)
lb = LabelBinarizer()
y_test_2d = lb.fit_transform(y_test)
y_pred_2d = lb.fit_transform(y_pred)

# Compute scores
accuracy = accuracy_score(y_test, y_pred)
auc_score_ovo = roc_auc_score(y_test_2d, y_pred_2d, multi_class='ovo')
auc_score_ovr = roc_auc_score(y_test_2d, y_pred_2d, multi_class='ovr')
report = classification_report(y_test, y_pred)

# Print scores
print(f'The accuracy score is {accuracy * 100:.2f}%')
print(f'One-vs-One AUC score: {auc_score_ovo * 100:.2f}%')
print(f'One-vs-Rest AUC score: {auc_score_ovr * 100:.2f}%')
print("Classification Report")
print(report)

The accuracy score is 66.74%
One-vs-One AUC score: 51.36%
One-vs-Rest AUC score: 51.36%
Classification Report
              precision    recall  f1-score   support

           0       1.00      0.89      0.94      2153
           1       1.00      0.91      0.95      3302
           2       0.00      0.00      0.00      3214
           3       1.00      0.80      0.89      3509
           4       0.00      0.00      0.00      2598
           5       0.00      0.00      0.00      1829
           6       1.00      0.99      1.00      2411
           7       1.00      0.96      0.98      3798
           8       1.00      0.96      0.98      6770
           9       1.00      0.97      0.99      2831
          10       1.00      0.81      0.89      2083
          11       1.00      0.94      0.97      2454
          12       0.00      0.00      0.00        36
          13       0.00      0.00      0.00      3577
          14       1.00      0.94      0.97      2222
          15       0.93  

In [None]:
top_98_percent_important_features = get_important_features_list_top(importances, feature_names, 0.98)
len(top_98_percent_important_features)

77

In [None]:
top_98_X_train = X_train[top_98_percent_important_features]
top_98_X_test = X_test[top_98_percent_important_features]

In [None]:
# using top 98%
clf = XGBClassifier(random_state=231, class_weight='balanced')
clf.fit(top_98_X_train, y_train)
# Make predictions
y_pred = clf.predict(top_98_X_test)
lb = LabelBinarizer()
y_test_2d = lb.fit_transform(y_test)
y_pred_2d = lb.fit_transform(y_pred)

# Compute scores
accuracy = accuracy_score(y_test, y_pred)
auc_score_ovo = roc_auc_score(y_test_2d, y_pred_2d, multi_class='ovo')
auc_score_ovr = roc_auc_score(y_test_2d, y_pred_2d, multi_class='ovr')
report = classification_report(y_test, y_pred)

# Print scores
print(f'The accuracy score is {accuracy * 100:.2f}%')
print(f'One-vs-One AUC score: {auc_score_ovo * 100:.2f}%')
print(f'One-vs-Rest AUC score: {auc_score_ovr * 100:.2f}%')
print("Classification Report")
print(report)

The accuracy score is 84.36%
One-vs-One AUC score: 54.11%
One-vs-Rest AUC score: 54.11%
Classification Report
              precision    recall  f1-score   support

           0       1.00      0.96      0.98      2153
           1       1.00      0.97      0.98      3302
           2       0.93      0.86      0.89      3214
           3       1.00      0.80      0.89      3509
           4       1.00      0.83      0.91      2598
           5       0.00      0.00      0.00      1829
           6       1.00      0.99      1.00      2411
           7       1.00      0.96      0.98      3798
           8       1.00      0.99      0.99      6770
           9       1.00      0.97      0.99      2831
          10       1.00      0.81      0.89      2083
          11       1.00      0.94      0.97      2454
          12       0.00      0.00      0.00        36
          13       0.99      0.58      0.73      3577
          14       1.00      0.94      0.97      2222
          15       1.00  

In [None]:
top_99_percent_important_features = get_important_features_list_top(importances, feature_names, 0.99)
len(top_99_percent_important_features)

90

In [None]:
top_99_X_train = X_train[top_99_percent_important_features]
top_99_X_test = X_test[top_99_percent_important_features]

In [None]:
# using top 99%
clf = XGBClassifier(random_state=232, class_weight='balanced')
clf.fit(top_99_X_train, y_train)
# Make predictions
y_pred = clf.predict(top_99_X_test)
lb = LabelBinarizer()
y_test_2d = lb.fit_transform(y_test)
y_pred_2d = lb.fit_transform(y_pred)

# Compute scores
accuracy = accuracy_score(y_test, y_pred)
auc_score_ovo = roc_auc_score(y_test_2d, y_pred_2d, multi_class='ovo')
auc_score_ovr = roc_auc_score(y_test_2d, y_pred_2d, multi_class='ovr')
report = classification_report(y_test, y_pred)

# Print scores
print(f'The accuracy score is {accuracy * 100:.2f}%')
print(f'One-vs-One AUC score: {auc_score_ovo * 100:.2f}%')
print(f'One-vs-Rest AUC score: {auc_score_ovr * 100:.2f}%')
print("Classification Report")
print(report)

The accuracy score is 86.97%
One-vs-One AUC score: 54.07%
One-vs-Rest AUC score: 54.07%
Classification Report
              precision    recall  f1-score   support

           0       1.00      0.96      0.98      2153
           1       1.00      0.97      0.98      3302
           2       0.93      0.86      0.89      3214
           3       1.00      0.80      0.89      3509
           4       1.00      0.83      0.91      2598
           5       0.00      0.00      0.00      1829
           6       1.00      0.99      1.00      2411
           7       0.98      1.00      0.99      3798
           8       1.00      1.00      1.00      6770
           9       1.00      0.97      0.99      2831
          10       1.00      0.81      0.89      2083
          11       1.00      0.94      0.97      2454
          12       0.00      0.00      0.00        36
          13       1.00      0.58      0.73      3577
          14       1.00      0.94      0.97      2222
          15       1.00  

In [None]:
# Perform Cross Validation  - Took 20 Minutes

from sklearn.model_selection import cross_validate

# Initialize the model
clf = XGBClassifier(random_state=0, class_weight='balanced')

# Perform cross validation
cv_results = cross_validate(clf, top_99_X_train, y_train, cv=5, scoring=['accuracy', 'precision_macro', 'recall_macro', 'f1_macro', 'roc_auc_ovr', 'roc_auc_ovo'], return_estimator=True)

# Print the mean scores
print(f'Mean Accuracy: {cv_results["test_accuracy"].mean() * 100:.2f}%')
print(f'Mean ROC_AUC OVR: {cv_results["test_roc_auc_ovr"].mean() * 100:.2f}%')
print(f'Mean ROC_AUC OVO: {cv_results["test_roc_auc_ovo"].mean() * 100:.2f}%')
print(f'Mean Precision: {cv_results["test_precision_macro"].mean() * 100:.2f}%')
print(f'Mean Recall: {cv_results["test_recall_macro"].mean() * 100:.2f}%')
print(f'Mean F1 Score: {cv_results["test_f1_macro"].mean() * 100:.2f}%')

Mean Accuracy: 86.71%
Mean ROC_AUC OVR: 98.88%
Mean ROC_AUC OVO: 98.57%
Mean Precision: 88.74%
Mean Recall: 83.19%
Mean F1 Score: 84.80%


In [None]:
top_995_percent_important_features = get_important_features_list_top(importances, feature_names, 0.995)
len(top_995_percent_important_features)

102

In [None]:
pprint(top_995_percent_important_features)

['Have you had sexual intercourse with an HIV-positive partner in the past 12 '
 'months?',
 'Do you have an active cancer?',
 'Are you more irritable or has your mood been very unstable recently?',
 'Do you have painful mouth ulcers or sores?',
 'Are you currently being treated or have you recently been treated with an '
 'oral antibiotic for an ear infection?',
 'Have you been unable to move or get up for more than 3 consecutive days '
 'within the last 4 weeks?',
 'Do you have a known kidney problem resulting in an inability to retain '
 'proteins?',
 'Have you been in contact with or ate something that you have an allergy to?',
 'Have you had chills or shivers?',
 'Do you have a known severe food allergy?',
 'Are you consulting because you have high blood pressure?',
 'Did your cheeks suddenly turn red?',
 'Have you been unintentionally losing weight or have you lost your appetite?',
 'Are you taking any new oral anticoagulants ((NOACs)?',
 'Have you had unprotected sex with more t

In [None]:
top_995_X_train = X_train[top_995_percent_important_features]
top_995_X_test = X_test[top_995_percent_important_features]

In [None]:
# using top 99.5%
clf = XGBClassifier(random_state=232, class_weight='balanced')
clf.fit(top_995_X_train, y_train)
# Make predictions
y_pred = clf.predict(top_995_X_test)
lb = LabelBinarizer()
y_test_2d = lb.fit_transform(y_test)
y_pred_2d = lb.fit_transform(y_pred)

# Compute scores
accuracy = accuracy_score(y_test, y_pred)
auc_score_ovo = roc_auc_score(y_test_2d, y_pred_2d, multi_class='ovo')
auc_score_ovr = roc_auc_score(y_test_2d, y_pred_2d, multi_class='ovr')
report = classification_report(y_test, y_pred)

# Print scores
print(f'The accuracy score is {accuracy * 100:.2f}%')
print(f'One-vs-One AUC score: {auc_score_ovo * 100:.2f}%')
print(f'One-vs-Rest AUC score: {auc_score_ovr * 100:.2f}%')
print("Classification Report")
print(report)

The accuracy score is 93.25%
One-vs-One AUC score: 96.33%
One-vs-Rest AUC score: 96.33%
Classification Report
              precision    recall  f1-score   support

           0       1.00      0.98      0.99      2153
           1       1.00      0.97      0.98      3302
           2       0.97      0.86      0.91      3214
           3       0.88      0.90      0.89      3509
           4       1.00      1.00      1.00      2598
           5       1.00      0.45      0.63      1829
           6       1.00      0.99      1.00      2411
           7       1.00      1.00      1.00      3798
           8       1.00      1.00      1.00      6770
           9       1.00      0.97      0.99      2831
          10       1.00      0.81      0.89      2083
          11       1.00      1.00      1.00      2454
          12       1.00      0.61      0.76        36
          13       1.00      0.66      0.80      3577
          14       1.00      0.99      0.99      2222
          15       0.96  

In [None]:
# Perform Cross Validation  - Took 20 minutes

from sklearn.model_selection import cross_validate

# Initialize the model
clf = XGBClassifier(random_state=0, class_weight='balanced')

# Perform cross validation
cv_results = cross_validate(clf, top_995_X_train, y_train, cv=5, scoring=['accuracy', 'precision_macro', 'recall_macro', 'f1_macro', 'roc_auc_ovr', 'roc_auc_ovo'], return_estimator=True)

# Print the mean scores
print(f'Mean Accuracy: {cv_results["test_accuracy"].mean() * 100:.2f}%')
print(f'Mean ROC_AUC OVR: {cv_results["test_roc_auc_ovr"].mean() * 100:.2f}%')
print(f'Mean ROC_AUC OVO: {cv_results["test_roc_auc_ovo"].mean() * 100:.2f}%')
print(f'Mean Precision: {cv_results["test_precision_macro"].mean() * 100:.2f}%')
print(f'Mean Recall: {cv_results["test_recall_macro"].mean() * 100:.2f}%')
print(f'Mean F1 Score: {cv_results["test_f1_macro"].mean() * 100:.2f}%')

Mean Accuracy: 93.42%
Mean ROC_AUC OVR: 99.81%
Mean ROC_AUC OVO: 99.80%
Mean Precision: 96.17%
Mean Recall: 92.74%
Mean F1 Score: 93.37%


In [None]:
# Generate feature importances

for idx, model in enumerate(cv_results['estimator']):
    # Extract feature importances
    importances = model.feature_importances_

    # Get the feature names
    print(f'Model {idx}')
    feature_names = X_train.columns

    # Create a DataFrame for the importances
    feature_importance = pd.DataFrame({
        'Feature': feature_names,
        'Importance': importances
    })

# Sort the DataFrame by the importances
feature_importance = feature_importance.sort_values(by='Importance', ascending=False)

# Display the top features
print(feature_importance.head(50))

In [None]:
# Generate feature importances

for idx, model in enumerate(cv_results['estimator']):
    # Extract feature importances
    importances = model.feature_importances_

    # Get the feature names
    print(f'Model {idx}')
    feature_names = X_train.columns

    # Create a DataFrame for the importances
    feature_importance = pd.DataFrame({
        'Feature': feature_names,
        'Importance': importances
    })

# Sort the DataFrame by the importances
feature_importance = feature_importance.sort_values(by='Importance', ascending=False)

# Display the top features
print(feature_importance.head(50))



Model 0
Model 1
Model 2
Model 3
Model 4
                                               Feature  Importance
166                      Do you have an active cancer?    0.162417
188  Are you currently being treated or have you re...    0.123145
124         Do you have painful mouth ulcers or sores?    0.067082
193  Are you more irritable or has your mood been v...    0.056563
167  Have you been unable to move or get up for mor...    0.048299
134  Have you been unintentionally losing weight or...    0.045875
75            Do you have a known severe food allergy?    0.032464
28                     Have you had chills or shivers?    0.030045
123  Are you consulting because you have high blood...    0.020431
26   Have you had sexual intercourse with an HIV-po...    0.020166
76   Have you been in contact with or ate something...    0.018535
106                 Did your cheeks suddenly turn red?    0.018010
16   Have you ever had a sexually transmitted infec...    0.015092
186  Do you have famil

In [None]:
feature_importance.describe

<bound method NDFrame.describe of                                                Feature  Importance
166                      Do you have an active cancer?    0.162417
188  Are you currently being treated or have you re...    0.123145
124         Do you have painful mouth ulcers or sores?    0.067082
193  Are you more irritable or has your mood been v...    0.056563
167  Have you been unable to move or get up for mor...    0.048299
..                                                 ...         ...
147  Were you diagnosed with endocrine disease or a...    0.000000
206  Have you breastfed one of your children for mo...    0.000000
118                  Do you have chronic pancreatitis?    0.000000
120            Do you have pale stools and dark urine?    0.000000
172  Have you been in contact with someone who has ...    0.000000

[216 rows x 2 columns]>

# Recursive Feacture Elimintation

In [None]:
from sklearn.feature_selection import RFE

def select_most_important_features(num_features, X_train, y_train):
    estimator = XGBClassifier()
    selector = RFE(estimator, n_features_to_select=num_features, step=1)
    selector = selector.fit(X_train, y_train)
    selected_features = X_train.columns[selector.support_]
    feature_ranking = selector.ranking_
    return selected_features, feature_ranking, selector

In [None]:
top_50_features, feature_ranking, selector = select_most_important_features(50, top_995_X_train, y_train) # took 3hrs 27 minutes
print(top_50_features)

Index(['Have you had sexual intercourse with an HIV-positive partner in the past 12 months?',
       'Do you have an active cancer?',
       'Are you more irritable or has your mood been very unstable recently?',
       'Do you have painful mouth ulcers or sores?',
       'Are you currently being treated or have you recently been treated with an oral antibiotic for an ear infection?',
       'Have you been unable to move or get up for more than 3 consecutive days within the last 4 weeks?',
       'Do you have a known kidney problem resulting in an inability to retain proteins?',
       'Do you have a known severe food allergy?',
       'Are you consulting because you have high blood pressure?',
       'Did your cheeks suddenly turn red?',
       'Have you been unintentionally losing weight or have you lost your appetite?',
       'Are you taking any new oral anticoagulants ((NOACs)?',
       'Have you had unprotected sex with more than one partner in the last 6 months?',
       'Have y

In [None]:
X_train_selected = selector.transform(top_995_X_train)
X_test_selected = selector.transform(top_995_X_test)

In [None]:
# Perform Cross Validation  - Took 14 minutes

from sklearn.model_selection import cross_validate

# Initialize the model
clf = XGBClassifier(random_state=0, class_weight='balanced')

# Perform cross validation
cv_results = cross_validate(clf, X_train_selected, y_train, cv=5, scoring=['accuracy', 'precision_macro', 'recall_macro', 'f1_macro', 'roc_auc_ovr', 'roc_auc_ovo'], return_estimator=True)

# Print the mean scores
print(f'Mean Accuracy: {cv_results["test_accuracy"].mean() * 100:.2f}%')
print(f'Mean ROC_AUC OVR: {cv_results["test_roc_auc_ovr"].mean() * 100:.2f}%')
print(f'Mean ROC_AUC OVO: {cv_results["test_roc_auc_ovo"].mean() * 100:.2f}%')
print(f'Mean Precision: {cv_results["test_precision_macro"].mean() * 100:.2f}%')
print(f'Mean Recall: {cv_results["test_recall_macro"].mean() * 100:.2f}%')
print(f'Mean F1 Score: {cv_results["test_f1_macro"].mean() * 100:.2f}%')

Mean Accuracy: 62.30%
Mean ROC_AUC OVR: 90.91%
Mean ROC_AUC OVO: 91.37%
Mean Precision: 69.06%
Mean Recall: 60.12%
Mean F1 Score: 63.17%


# Feature Selection for Top 10 Diseases by Frequency Distribution

In [146]:
# Top 10 diseases
list(train_df['PATHOLOGY'].value_counts().index[:10])

['URTI',
 'Viral pharyngitis',
 'Anemia',
 'HIV (initial infection)',
 'Localized edema',
 'Anaphylaxis',
 'Pulmonary embolism',
 'Influenza',
 'Bronchitis',
 'Allergic sinusitis']

In [144]:
# Get pathology distribution sorted
pathology_distribution = train_encoded['PATHOLOGY'].value_counts()
top_10_pathologies = pathology_distribution.index[:10]
list(top_10_pathologies)

[44, 46, 8, 23, 27, 7, 36, 13, 6, 1]

In [88]:
top_10_pathologies_train_df = train_encoded[train_encoded['PATHOLOGY'].isin(top_10_pathologies)]
len(top_10_pathologies_train_df)

366303

In [89]:
top_10_pathologies_test_df = test_encoded[test_encoded['PATHOLOGY'].isin(top_10_pathologies)]
len(top_10_pathologies_test_df)

48222

In [90]:
# Prepare training and test variables
X_columns = [col for col in top_10_pathologies_train_df.columns if col not in ['Unnamed: 0', 'PATHOLOGY']]
X_train = top_10_pathologies_train_df[X_columns]
y_train = top_10_pathologies_train_df['PATHOLOGY']
X_test = top_10_pathologies_test_df[X_columns]
y_test = top_10_pathologies_test_df['PATHOLOGY']
pprint(f'X_train shape: {X_train.shape}')
pprint(f'y_train shape: {y_train.shape}')
pprint(f'X_test shape: {X_test.shape}')
pprint(f'y_test shape: {y_test.shape}')

'X_train shape: (366303, 216)'
'y_train shape: (366303,)'
'X_test shape: (48222, 216)'
'y_test shape: (48222,)'


In [95]:
y_train.unique()

array([44, 23, 46,  8,  6,  7, 13, 27, 36,  1])

In [97]:
# XGBoost expects classess to start from 0, so create a new mapping to fit this.
unique_classes = np.unique(y_train)
class_mapping = {original: new for new, original in enumerate(unique_classes)}

y_train_mapped = y_train.map(class_mapping)
y_test_mapped = y_test.map(class_mapping)

In [98]:
y_train_mapped.unique()

array([8, 5, 9, 3, 1, 2, 4, 6, 7, 0])

In [106]:
important_features_by_pathologies = {}
roc_auc_scores = {}
accuracy_scores = {}


for pathology in list(top_10_pathologies):

    # Initialize and train the model
    model = XGBClassifier(class_weight='balanced', alpha=1.0)
    model.fit(X_train, y_train_mapped)

     # Predict and evaluate
    predictions = model.predict(X_test)
    roc_auc_ovr = roc_auc_score(y_test_mapped, model.predict_proba(X_test), multi_class='ovr')
    accuracy = accuracy_score(y_test_mapped, predictions)
    roc_auc_scores[pathology] = roc_auc_ovr.tolist()
    accuracy_scores[pathology] = accuracy

    # Get feature importances
    importances = model.feature_importances_
    feature_names = X_train.columns
    features_importance = zip(feature_names, importances)

    # Sort features by importance
    sorted_features = sorted(features_importance, key=lambda x: x[1], reverse=True)

    # Store the top features for this disease
    important_features_by_pathologies[pathology] = sorted_features


In [129]:
len(important_features_by_pathologies)

10

In [147]:
from collections import Counter

# Count how often each feature is important across diseases
feature_counts = Counter()

for pathology in important_features_by_pathologies:
    for feature, importance in important_features_by_pathologies[pathology]:
        if importance > 0.005:  # You can set a threshold for importance
            feature_counts[feature] += 1

# Sort features by their count
sorted_feature_counts = sorted(feature_counts.items(), key=lambda x: x[1], reverse=True)


In [148]:
len(sorted_feature_counts)

20

In [149]:
# Get list of features
selected_features = [feature for feature, count in sorted_feature_counts]
pprint(selected_features)

['Do you have swollen or painful lymph nodes?',
 'Have you had sexual intercourse with an HIV-positive partner in the past 12 '
 'months?',
 'Are you taking any new oral anticoagulants ((NOACs)?',
 'Have you had unprotected sex with more than one partner in the last 6 '
 'months?',
 'Is your nose or the back of your throat itchy?',
 'Are you immunosuppressed?',
 'Have you had surgery within the last month?',
 'Do you have a chronic obstructive pulmonary disease (COPD)?',
 'Do you regularly take stimulant drugs?',
 'Are you exposed to secondhand cigarette smoke on a daily basis?',
 'Do you have heart failure?',
 'What color is the rash?',
 'Have you ever had a diagnosis of anemia?',
 'Where is the swelling located?',
 'Have you ever had a sexually transmitted infection?',
 'Where is the affected region located?',
 'Do you have any lesions, redness or problems on your skin that you believe '
 'are related to the condition you are consulting for?',
 'Have you started or taken any antipsyc

In [150]:
X_train_selected = X_train[selected_features]
X_test_selected = X_test[selected_features]

In [151]:
X_train_selected

Unnamed: 0,Do you have swollen or painful lymph nodes?,Have you had sexual intercourse with an HIV-positive partner in the past 12 months?,Are you taking any new oral anticoagulants ((NOACs)?,Have you had unprotected sex with more than one partner in the last 6 months?,Is your nose or the back of your throat itchy?,Are you immunosuppressed?,Have you had surgery within the last month?,Do you have a chronic obstructive pulmonary disease (COPD)?,Do you regularly take stimulant drugs?,Are you exposed to secondhand cigarette smoke on a daily basis?,Do you have heart failure?,What color is the rash?,Have you ever had a diagnosis of anemia?,Where is the swelling located?,Have you ever had a sexually transmitted infection?,Where is the affected region located?,"Do you have any lesions, redness or problems on your skin that you believe are related to the condition you are consulting for?",Have you started or taken any antipsychotic medication within the last 7 days?,"Do you have pain somewhere, related to your reason for consulting?",Do you have severe itching in one or both eyes?
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0,0.0,0,0.0,0,0.0,0.0,1.0,0.0
1,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0.0,0,1.0,20,1.0,0.0,1.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0,0.0,0,0.0,0.0,1.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0,0.0,0,0.0,0.0,1.0,0.0
6,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0,0.0,0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
980259,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0.0,0,0.0,27,1.0,0.0,1.0,0.0
980260,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0.0,0,1.0,20,1.0,0.0,1.0,0.0
980261,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0.0,0,1.0,20,1.0,0.0,1.0,0.0
980262,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0.0,0,1.0,20,1.0,0.0,1.0,0.0


In [152]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
# Initialize the model
clf = XGBClassifier(random_state=0, class_weight='balanced')

# Perform cross-validation
cv_results = cross_validate(clf, X_train_selected, y_train_mapped, cv=5,
                            scoring=['accuracy', 'precision_macro', 'recall_macro', 'f1_macro', 'roc_auc_ovr', 'roc_auc_ovo'],
                            return_estimator=True)

# Prepare to store test scores
test_scores = {
    'accuracy': [],
    'precision_macro': [],
    'recall_macro': [],
    'f1_macro': [],
    'roc_auc_ovr': [],
    'roc_auc_ovo': []
}

# Predict on X_test with each fold's estimator
for estimator in cv_results['estimator']:
    y_pred = estimator.predict(X_test_selected)
    y_pred_proba = estimator.predict_proba(X_test_selected)

    test_scores['accuracy'].append(accuracy_score(y_test_mapped, y_pred))
    test_scores['precision_macro'].append(precision_score(y_test_mapped, y_pred, average='macro'))
    test_scores['recall_macro'].append(recall_score(y_test_mapped, y_pred, average='macro'))
    test_scores['f1_macro'].append(f1_score(y_test_mapped, y_pred, average='macro'))
    test_scores['roc_auc_ovr'].append(roc_auc_score(y_test_mapped, y_pred_proba, multi_class='ovr'))
    test_scores['roc_auc_ovo'].append(roc_auc_score(y_test_mapped, y_pred_proba, multi_class='ovo'))

# Calculate and print mean test scores
for metric, scores in test_scores.items():
    print(f'Mean {metric}: {np.mean(scores) * 100:.2f}%')

Mean accuracy: 90.90%
Mean precision_macro: 96.30%
Mean recall_macro: 93.35%
Mean f1_macro: 93.92%
Mean roc_auc_ovr: 99.38%
Mean roc_auc_ovo: 99.61%


In [153]:
clf = XGBClassifier(random_state=27, class_weight='balanced')
clf.fit(X_train_selected, y_train_mapped)
# Make predictions
y_pred = clf.predict(X_test_selected)
lb = LabelBinarizer()
y_test_2d = lb.fit_transform(y_test_mapped)
y_pred_2d = lb.fit_transform(y_pred)

# Compute scores
accuracy = accuracy_score(y_test_mapped, y_pred)
auc_score_ovo = roc_auc_score(y_test_2d, y_pred_2d, multi_class='ovo')
auc_score_ovr = roc_auc_score(y_test_2d, y_pred_2d, multi_class='ovr')
report = classification_report(y_test_mapped, y_pred)

# Print scores
print(f'The accuracy score is {accuracy * 100:.2f}%')
print(f'One-vs-One AUC score: {auc_score_ovo * 100:.2f}%')
print(f'One-vs-Rest AUC score: {auc_score_ovr * 100:.2f}%')
print("Classification Report")
print(report)

The accuracy score is 90.90%
One-vs-One AUC score: 96.13%
One-vs-Rest AUC score: 96.13%
Classification Report
              precision    recall  f1-score   support

           0       0.97      1.00      0.99      3302
           1       1.00      0.96      0.98      2411
           2       1.00      1.00      1.00      3798
           3       1.00      0.96      0.98      6770
           4       1.00      0.79      0.89      3577
           5       1.00      1.00      1.00      3904
           6       1.00      1.00      1.00      3734
           7       1.00      1.00      1.00      3679
           8       1.00      0.63      0.77      8713
           9       0.66      1.00      0.80      8334

    accuracy                           0.91     48222
   macro avg       0.96      0.93      0.94     48222
weighted avg       0.94      0.91      0.91     48222

