In [17]:
import pandas as pd
import numpy as np
import ast
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score
from collections import defaultdict
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, confusion_matrix, roc_auc_score, precision_recall_curve, auc
from scipy.stats import entropy
from sklearn.utils import shuffle
from tqdm import tqdm
from sklearn.model_selection import ParameterSampler


In [18]:
import sys
sys.path.append('/home/cmottez/CS231N_Lightweight_Bias_Mitigation_Chest_Xray')

from metrics import compute_metrics, compute_kl_divergence_sex, compute_kl_divergence_age, compute_kl_divergence_race, compute_metrics_subcath, bias_table, bias_table_auprc


### Read the CSV with demographics info + diseases info + image embeddings

In [19]:
train = pd.read_pickle('../data/embeddings_chexpert/CNNs/chexpert_on_chexpert_train_with_embeddings_extracted.pkl')
valid = pd.read_pickle('../data/embeddings_chexpert/CNNs/chexpert_on_chexpert_valid_with_embeddings_extracted.pkl')
test = pd.read_pickle('../data/embeddings_chexpert/CNNs/chexpert_on_chexpert_test_with_embeddings_extracted.pkl')

In [20]:
test.head()

Unnamed: 0,gender,race,age,insurance,Atelectasis,Cardiomegaly,Consolidation,Edema,Enlarged Cardiomediastinum,Fracture,Lung Lesion,Lung Opacity,No Finding,Pleural Effusion,Pleural Other,Pneumonia,Pneumothorax,Support Devices,embeddings
0,1,0,1,1,1,1,0,1,0,1,0,0,0,1,0,0,0,1,"[0.0029132624622434378, 0.1020001769065857, 0...."
1,0,1,0,2,0,0,0,0,0,0,0,0,0,1,0,0,0,0,"[0.0014348188415169716, 0.0543656125664711, 0...."
2,0,0,1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,"[0.001982336398214102, 0.040021587163209915, 0..."
3,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,"[0.001741771469824016, 0.0560498870909214, 0.1..."
4,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,"[9.678312198957428e-05, 0.12247737497091293, 0..."


In [21]:
col = ['gender', 'race', 'age', 'Atelectasis',
       'Cardiomegaly', 'Consolidation', 'Edema', 'Enlarged Cardiomediastinum',
       'Fracture', 'Lung Lesion', 'Lung Opacity', 'No Finding',
       'Pleural Effusion', 'Pleural Other', 'Pneumonia', 'Pneumothorax',
       'Support Devices', 'embeddings']
train = train[col]
valid = valid[col]
test = test[col]

In [22]:
# Remove images that could not be processed

initial_size = test.shape[0] 
test = test[test['embeddings'].apply(type) == list]
final_size = test.shape[0] 
print(f'Number of test removed rows = {initial_size - final_size}')

initial_size = train.shape[0] 
train = train[train['embeddings'].apply(type) == list]
final_size = train.shape[0] 
print(f'Number of train removed rows = {initial_size - final_size}')

initial_size = valid.shape[0]
valid = valid[valid['embeddings'].apply(type) == list]
final_size = valid.shape[0] 
print(f'Number of valid removed rows = {initial_size - final_size}')

Number of test removed rows = 51
Number of train removed rows = 67
Number of valid removed rows = 3


### Train test Valid

In [23]:
train_embeddings = pd.DataFrame(train['embeddings'].tolist())
test_embeddings = pd.DataFrame(test['embeddings'].tolist())
valid_embeddings = pd.DataFrame(valid['embeddings'].tolist())

# Diseases to predict
diseases = ['Cardiomegaly', 'Lung Opacity', 'Edema', 'Pleural Effusion']

# Labels for train and test
y_train = train[diseases]
y_test = test[diseases]
y_valid = valid[diseases]

X_train = train_embeddings
X_test = test_embeddings
X_valid = valid_embeddings


In [24]:
y_sex = test['gender']
y_race = test['race']
y_age = test['age']

y_sex_valid = valid['gender']
y_race_valid = valid['race']
y_age_valid = valid['age']

In [25]:
y_train.replace(-1, 0, inplace=True)
y_test.replace(-1, 0, inplace=True)
y_valid.replace(-1, 0, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y_train.replace(-1, 0, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y_test.replace(-1, 0, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y_valid.replace(-1, 0, inplace=True)


### PCA to reduce embeddings

In [26]:
# Step 1: Standardize the embeddings_list to have mean 0 and variance 1
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(np.stack(train_embeddings.values))
X_test_scaled = scaler.transform(np.stack(test_embeddings.values))
X_valid_scaled = scaler.transform(np.stack(valid_embeddings.values))

# Step 2: Set target variance threshold (e.g., 95%)
variance_threshold = 0.95

# Step 3: Fit PCA to determine the optimal number of components based on variance threshold
pca_full = PCA()
pca_full.fit(X_train_scaled)
cumulative_variance = np.cumsum(pca_full.explained_variance_ratio_)

# Step 4: Find the number of components that meets the variance threshold
optimal_components = np.argmax(cumulative_variance >= variance_threshold) + 1
print(f"Optimal number of components to retain {variance_threshold*100}% variance: {optimal_components}")

#95% variance means that the selected principal components (reduced dimensions) retain 95% of the total variability present in the original high-dimensional data.

Optimal number of components to retain 95.0% variance: 305


In [27]:
# Apply PCA
pca = PCA(n_components=optimal_components)
x_train_subset = pca.fit_transform(X_train_scaled)
x_test_subset = pca.transform(X_test_scaled)
x_valid_subset = pca.transform(X_valid_scaled)

### Hyperparameter tuning and train testing

In [28]:
def train_model(x_train, y_train, x_test, y_test, model):
    
    if y_train.shape[1] > 1:
        multi_output_model = MultiOutputClassifier(model)
        multi_output_model.fit(x_train, y_train)

        if hasattr(model, "predict_proba"):
            y_test_preds_proba = pd.DataFrame({disease: probs[:, 1] for disease, probs in zip(diseases, multi_output_model.predict_proba(x_test))}) # Dataframe with probabilites 
        else:
            y_test_preds_proba = None

    else:
        print("Single disease")
        model.fit(x_train, y_train)
        if hasattr(model, "predict_proba"):
            # Dataframe with probabilities for the positive class
            y_test_preds_proba = pd.DataFrame(model.predict_proba(x_test)[:, 1], columns=['Probability'])
        else:
            y_test_preds_proba = None


    return y_test_preds_proba 


In [None]:
# Random search for hyperparameters
# Starting large, then reduce the search space

param_dist = {
    'learning_rate':    [0.01, 0.05, 0.07, 0.1],
    'n_estimators':     [100, 150, 200, 250],
    'max_depth':        [5, 10, 15],
    'min_child_weight': [5, 10],
    'gamma':            [0.1],
    'subsample':        [1.0],
    'colsample_bytree': [0.8],
    'reg_lambda':       [0],
    'reg_alpha':        [10],
    'scale_pos_weight': [1]
}

n_iter = 50
sampler = list(ParameterSampler(param_dist, n_iter=n_iter, random_state=42))

results = []
for params in tqdm(sampler, desc="Hyperparameter search"):
    # always fix these two
    params_fixed = {
        'use_label_encoder': False,
        'eval_metric':        'logloss',
        'random_state':       42,
        **params
    }
    model = XGBClassifier(**params_fixed)

    # train & get probability predictions
    y_pred_proba = train_model(
        x_train=x_train_subset,
        y_train=y_train,
        x_test= x_valid_subset,
        y_test= y_valid,
        model= model
    )

    predictions = y_pred_proba.values
    targets = y_valid.values

    y_pred_df = pd.DataFrame(y_pred_proba)
    y_pred_df.columns = diseases
    y_valid_df = pd.DataFrame(y_valid)
    y_valid_df.columns = diseases

    metrics = compute_metrics(y_pred_df, y_valid_df, diseases)

    metrics_female, metrics_male, metrics_white, metrics_black, metrics_asian, metrics_young, metrics_old = compute_metrics_subcath(predictions, targets, diseases, y_sex_valid, y_race_valid, y_age_valid)
    styled_df, df = bias_table(metrics, metrics_female, metrics_male, metrics_white, metrics_black, metrics_asian, metrics_young, metrics_old)

    score = (df['AUPRC'].mean() - (df['Delta AUPRC sex'].mean() + df['Delta AUPRC race'].mean() + df['Delta AUPRC age'].mean()))

    results.append({
        **params_fixed,
        'objective (higher better)': score
    })

# turn into a DataFrame and sort
results_df = pd.DataFrame(results)
results_df = results_df.sort_values('objective (higher better)', ascending=False)

# best row  
best = results_df.iloc[0]
print("Best params:\n", best.to_dict())


In [30]:
dataframes = []

for i in range(5):

    x_train_shuffled, y_train_shuffled = shuffle(x_train_subset, y_train, random_state=i)
    

    xgb_model = XGBClassifier(
    use_label_encoder=False,         # Disables the deprecated label encoder warning
    eval_metric='logloss',           # Evaluation metric for validation data
    learning_rate=0.05,              # Step size shrinkage used to prevent overfitting
    n_estimators=150,                # Number of gradient boosted trees. Equivalent to the number of boosting rounds
    max_depth=10,                    # Maximum depth of a tree
    min_child_weight=5,              # Minimum sum of instance weight (hessian) needed in a child
    gamma=0.1,                       # Minimum loss reduction required to make a further partition on a leaf node of the tree
    subsample=1.0,                   # Subsample ratio of the training instances
    colsample_bytree=0.8,            # Subsample ratio of columns when constructing each tree
    colsample_bylevel=1,             # Subsample ratio of columns for each level
    colsample_bynode=1,              # Subsample ratio of columns for each split
    reg_lambda=0,                    # L2 regularization term on weights
    reg_alpha=10,                    # L1 regularization term on weights
    scale_pos_weight=1,              # Balancing of positive and negative weights
    random_state=i                  
)
    
 
    y_pred = train_model(
        x_train=x_train_shuffled, 
        y_train=y_train_shuffled, 
        x_test=x_test_subset, 
        y_test=y_test, 
        model=xgb_model, 
    )

    predictions = y_pred.values
    targets = y_test.values

    y_pred_df = pd.DataFrame(y_pred)
    y_pred_df.columns = diseases
    y_test_df = pd.DataFrame(y_test)
    y_test_df.columns = diseases

    metrics = compute_metrics(y_pred_df, y_test_df, diseases)
    metrics_female, metrics_male, metrics_white, metrics_black, metrics_asian, metrics_young, metrics_old = compute_metrics_subcath(predictions, targets, diseases, y_sex, y_race, y_age)
    styled_df, df = bias_table_auprc(metrics, metrics_female, metrics_male, metrics_white, metrics_black, metrics_asian, metrics_young, metrics_old)

    dataframes.append(df)



Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encode

In [31]:
combined_df = pd.concat(dataframes, keys=range(len(dataframes)))

combined_df = combined_df[['AUPRC', 'AUC', 'Delta AUPRC sex', 'Delta AUPRC race', 'Delta AUPRC age']]

df_mean = combined_df.groupby(level=1).mean()
df_std = combined_df.groupby(level=1).std() 

df_mean.reset_index(drop=True, inplace=True)
df_std.reset_index(drop=True, inplace=True)

df_mean.insert(0, 'disease', diseases)
df_std.insert(0, 'disease', diseases)


### Visualize final results

In [34]:
# Styling the DataFrame
styled_df = df_mean.style.format({
    'AUPRC': "{:.1f}",
    'AUC': "{:.1f}",
    'Delta AUPRC sex': "{:.1f}",
    'Delta AUPRC race': "{:.1f}",
    'Delta AUPRC age': "{:.1f}",
}).background_gradient(cmap='OrRd', subset=[
    'AUPRC', 'AUC', 'Delta AUPRC sex', 'Delta AUPRC race', 'Delta AUPRC age'
])
styled_df

Unnamed: 0,disease,AUPRC,AUC,Delta AUPRC sex,Delta AUPRC race,Delta AUPRC age
0,Cardiomegaly,39.6,80.7,4.3,17.2,0.6
1,Lung Opacity,64.3,69.3,0.4,1.3,0.9
2,Edema,50.5,78.5,0.6,6.0,5.7
3,Pleural Effusion,71.2,81.4,1.0,8.1,3.7
