# Prediction from DenseNet embeddings

In [2]:
import pandas as pd
import numpy as np
import ast
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, confusion_matrix, roc_auc_score, precision_recall_curve, auc
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset, WeightedRandomSampler
import matplotlib.pyplot as plt
from torch.optim.lr_scheduler import ReduceLROnPlateau
import torch.optim as optim
from torch.optim.lr_scheduler import StepLR



In [3]:
import sys

# Extend the module search path so the local `metrics` module imported below can
# be resolved.
# NOTE(review): this is an absolute, user-specific path; the notebook will fail on
# any other machine unless the repository is checked out at the same location.
sys.path.append('/home/cmottez/CS231N_Lightweight_Bias_Mitigation_Chest_Xray')

# Project-local evaluation helpers (overall metrics, per-subgroup metrics,
# KL-divergence helpers and bias-table builders).
from metrics import (
    compute_metrics,
    compute_kl_divergence_sex,
    compute_kl_divergence_age,
    compute_kl_divergence_race,
    compute_metrics_subcath,
    bias_table,
    bias_table_auprc,
)

### Read the CSV with the baseline test predictions

In [4]:
# Load the baseline model's test-set predictions together with the ground-truth
# labels and demographic columns.
# NOTE(review): on_bad_lines='skip' silently drops malformed rows, so the number
# of evaluated samples may be smaller than the number of lines in the file.
test = pd.read_csv(
    "../data/embeddings_chexpert/CNNs/chexpert_on_chexpert_my_test_predictions.csv",
    quotechar='"',
    on_bad_lines='skip',
)
test.head()

Unnamed: 0,path_to_image,path_to_dcm,age,sex,race,insurance_type,No Finding,Enlarged Cardiomediastinum,Cardiomegaly,Lung Opacity,...,Pneumonia.1,Pleural_Thickening,Cardiomegaly.1,Nodule,Mass,Hernia,Lung Lesion.1,Fracture.1,Lung Opacity.1,Enlarged Cardiomediastinum.1
0,train/patient47347/study3/view1_frontal.jpg,train/patient47347/study3/view1_frontal.dcm,78.0,1,0,1,0,0,1,0,...,0.627497,0.159013,0.965829,0.245335,0.534185,0.553511,0.678016,0.731546,0.953525,0.894168
1,train/patient37527/study12/view1_frontal.jpg,train/patient37527/study12/view1_frontal.dcm,63.0,0,1,2,0,0,0,0,...,0.664034,0.539082,0.612635,0.505365,0.536698,0.025208,0.725883,0.604588,0.869167,0.654336
2,train/patient41208/study9/view1_frontal.jpg,train/patient41208/study9/view1_frontal.dcm,70.0,0,0,1,0,0,0,1,...,0.651837,0.46913,0.544333,0.543463,0.663163,0.308127,0.71772,0.568348,0.767912,0.598184
3,train/patient39357/study1/view1_frontal.jpg,train/patient39357/study1/view1_frontal.dcm,79.0,1,1,1,0,0,0,0,...,0.509472,0.19304,0.645339,0.504483,0.505109,0.032462,0.509491,0.523578,0.619305,0.536065
4,train/patient31982/study4/view1_frontal.jpg,train/patient31982/study4/view1_frontal.dcm,67.0,0,0,0,0,0,0,0,...,0.598874,0.513309,0.771735,0.524469,0.643032,0.071618,0.702488,0.580152,0.854334,0.737613


### Convert age to binary to study bias

In [5]:
# Age threshold (presumably in years) separating the two age groups used in the
# bias analysis.
a = 70

# Preserve the raw ages the first time this cell runs. Without this, re-running
# the cell (or re-running it with a different `a`) would compare the already
# binarized 0/1 values against the threshold and silently turn the whole column
# into zeros.
if 'age_years' not in test.columns:
    test['age_years'] = test['age']

# 1 -> age >= a, 0 -> age < a.
# NOTE(review): rows with a missing age compare as False and therefore end up in
# group 0 — confirm this is the intended handling.
test['age'] = (test['age_years'] >= a).astype(int)

In [6]:
# Map every -1 in the frame to 0, in place.
# NOTE(review): presumably -1 encodes an "uncertain" label; the replacement is
# applied to all columns (not only the label columns) — confirm no other column
# legitimately contains -1.
test.replace(to_replace=-1, value=0, inplace=True)

### Visualize distribution of diseases

In [7]:
# Label columns of the test frame whose 0/1 balance is reported below.
col = [
    'Enlarged Cardiomediastinum', 'Cardiomegaly', 'Lung Opacity', 'Lung Lesion',
    'Edema', 'Consolidation', 'Pneumonia', 'Atelectasis', 'Pneumothorax',
    'Pleural Effusion', 'Pleural Other', 'Fracture', 'No Finding',
]

print("Distribution in test set")
for disease in col:
    # Frequency of each distinct value in this label column; a value that never
    # occurs is reported as 0.
    counts = test[disease].value_counts()
    n_negative = counts.get(0, 0)
    n_positive = counts.get(1, 0)
    print(f"{disease}:- 0s: {n_negative}, 1s: {n_positive}")


Distribution in test set
Enlarged Cardiomediastinum:- 0s: 38374, 1s: 1984
Cardiomegaly:- 0s: 35384, 1s: 4974
Lung Opacity:- 0s: 20481, 1s: 19877
Lung Lesion:- 0s: 38860, 1s: 1498
Edema:- 0s: 30666, 1s: 9692
Consolidation:- 0s: 37860, 1s: 2498
Pneumonia:- 0s: 39374, 1s: 984
Atelectasis:- 0s: 34141, 1s: 6217
Pneumothorax:- 0s: 36250, 1s: 4108
Pleural Effusion:- 0s: 24455, 1s: 15903
Pleural Other:- 0s: 39790, 1s: 568
Fracture:- 0s: 38750, 1s: 1608
No Finding:- 0s: 36690, 1s: 3668


In [8]:
# Diseases to predict
diseases = ['Cardiomegaly', 'Lung Opacity', 'Edema', 'Pleural Effusion']

# Ground-truth labels for the selected diseases, plus the "No Finding" label.
y_test = test[diseases]
y_no_finding = test["No Finding"]

# Demographic attributes used to split the evaluation into subgroups.
y_sex = test['sex']
y_race = test['race']
y_insurance = test['insurance_type']
y_age = test['age']

# Prediction columns of the baseline model, mapped to the label name each one is
# compared against, so predictions and targets share the same column names.
prediction_to_label = {
    'Cardiomegaly.1': 'Cardiomegaly',
    'Lung Opacity.1': 'Lung Opacity',
    'Edema.1': 'Edema',
    'Effusion': 'Pleural Effusion',
}
y_pred = test[list(prediction_to_label)].rename(columns=prediction_to_label)

# Plain arrays for the subgroup helper, which is called with arrays rather than
# DataFrames.
predictions = y_pred.values
targets = y_test.values

# Overall metrics for the selected diseases.
metrics = compute_metrics(pd.DataFrame(y_pred), pd.DataFrame(y_test), diseases)

# KL-divergence helpers, one per demographic attribute (see the `metrics` module
# for what exactly is compared).
kl_divergence_results_sex = compute_kl_divergence_sex(y_test, y_pred, y_sex)
kl_divergence_results_age = compute_kl_divergence_age(y_test, y_pred, y_age)
kl_divergence_results_race = compute_kl_divergence_race(y_test, y_pred, y_race)

# Per-subgroup metrics, unpacked in the order the helper returns them.
(
    metrics_female,
    metrics_male,
    metrics_white,
    metrics_black,
    metrics_asian,
    metrics_young,
    metrics_old,
) = compute_metrics_subcath(predictions, targets, diseases, y_sex, y_race, y_age)

# Combine the overall and per-subgroup metrics into the bias table.
styled_df, df_my_test = bias_table_auprc(
    metrics,
    metrics_female,
    metrics_male,
    metrics_white,
    metrics_black,
    metrics_asian,
    metrics_young,
    metrics_old,
)
