In [130]:
import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score, precision_recall_curve, roc_curve, auc, confusion_matrix, classification_report
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

In [131]:
preds = pd.read_csv('../preds/preds_InceptionV3_radimagenet_4_256_all.csv')
preds.head()

Unnamed: 0,file_path,y_true,y_pred
0,../data/split_1/train/0/104_new.jpg,0.0,0.56014
1,../data/split_1/train/0/104_old.jpg,0.0,0.463017
2,../data/split_1/train/0/107_old.jpg,0.0,0.544784
3,../data/split_1/train/0/108_old.jpg,0.0,0.553695
4,../data/split_1/train/0/109_old.jpg,0.0,0.545951


In [132]:
preds['y_pred'].describe()

count    821.000000
mean       0.551024
std        0.065880
min        0.077089
25%        0.534776
50%        0.553751
75%        0.579704
max        0.750141
Name: y_pred, dtype: float64

In [133]:
preds.shape

(821, 3)

In [134]:
test_preds = preds.loc[preds['file_path'].str.contains('test')]
test_auroc = roc_auc_score(test_preds['y_true'], test_preds['y_pred'])
test_preds = preds.loc[preds['file_path'].str.contains('train')]
train_auroc = roc_auc_score(test_preds['y_true'], test_preds['y_pred'])
test_preds = preds.loc[preds['file_path'].str.contains('val')]
val_auroc = roc_auc_score(test_preds['y_true'], test_preds['y_pred'])
print('Test AUROC: ', test_auroc)
print('Train AUROC: ', train_auroc)
print('Val AUROC: ', val_auroc)

Test AUROC:  0.7069171138938581
Train AUROC:  0.681640495593984
Val AUROC:  0.6248157972295904


In [135]:
old_preds = preds.loc[~preds['file_path'].str.contains('new')]
old_auroc = roc_auc_score(old_preds['y_true'], old_preds['y_pred'])
new_preds = preds.loc[preds['file_path'].str.contains('new')]
new_auroc = roc_auc_score(new_preds['y_true'], new_preds['y_pred'])
print('Old AUROC: ', old_auroc)
print('New AUROC: ', new_auroc)

Old AUROC:  0.6540448188175462
New AUROC:  0.6837050054538842


In [136]:
#read in the clinical info
clin = pd.read_csv('../data/new_clin_clean.csv')
clin.head()

Unnamed: 0,study_id,age,race/ethnicity,bmi,density,tumor_grade,tumor_size,tumor_type,margin
0,573,68.0,Non-Hispanic White,26.05,C,2.0,5.0,ILC,0.0
1,95,59.0,Non-Hispanic Black,32.58,A,3.0,15.0,IDC,1.0
2,748,44.0,Non-Hispanic White,25.39,C,3.0,8.0,IDC,1.0
3,391,52.0,Non-Hispanic Black,40.57,B,2.0,10.0,IDC,1.0
4,79,51.0,Non-Hispanic White,32.46,A,1.0,12.0,IDC,1.0


In [137]:
preds_new = preds.loc[preds['file_path'].str.contains('_new')]
preds_new.head()

Unnamed: 0,file_path,y_true,y_pred
0,../data/split_1/train/0/104_new.jpg,0.0,0.56014
6,../data/split_1/train/0/110_new.jpg,0.0,0.634918
9,../data/split_1/train/0/122_new.jpg,0.0,0.558098
15,../data/split_1/train/0/138_new.jpg,0.0,0.240106
18,../data/split_1/train/0/145_new.jpg,0.0,0.498927


In [138]:
preds_new.shape

(371, 3)

In [139]:
file_paths = preds_new['file_path'].tolist()
file_paths1 = [x.split('/')[-1] for x in file_paths]
study_ids = [x.split('_')[0] for x in file_paths1]
preds_new['study_id'] = study_ids
preds_new.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  preds_new['study_id'] = study_ids


Unnamed: 0,file_path,y_true,y_pred,study_id
0,../data/split_1/train/0/104_new.jpg,0.0,0.56014,104
6,../data/split_1/train/0/110_new.jpg,0.0,0.634918,110
9,../data/split_1/train/0/122_new.jpg,0.0,0.558098,122
15,../data/split_1/train/0/138_new.jpg,0.0,0.240106,138
18,../data/split_1/train/0/145_new.jpg,0.0,0.498927,145


In [140]:
test_study_ids = preds_new['study_id'].tolist()
test_study_ids = [int(x) for x in test_study_ids]
test_clin = clin.loc[clin['study_id'].isin(test_study_ids)]
print(test_clin.shape)
test_clin.head()

(371, 9)


Unnamed: 0,study_id,age,race/ethnicity,bmi,density,tumor_grade,tumor_size,tumor_type,margin
0,573,68.0,Non-Hispanic White,26.05,C,2.0,5.0,ILC,0.0
1,95,59.0,Non-Hispanic Black,32.58,A,3.0,15.0,IDC,1.0
2,748,44.0,Non-Hispanic White,25.39,C,3.0,8.0,IDC,1.0
3,391,52.0,Non-Hispanic Black,40.57,B,2.0,10.0,IDC,1.0
4,79,51.0,Non-Hispanic White,32.46,A,1.0,12.0,IDC,1.0


In [141]:
test_clin['study_id'] = test_clin['study_id'].astype(int)
test_clin.sort_values(by=['study_id'], inplace=True)
test_clin.reset_index(drop=True, inplace=True)
test_clin.head()

Unnamed: 0,study_id,age,race/ethnicity,bmi,density,tumor_grade,tumor_size,tumor_type,margin
0,1,46.0,Asian,18.97,D,2.0,10.0,IDC,1.0
1,2,47.0,Hispanic,40.28,B,2.0,4.0,IDC,1.0
2,3,61.0,Non-Hispanic White,24.44,C,2.0,7.553571,DCIS,1.0
3,4,59.0,Non-Hispanic White,24.21,C,1.0,10.0,IDC,1.0
4,5,40.0,Non-Hispanic White,21.8,C,3.0,11.0,IDC,0.0


In [142]:
preds_new['study_id'] = preds_new['study_id'].astype(int)
preds_new.sort_values(by=['study_id'], inplace=True)
preds_new.reset_index(drop=True, inplace=True)
preds_new.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  preds_new['study_id'] = preds_new['study_id'].astype(int)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return func(*args, **kwargs)


Unnamed: 0,file_path,y_true,y_pred,study_id
0,../data/split_1/train/1/1_new.jpg,1.0,0.52082,1
1,../data/split_1/train/1/2_new.jpg,1.0,0.73601,2
2,../data/split_1/train/1/3_new.jpg,1.0,0.571732,3
3,../data/split_1/train/1/4_new.jpg,1.0,0.714353,4
4,../data/split_1/train/0/5_new.jpg,0.0,0.154633,5


In [143]:
roc_auc_score(preds_new['y_true'], preds_new['y_pred'])

0.6837050054538842

In [144]:
test_clin['y_pred'] = np.NaN
test_clin.update(preds_new)
test_clin.head()

Unnamed: 0,study_id,age,race/ethnicity,bmi,density,tumor_grade,tumor_size,tumor_type,margin,y_pred
0,1,46.0,Asian,18.97,D,2.0,10.0,IDC,1.0,0.52082
1,2,47.0,Hispanic,40.28,B,2.0,4.0,IDC,1.0,0.73601
2,3,61.0,Non-Hispanic White,24.44,C,2.0,7.553571,DCIS,1.0,0.571732
3,4,59.0,Non-Hispanic White,24.21,C,1.0,10.0,IDC,1.0,0.714353
4,5,40.0,Non-Hispanic White,21.8,C,3.0,11.0,IDC,0.0,0.154633


In [145]:
roc_auc_score(test_clin['margin'], test_clin['y_pred'])

0.6837050054538842

In [146]:
test_clin.set_index('study_id', inplace=True)
test_clin.head()

Unnamed: 0_level_0,age,race/ethnicity,bmi,density,tumor_grade,tumor_size,tumor_type,margin,y_pred
study_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,46.0,Asian,18.97,D,2.0,10.0,IDC,1.0,0.52082
2,47.0,Hispanic,40.28,B,2.0,4.0,IDC,1.0,0.73601
3,61.0,Non-Hispanic White,24.44,C,2.0,7.553571,DCIS,1.0,0.571732
4,59.0,Non-Hispanic White,24.21,C,1.0,10.0,IDC,1.0,0.714353
5,40.0,Non-Hispanic White,21.8,C,3.0,11.0,IDC,0.0,0.154633


In [147]:
roc_auc_score(test_clin['margin'], test_clin['y_pred'])

0.6837050054538842

In [148]:
cat_cols = ['race/ethnicity', 'density', 'tumor_grade', 'tumor_type']
for col in cat_cols:
    test_clin[col] = test_clin[col].astype('category')

In [149]:
densities = test_clin['density'].cat.categories.tolist()
race_eths = test_clin['race/ethnicity'].cat.categories.tolist()
tumor_grades = test_clin['tumor_grade'].cat.categories.tolist()
tumor_types = test_clin['tumor_type'].cat.categories.tolist()

In [150]:
#make a dataframe with auroc, sensitivity, specificity, positive predictive value, negative predictive value
res_df = pd.DataFrame(columns = ['subset', 'AUROC', 'AUPRC', 'Sensitivity', 'Specificity', 'PPV', 'NPV'])
for density in densities:
    subset = test_clin.loc[test_clin['density'] == density]
    y_true = subset['margin']
    y_pred = subset['y_pred']
    auroc = roc_auc_score(y_true, y_pred)
    precision, recall, thresholds = precision_recall_curve(y_true, y_pred)
    auprc = auc(recall, precision)
    fpr, tpr, thresholds = roc_curve(y_true, y_pred)
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred.round()).ravel()
    sensitivity = tp/(tp+fn)
    specificity = tn/(tn+fp)
    ppv = tp/(tp+fp)
    npv = tn/(tn+fn)
    res_df = res_df.append({'subset': density, 'AUROC': auroc, 'AUPRC': auprc, 'Sensitivity': sensitivity, 'Specificity': specificity, 'PPV': ppv, 'NPV': npv}, ignore_index=True)
res_df = res_df.round(3)
res_df.head()

Unnamed: 0,subset,AUROC,AUPRC,Sensitivity,Specificity,PPV,NPV
0,A,0.758,0.877,0.909,0.333,0.714,0.667
1,B,0.665,0.752,0.944,0.175,0.66,0.647
2,C,0.741,0.766,0.961,0.213,0.603,0.812
3,D,0.54,0.619,0.897,0.222,0.65,0.571


In [151]:
res_df.to_csv('../results/res_density.csv', index=False)

In [152]:
pd.crosstab(test_clin['margin'], test_clin['race/ethnicity'])

race/ethnicity,Asian,Hispanic,Non-Hispanic Black,Non-Hispanic White,Other/Unknown
margin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0.0,7,9,25,104,3
1.0,5,20,42,156,0


In [153]:
race_eths.remove('Other/Unknown')

In [154]:
#make a dataframe with auroc, sensitivity, specificity, positive predictive value, negative predictive value
res_df = pd.DataFrame(columns = ['subset', 'AUROC', 'AUPRC', 'Sensitivity', 'Specificity', 'PPV', 'NPV'])
for density in race_eths:
    subset = test_clin.loc[test_clin['race/ethnicity'] == density]
    y_true = subset['margin']
    y_pred = subset['y_pred']
    auroc = roc_auc_score(y_true, y_pred)
    precision, recall, thresholds = precision_recall_curve(y_true, y_pred)
    auprc = auc(recall, precision)
    fpr, tpr, thresholds = roc_curve(y_true, y_pred)
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred.round()).ravel()
    sensitivity = tp/(tp+fn)
    specificity = tn/(tn+fp)
    ppv = tp/(tp+fp)
    npv = tn/(tn+fn)
    res_df = res_df.append({'subset': density, 'AUROC': auroc, 'AUPRC': auprc, 'Sensitivity': sensitivity, 'Specificity': specificity, 'PPV': ppv, 'NPV': npv}, ignore_index=True)
res_df = res_df.round(3)
res_df.head()

Unnamed: 0,subset,AUROC,AUPRC,Sensitivity,Specificity,PPV,NPV
0,Asian,0.771,0.802,1.0,0.286,0.5,1.0
1,Hispanic,0.733,0.831,0.95,0.111,0.704,0.5
2,Non-Hispanic Black,0.726,0.83,0.905,0.2,0.655,0.556
3,Non-Hispanic White,0.658,0.718,0.949,0.202,0.641,0.724


In [155]:
res_df.to_csv('../results/res_raceth.csv', index=False)

In [123]:
test_clin['Race/Ethnicity'] = np.NaN
test_clin.loc[test_clin['race/ethnicity'] == 'Non-Hispanic White', 'Race/Ethnicity'] = 'Non-Hispanic White'
test_clin['Race/Ethnicity'].fillna('Non-White', inplace=True)
test_clin['Race/Ethnicity'].value_counts(dropna=False)

Non-Hispanic White    260
Non-White             111
Name: Race/Ethnicity, dtype: int64

In [124]:
raceth_density = pd.crosstab(test_clin['density'], test_clin['Race/Ethnicity'])
raceth_density


Race/Ethnicity,Non-Hispanic White,Non-White
density,Unnamed: 1_level_1,Unnamed: 2_level_1
A,9,8
B,121,49
C,93,44
D,37,10


In [125]:
raceth_density_norm = pd.crosstab(test_clin['density'], test_clin['Race/Ethnicity'], normalize='columns')
raceth_density_norm

Race/Ethnicity,Non-Hispanic White,Non-White
density,Unnamed: 1_level_1,Unnamed: 2_level_1
A,0.034615,0.072072
B,0.465385,0.441441
C,0.357692,0.396396
D,0.142308,0.09009


In [126]:
raceth_density.to_csv('../results/raceth_density.csv')

In [127]:
#show the auroc, auprc, sensitivity, specificity, PPV, and NPV for the entire test set
y_true = test_clin['margin']
y_pred = test_clin['y_pred']
auroc = roc_auc_score(y_true, y_pred)
precision, recall, thresholds = precision_recall_curve(y_true, y_pred)
auprc = auc(recall, precision)
fpr, tpr, thresholds = roc_curve(y_true, y_pred)
tn, fp, fn, tp = confusion_matrix(y_true, y_pred.round()).ravel()
sensitivity = tp/(tp+fn)
specificity = tn/(tn+fp)
ppv = tp/(tp+fp)
npv = tn/(tn+fn)
res_df = res_df.append({'subset': 'All', 'AUROC': auroc, 'AUPRC': auprc, 'Sensitivity': sensitivity, 'Specificity': specificity, 'PPV': ppv, 'NPV': npv}, ignore_index=True)
res_df

Unnamed: 0,subset,AUROC,AUPRC,Sensitivity,Specificity,PPV,NPV
0,Asian,0.771429,0.801587,1.0,0.285714,0.5,1.0
1,Hispanic,0.733333,0.831003,0.95,0.111111,0.703704,0.5
2,Non-Hispanic Black,0.725714,0.830392,0.904762,0.2,0.655172,0.555556
3,Non-Hispanic White,0.657853,0.718263,0.948718,0.201923,0.640693,0.724138
4,All,0.683705,0.747908,0.941704,0.202703,0.640244,0.697674
