In [1]:
import pandas as pd
import statistics as stat
import json
import os, re

# Get bbox annotation results

In [2]:
# Maps annotator to study order (1st or 2nd study for each of the 500 patients)
annotators_dict = {1:['C','W'],2:['S','J']}
# Annotator S was paired with J, and C with W to independently dual annotate the images

In [3]:
# These bboxes are more common described/annotated in the nlp ground truth
bboxlist = ['aortic arch', 'svc', 'trachea', 'carina', 'upper mediastinum'
            , 'cavoatrial junction', 'right atrium', 'cardiac silhouette'
            #, 'mediastinum' # mediastinum was re-derived from annotations from cardiac silhouette and upper mediastinum
            , 'left apical zone', 'left upper lung zone', 'left mid lung zone', 'left lower lung zone', 'left lung'
            , 'right apical zone' , 'right upper lung zone', 'right mid lung zone', 'right lower lung zone', 'right lung'
            , 'left clavicle', 'right clavicle'
            , 'left hilar structures', 'right hilar structures'
            , 'left costophrenic angle', 'right costophrenic angle'
            , 'left hemidiaphragm', 'right hemidiaphragm'
            #, 'left upper abdomen', 'right upper abdomen' # not in NLP evaluation
            #,'abdomen' # not directly annotated but can be derived from the hemidiaphragms to lower border of CXR images
            ]
print(len(bboxlist))

26


In [4]:
# Cleaned annotations are in the cleaned directory

study_order = 1
# for annotator 1 for the study order
annot1 = pd.read_csv('../gold_dataset/bbox_coordinate_annotations_'+str(study_order)+'_'+annotators_dict[study_order][0]+'.csv')
annot1 = annot1[annot1.bbox_name.isin(bboxlist)].reset_index(drop=True).copy()
annot1['annotator'] = annotators_dict[study_order][0]
print(annot1.shape)

# for annotator 2 for the study order
annot2 = pd.read_csv('../gold_dataset/bbox_coordinate_annotations_'+str(study_order)+'_'+annotators_dict[study_order][1]+'.csv')
annot2 = annot2[annot2.bbox_name.isin(bboxlist)].reset_index(drop=True).copy()
annot2['annotator'] = annotators_dict[study_order][1]
print(annot2.shape)

study_order = 2
# for annotator 1 for the study order
annot3 = pd.read_csv('../gold_dataset/bbox_coordinate_annotations_'+str(study_order)+'_'+annotators_dict[study_order][0]+'.csv')
annot3 = annot3[annot3.bbox_name.isin(bboxlist)].reset_index(drop=True).copy()
annot3['annotator'] = annotators_dict[study_order][0]
print(annot3.shape)

# for annotator 2 for the study order
annot4 = pd.read_csv('../gold_dataset/bbox_coordinate_annotations_'+str(study_order)+'_'+annotators_dict[study_order][1]+'.csv')
annot4 = annot4[annot4.bbox_name.isin(bboxlist)].reset_index(drop=True).copy()
annot4['annotator'] = annotators_dict[study_order][1]
print(annot4.shape)

# combine
annot = pd.concat([annot1,annot2,annot3,annot4],ignore_index=True).copy()
print('all bbox annotations', annot.shape)
print(annot.shape)
annot.head()

(12995, 16)
(12995, 16)
(12997, 16)
(12932, 16)
all bbox annotations (51919, 16)
(51919, 16)


Unnamed: 0,image_id,x1,y1,x2,y2,width,height,bbox_name,annot_id,original_x1,original_x2,original_y1,original_y2,original_width,original_height,annotator
0,00046130-fd952ef0-57f2948d-491a16b4-5db3a18c.dcm,124.0,85.0,145.0,105.0,21,20,aortic arch,00046130-fd952ef0-57f2948d-491a16b4-5db3a18c.d...,1691.0,1978.0,900.0,1173.0,287.0,273.0,C
1,00046130-fd952ef0-57f2948d-491a16b4-5db3a18c.dcm,100.0,101.0,185.0,181.0,85,80,cardiac silhouette,00046130-fd952ef0-57f2948d-491a16b4-5db3a18c.d...,1364.0,2523.0,1118.0,2210.0,1159.0,1092.0,C
2,00046130-fd952ef0-57f2948d-491a16b4-5db3a18c.dcm,115.0,102.0,125.0,112.0,10,10,carina,00046130-fd952ef0-57f2948d-491a16b4-5db3a18c.d...,1568.0,1705.0,1132.0,1268.0,137.0,136.0,C
3,00046130-fd952ef0-57f2948d-491a16b4-5db3a18c.dcm,100.0,101.0,113.0,125.0,13,24,cavoatrial junction,00046130-fd952ef0-57f2948d-491a16b4-5db3a18c.d...,1364.0,1541.0,1118.0,1446.0,177.0,328.0,C
4,00046130-fd952ef0-57f2948d-491a16b4-5db3a18c.dcm,127.0,44.0,180.0,76.0,53,32,left apical zone,00046130-fd952ef0-57f2948d-491a16b4-5db3a18c.d...,1732.0,2455.0,341.0,777.0,723.0,436.0,C


In [5]:
# annotations were collected with 224x224 resized images
annot['coord224'] = [[x1,y1,x2,y2] for x1,y1,x2,y2 in zip(annot['x1'],annot['y1'],annot['x2'],annot['y2'])]

# Sanity check -- got all images in the annotations

In [6]:
# Annotated the first and exam CXR image for 500 patients
dfimages = pd.read_csv('../gold_dataset/gold_all_sentences_500pts_1000studies.txt',sep='\t')
keep = ['patient_id', 'study_id', 'image_id', 'StudyOrder']
dfimages = dfimages[keep].drop_duplicates().reset_index(drop=True).copy()
print(dfimages.shape)
dfimages.head()

(1000, 4)


Unnamed: 0,patient_id,study_id,image_id,StudyOrder
0,10020740,55522869,27776756-1d9ef4fc-cd8dd0ca-1453072f-12c0f484.dcm,1
1,10020740,58116104,d3dbb519-1ea6cf3c-bb4c1fd8-79bb117a-1dc3869f.dcm,2
2,10037020,50400085,9e66ccc9-98952e0b-ee215d63-7b8090e9-c941aae1.dcm,2
3,10037020,58400371,76289ac1-3ef7c087-3e77810d-63462e2c-20c0364c.dcm,1
4,10063856,54814005,4bb710ab-ab7d4781-568bcd6e-5079d3e6-7fdb61b6.dcm,1


In [7]:
# This is the image order for the first CXR exam
imagefns1 = dfimages[dfimages.StudyOrder == 1].image_id.tolist()
# Sorting to make the order consistent between each kernel restart
imagefns1.sort()
print(len(imagefns1))
# This is the patient order following the imagefns1 order
ptOrder = [dfimages[(dfimages.StudyOrder == 1)&(dfimages.image_id==x)].patient_id.tolist()[0] for x in imagefns1]

order = 2

# Ordering the subsequent cxr exam following the same patient order
imagefns2 = [dfimages[(dfimages.StudyOrder == order)&(dfimages.patient_id==s)].image_id.tolist() for s in ptOrder]
imagefns2 = [x[0] for x in imagefns2 if len(x)>0]
print(len(imagefns2))

# Sanity check that the same images are all there
imagefnsCheck = dfimages[dfimages.StudyOrder == order].image_id.tolist()
print(len(set(imagefns2).intersection(imagefnsCheck)))

# This should be the same
print(len(imagefns2))
print(len(set(imagefns2)))

500
500
500
500
500


In [8]:
# sanity check - should all be 1000
images_annotated = set(imagefns1).union(set(imagefns2))
print(len(images_annotated))
print(len(set(annot.image_id)))
print(len(images_annotated.intersection(set(annot.image_id))))

1000
1000
1000


In [9]:
# 500 patients in final gold standard dataset
print('Final number of patients in gold standard:', len(set(dfimages[dfimages.image_id.isin(annot.image_id)].patient_id)))

Final number of patients in gold standard: 500


# Get inter-annotator IoU

In [10]:
# adapted from: https://www.pyimagesearch.com/2016/11/07/intersection-over-union-iou-for-object-detection/
def bb_intersection_over_union(boxA, boxB):
    # determine the (x, y)-coordinates of the intersection rectangle
    xA = max(boxA[0], boxB[0])
    yA = max(boxA[1], boxB[1])
    xB = min(boxA[2], boxB[2])
    yB = min(boxA[3], boxB[3])
    # compute the area of intersection rectangle
    interArea = max(0, xB - xA + 1) * max(0, yB - yA + 1)
    # compute the area of both the prediction and ground-truth
    # rectangles
    boxAArea = (boxA[2] - boxA[0] + 1) * (boxA[3] - boxA[1] + 1)
    boxBArea = (boxB[2] - boxB[0] + 1) * (boxB[3] - boxB[1] + 1)
    # compute the intersection over union by taking the intersection
    # area and dividing it by the sum of prediction + ground-truth
    # areas - the interesection area
    if float(boxAArea + boxBArea - interArea) != 0:
        iou = interArea / float(boxAArea + boxBArea - interArea)
    else:
        iou = 0
    # return the intersection over union value
    return iou

In [11]:
frames = []
# First annotators for the 1000 gold images
A = [annotators_dict[1][0],annotators_dict[2][0]]
# Second annotators for the 1000 gold images
B = [annotators_dict[1][1],annotators_dict[2][1]]

for box in bboxlist:
    annot_A = annot[annot.bbox_name == box].reset_index(drop=True).copy()
    annot_B = annot[annot.bbox_name==box].reset_index(drop=True).copy()
    # Add coordinates for B to df for A for IoU calculations
    annot_A['coord224_B'] = [annot_B[annot_B.annot_id == x]['coord224'].tolist()[0] 
                                 if len(annot_B[annot_B.annot_id == x])>0 else [0,0,0,0]
                                 for x in annot_A['annot_id']]
    annot_A['IoU'] = [bb_intersection_over_union(boxA, boxB ) 
                        if (sum(boxB)!=0)&(sum(boxA)!=0) else 0
                        for boxA, boxB in zip(annot_A['coord224'],annot_A['coord224_B'])]
    frames.append(annot_A)
    print(box, len(annot_A))

df_iou_inter = pd.concat(frames,ignore_index=True).copy()
print(df_iou_inter.shape)
print('Done')

aortic arch 2000
svc 2000
trachea 2000
carina 1994
upper mediastinum 2000
cavoatrial junction 2000
right atrium 2000
cardiac silhouette 2000
left apical zone 1988
left upper lung zone 2001
left mid lung zone 1992
left lower lung zone 1993
left lung 2000
right apical zone 1988
right upper lung zone 2002
right mid lung zone 1992
right lower lung zone 1991
right lung 2000
left clavicle 1996
right clavicle 1998
left hilar structures 1991
right hilar structures 1991
left costophrenic angle 2000
right costophrenic angle 2000
left hemidiaphragm 2000
right hemidiaphragm 2002
(51919, 19)
Done


In [12]:
df_iou_inter['different'] = [1 if iou < 1 else 0 for iou in df_iou_inter['IoU']]

In [13]:
keep = ['annot_id','bbox_name','IoU','different']
iou_summary_inter = df_iou_inter[keep].copy()
iou_summary_inter['mean_IoU'] = iou_summary_inter.groupby(['bbox_name'])['IoU'].transform(lambda x: round(stat.mean(x),3))
iou_summary_inter['total_freq'] = iou_summary_inter.groupby(['bbox_name'])['annot_id'].transform(lambda x: len(x))
iou_summary_inter['different_freq'] = iou_summary_inter.groupby(['bbox_name'])['different'].transform(lambda x: sum(x))
print(iou_summary_inter.shape)
# Get mean IoU for each bbox
keep = ['bbox_name','mean_IoU','total_freq','different_freq']
iou_summary_inter = iou_summary_inter[keep].drop_duplicates().reset_index(drop=True).copy()
print(iou_summary_inter.shape)
iou_summary_inter['%_different'] = [str(round(x/y*100,2))+' %' for x,y in zip(iou_summary_inter['different_freq'],iou_summary_inter['total_freq'])]

(51919, 7)
(26, 4)


In [14]:
iou_summary_inter

Unnamed: 0,bbox_name,mean_IoU,total_freq,different_freq,%_different
0,aortic arch,0.995,2000,15,0.75 %
1,svc,0.996,2000,10,0.5 %
2,trachea,0.997,2000,9,0.45 %
3,carina,0.997,1994,8,0.4 %
4,upper mediastinum,0.996,2000,15,0.75 %
5,cavoatrial junction,0.985,2000,44,2.2 %
6,right atrium,0.987,2000,41,2.05 %
7,cardiac silhouette,0.983,2000,97,4.85 %
8,left apical zone,0.986,1988,55,2.77 %
9,left upper lung zone,0.985,2001,66,3.3 %


# Get Ground Truth from the dual annotations

In [15]:
# Since pretty high agreement
# So will get average of 2 annotators for each bbox as ground truth
print(annot.columns.tolist())
coordinates = ['x1', 'y1', 'x2', 'y2', 'width', 'height', 
               'original_x1', 'original_x2', 'original_y1', 'original_y2', 'original_width', 'original_height']
print(annot.loc[0,'annot_id'])
print()

annot['annotator'] = annot.groupby(['image_id','bbox_name'])['annotator'].transform(lambda x: '&&'.join(x))

for col in coordinates:
    annot[col] = annot.groupby(['image_id','bbox_name'])[col].transform(lambda x: stat.mean(x))

print(annot.shape)
annot.drop(['coord224'],axis=1,inplace=True) # cannot set list
annot = annot.drop_duplicates().reset_index(drop=True).copy()
print(annot.shape)

['image_id', 'x1', 'y1', 'x2', 'y2', 'width', 'height', 'bbox_name', 'annot_id', 'original_x1', 'original_x2', 'original_y1', 'original_y2', 'original_width', 'original_height', 'annotator', 'coord224']
00046130-fd952ef0-57f2948d-491a16b4-5db3a18c.dcm|aortic arch

(51919, 17)
(25989, 16)


In [20]:
# annotations were collected with 224x224 resized images
annot['coord224'] = [[int(x1),int(y1),int(x2),int(y2)] for x1,y1,x2,y2 in zip(annot['x1'],annot['y1'],annot['x2'],annot['y2'])]
annot['coord_original'] = [[int(x1),int(y1),int(x2),int(y2)] for x1,y1,x2,y2 in zip(annot['original_x1'],annot['original_y1'],annot['original_x2'],annot['original_y2'])]


In [21]:
# This rights out the ground truth file for the annotated bboxes for 1000 images
annot.to_csv('../gold_dataset/gold_bbox_coordinate_annotations_1000images.csv',index=False,encoding='utf-8')

# Get automatically extracted Bbox coordinates

In [17]:
# This file was flattened to a table from the scene graphs for the 1000 images in advance
auto = pd.read_csv('../gold_dataset/auto_bbox_pipeline_coordinates_1000_images.txt',sep='\t')
print(auto.shape)
auto.head()

(25880, 18)


Unnamed: 0,object_id,x1,y1,x2,y2,width,height,bbox_name,synsets,name,original_x1,original_y1,original_x2,original_y2,original_width,original_height,image_id,annot_id
0,00637f42-a2f19a71-cd265165-0928eca9-c6d14eee_r...,48,19,107,130,59,111,right lung,['C0225706'],Right lung,395,259,1200,1773,805,1514,00637f42-a2f19a71-cd265165-0928eca9-c6d14eee.dcm,00637f42-a2f19a71-cd265165-0928eca9-c6d14eee.d...
1,00637f42-a2f19a71-cd265165-0928eca9-c6d14eee_r...,64,23,107,65,43,42,right upper lung zone,['C0934570'],Right upper lung zone,613,313,1200,886,587,573,00637f42-a2f19a71-cd265165-0928eca9-c6d14eee.dcm,00637f42-a2f19a71-cd265165-0928eca9-c6d14eee.d...
2,00637f42-a2f19a71-cd265165-0928eca9-c6d14eee_r...,60,65,104,89,44,24,right mid lung zone,['CL380307'],Right mid lung zone,559,886,1159,1214,600,328,00637f42-a2f19a71-cd265165-0928eca9-c6d14eee.dcm,00637f42-a2f19a71-cd265165-0928eca9-c6d14eee.d...
3,00637f42-a2f19a71-cd265165-0928eca9-c6d14eee_r...,48,89,104,130,56,41,right lower lung zone,['C0929214'],Right lower lung zone,395,1214,1159,1773,764,559,00637f42-a2f19a71-cd265165-0928eca9-c6d14eee.dcm,00637f42-a2f19a71-cd265165-0928eca9-c6d14eee.d...
4,00637f42-a2f19a71-cd265165-0928eca9-c6d14eee_r...,82,60,105,93,23,33,right hilar structures,['C1708369'],Hilar Area of the Right Lung,859,818,1173,1268,314,450,00637f42-a2f19a71-cd265165-0928eca9-c6d14eee.dcm,00637f42-a2f19a71-cd265165-0928eca9-c6d14eee.d...


In [18]:
# some annot bboxes not in auto --> # additional bboxes that were manually annotated
print(len(set(annot.annot_id).difference(auto.annot_id))) 
# all auto bboxes in annot
print(len(set(auto.annot_id).difference(annot.annot_id)))

109
0


In [19]:
# annotations were collected with 224x224 resized images
auto['coord224'] = [[x1,y1,x2,y2] for x1,y1,x2,y2 in zip(auto['x1'],auto['y1'],auto['x2'],auto['y2'])]

# Calculate Bbox detection performance:

Compare against automatically extracted results -- mean IOUs, number needing correction, etc for each bbox

In [20]:
frames = []
for box in bboxlist:
    auto_box = auto[auto.bbox_name == box].reset_index(drop=True).copy()
    annot_box = annot[annot.bbox_name==box].reset_index(drop=True).copy()
    annot_box['coord224_auto'] = [auto_box[auto_box.annot_id == x]['coord224'].tolist()[0] 
                                 if len(auto_box[auto_box.annot_id == x])>0 else [0,0,0,0]
                                 for x in annot_box['annot_id']]
    annot_box['IoU'] = [bb_intersection_over_union(boxA, boxB ) 
                        if (sum(boxB)!=0)&(sum(boxA)!=0) else 0
                        for boxA, boxB in zip(annot_box['coord224'],annot_box['coord224_auto'])]
    frames.append(annot_box)
    print(box, len(annot_box))

df_iou = pd.concat(frames,ignore_index=True).copy()
print(df_iou.shape)
print('Done')

aortic arch 1000
svc 1000
trachea 1000
carina 997
upper mediastinum 1000
cavoatrial junction 1000
right atrium 1000
cardiac silhouette 1000
left apical zone 998
left upper lung zone 1000
left mid lung zone 1000
left lower lung zone 1000
left lung 1000
right apical zone 998
right upper lung zone 1000
right mid lung zone 1000
right lower lung zone 999
right lung 1000
left clavicle 998
right clavicle 999
left hilar structures 1000
right hilar structures 1000
left costophrenic angle 1000
right costophrenic angle 1000
left hemidiaphragm 1000
right hemidiaphragm 1000
(25989, 19)
Done


In [21]:
df_iou['corrected'] = [1 if iou < 1 else 0 for iou in df_iou['IoU']]

In [22]:
keep = ['annot_id','bbox_name','IoU','corrected']
iou_summary = df_iou[keep].copy()
iou_summary['mean_IoU'] = iou_summary.groupby(['bbox_name'])['IoU'].transform(lambda x: round(stat.mean(x),3))
iou_summary['total_freq'] = iou_summary.groupby(['bbox_name'])['annot_id'].transform(lambda x: len(x))
iou_summary['corrected_freq'] = iou_summary.groupby(['bbox_name'])['corrected'].transform(lambda x: sum(x))

# Get mean IoU for each bbox
keep = ['bbox_name','mean_IoU','total_freq','corrected_freq']
iou_summary = iou_summary[keep].drop_duplicates().reset_index(drop=True).copy()
iou_summary['%_corrected'] = [str(round(x/y*100,2))+' %' for x,y in zip(iou_summary['corrected_freq'],iou_summary['total_freq'])]

In [23]:
iou_summary

Unnamed: 0,bbox_name,mean_IoU,total_freq,corrected_freq,%_corrected
0,aortic arch,0.991,1000,14,1.4 %
1,svc,0.995,1000,7,0.7 %
2,trachea,0.995,1000,9,0.9 %
3,carina,0.994,997,8,0.8 %
4,upper mediastinum,0.994,1000,14,1.4 %
5,cavoatrial junction,0.977,1000,43,4.3 %
6,right atrium,0.979,1000,40,4.0 %
7,cardiac silhouette,0.967,1000,97,9.7 %
8,left apical zone,0.963,998,62,6.21 %
9,left upper lung zone,0.968,1000,64,6.4 %
