In [1]:
import pandas as pd
import json
import os, re
import ast
import numpy as np

# Read comparison ground truth annotations

In [2]:
comp = pd.read_csv('../gold_dataset/gold_comparison_relations_500pts_500studies2nd.txt',sep='\t')
comp['comparison'] = [sorted(list(set(ast.literal_eval(x)))) for x in comp['comparison']]
# comp['bbox'] = ['' if str(x) =='nan' else x for x in comp['bbox']]
# comp['region'] = comp.groupby(['image_id','row_id'])['bbox'].transform(lambda x: '&&'.join(x))
# comp['region'] = [sorted(list(set(x.strip('&&').split('&&')))) for x in comp['region']]
print(comp.shape)
comp.head()

(5433, 15)


Unnamed: 0,patient_id,study_id,studyOrder,image_id,row_id,section,bbox,relation,label_name,context,categoryID,region,annot_id,sentence,comparison
0,10020740,58116104,2,d3dbb519-1ea6cf3c-bb4c1fd8-79bb117a-1dc3869f.dcm,58116104|5,finalreport,left lung,1.0,low lung volumes,yes,technicalassessment,"['left lung', 'right lung']",58116104|5|left lung|1|low lung volumes,"as compared to the previous radiograph, the l...",[no change]
1,10020740,58116104,2,d3dbb519-1ea6cf3c-bb4c1fd8-79bb117a-1dc3869f.dcm,58116104|5,finalreport,right lung,1.0,low lung volumes,yes,technicalassessment,"['left lung', 'right lung']",58116104|5|right lung|1|low lung volumes,"as compared to the previous radiograph, the l...",[no change]
2,10020740,58116104,2,d3dbb519-1ea6cf3c-bb4c1fd8-79bb117a-1dc3869f.dcm,58116104|9,finalreport,left costophrenic angle,0.0,pleural effusion,no,anatomicalfinding,"['left costophrenic angle', 'left lung', 'righ...",58116104|9|left costophrenic angle|0|pleural e...,no larger pleural effusions,[no change]
3,10020740,58116104,2,d3dbb519-1ea6cf3c-bb4c1fd8-79bb117a-1dc3869f.dcm,58116104|9,finalreport,left lung,0.0,pleural effusion,no,anatomicalfinding,"['left costophrenic angle', 'left lung', 'righ...",58116104|9|left lung|0|pleural effusion,no larger pleural effusions,[no change]
4,10020740,58116104,2,d3dbb519-1ea6cf3c-bb4c1fd8-79bb117a-1dc3869f.dcm,58116104|9,finalreport,right costophrenic angle,1.0,abnormal,yes,nlp,"['left costophrenic angle', 'left lung', 'righ...",58116104|9|right costophrenic angle|1|abnormal,no larger pleural effusions,[no change]


In [3]:
print('The second report study for the same 500 patients were annotated for comparison descriptions')
print('Number of unique report studies with any comparison descriptions:',len(set(comp.study_id)))
print('From number of unique patients with any comparison descriptions:',len(set(comp.patient_id)))

The second report study for the same 500 patients were annotated for comparison descriptions
Number of unique report studies with any comparison descriptions: 342
From number of unique patients with any comparison descriptions: 342


In [4]:
# Some of the comparison descriptions were non-specific and/or cannot be localized to an anatomical location even manually
# Excluding these, there remains:
comp = comp[(~comp.bbox.isnull())&(~comp.label_name.isnull())].reset_index(drop=True).copy()
print(len(set(comp.row_id)))
print(comp.shape)
print('Number of unique report studies with localized comparison relation descriptions:',len(set(comp.study_id)))
print('From number of unique patients with localized comparison relation descriptions:',len(set(comp.patient_id)))

638
(5156, 15)
Number of unique report studies with localized comparison relation descriptions: 290
From number of unique patients with localized comparison relation descriptions: 290


# Get all second studies for the 500 patients

In [5]:
# Read in the file with all the raw information for the gold standard dataset
gold = pd.read_csv('../gold_dataset/gold_all_sentences_500pts_1000studies.txt',sep='\t')
print('Number of unique patients:',len(set(gold.patient_id)))
print('Number of unique studies:',len(set(gold.study_id)))

# second CXR exam for each patient to assess for comparison relation extraction
imageIDs2 = gold[gold.StudyOrder == 2][['image_id']].drop_duplicates().image_id.tolist()
print('Number of second exam images', len(imageIDs2))

Number of unique patients: 500
Number of unique studies: 1000
Number of second exam images 500


# Read comparison relations from scene graphs to a table

In [6]:
# read json if path exists
def readJSON(filepath):
    try:
        with open(filepath) as f:
            data = json.load(f)
            return data
    except Exception as e:
        print('File does not exist',filepath)
        return None
    
    
#   Tokenizing sentences
def tokensent(sent):
    sent = sent.lower()
    sent = sent.replace('\n', ' ').replace('  ', ' ').replace(' ,', ',').replace('-', ' ') 
    sent = re.sub(r'\[.+?\]', '', sent)  # gets rid of the deidentification tags in mimic sentences
    return sent


# Get the comparison relationships between bboxes from sequential images into a table format from the scene graph json
def SceneGraphs2ComparisonRelationsTable(imageIDs, basepath, drop_duplicates=True):
    tab = {
           # pattern: row_id (for subject sentence) +'_'+ study_id (for object) +'_'+ bbox synsets (UMLS id) +'_'+ element 0
           # of object_id after split by '-'
          'relationship_id':[], 
           # dicom_id of the subject image + '_' + bbox_name
          'subject_id': [],
           # dicom_id of the object image + '_' + bbox_name
          'object_id': [],
           # bbox_name
          'bbox': [],
           # comparison name
          'comparison': [],
           # with respect to a label, context included
          'attribute': [],
           # The subject sentence describing the comparison relationship
          'sentence':[]
          }
    print(tab.keys())
    
    print('Getting the comparison relationships')
    for i, image_id in enumerate(imageIDs):
        if i % 1000 == 0: print('Processed comparison relations from ', i , ' scene graphs')
        filepath = os.path.join(basepath,str(image_id.replace('.dcm','')) + '_SceneGraph.json')
        data = readJSON(filepath)  
        
        comparisons = data['relationships']
        for comp in comparisons:
            compare = comp['relationship_names']
            compare = [x for x in compare if x in ['comparison|yes|worsened'
                                                   ,'comparison|yes|improved','comparison|yes|no change']]
            compare = ';;'.join(sorted([x.split('|')[2] for x in compare]))
            if len(compare)>0:
                attributes = comp['attributes']
                sent = tokensent(comp['phrase'])
                # Get 1 row per conditioned on attribute
                for attr in attributes:
                    subject_id = comp['subject_id']
                    object_id = comp['object_id']
                    # The comparison relation should be interpreted as subject_id <has relation as compared to> object_id
                    tab['relationship_id'].append(comp['relationship_id']) # relationship id that unique ID the comparison
                    tab['subject_id'].append(subject_id) # The bbox from the current image (NOT the patient id here)
                    tab['object_id'].append(object_id) # The bbox (same anatomy) from the previous image
                    tab['bbox'].append(comp['bbox_name']) # The anatomical location described (bbox)
                    tab['comparison'].append(compare) # The comparison relationships are in ['improved','worsened','no change']
                    tab['attribute'].append(attr) # The finding, disease, or tech assess label that conditions the relationship
                    tab['sentence'].append(sent) # The sentence in the current report that describes the comparison relationship
                    
    scene_comp = pd.DataFrame(tab).copy()
    print(scene_comp.shape)
    
    scene_comp['current_image_id'] = [x.split('_')[0]+'.dcm' for x in scene_comp['subject_id']]
    print(set(scene_comp.comparison))

    # Some preprocessing -- needs to be same as that done for experiment
    # Similar rationale as that for picking the attribute annotations
    # There may be multiple sents per bbox-comparison-attribute relation with different context for the attribute
    scene_comp['sent_loc'] = [float(x.split('_')[0].split('|')[-1]) for x in scene_comp['relationship_id']]
    scene_comp['label_name'] = [x.split('|')[-1] for x in scene_comp['attribute']]

    if drop_duplicates:
        # Sort by unique subject-object-label and sent_loc (order of sentences in a report)
        scene_comp = scene_comp.sort_values(by=['subject_id','object_id','sent_loc','bbox','label_name']).reset_index(drop=True).copy()
        print(scene_comp.shape)
        # keep last label context and comparison for subject-object-label combinations
        scene_comp = scene_comp.drop_duplicates(subset=['subject_id','object_id','bbox','label_name'],keep='last').reset_index(drop=True).copy()
        scene_comp.drop(['sent_loc','label_name'],axis=1,inplace=True)
        print(scene_comp.shape)

    # needs to be a list for evaluation analysis
    scene_comp['comparison'] = [sorted(list(set(x.split(';;')))) for x in scene_comp['comparison']]
    scene_comp['annot_id'] = ['|'.join([rel.split('_')[0],box,att.split('|')[1],att.split('|')[2]]) 
                              for rel, box, att in zip(scene_comp['relationship_id'],scene_comp['bbox'],scene_comp['attribute'])]
    scene_comp['annot_id'] = [x.replace('|yes|','|1|').replace('|no|','|0|') for x in scene_comp['annot_id']]
    
    return scene_comp

In [7]:
# TO DO: set path to the scene_graph json directory
images_dir = '../../../subset/scene_graph/' 

In [8]:
# Extract comparison relations in scene graphs to a table for easier comparison against ground truth
scene_comp = SceneGraphs2ComparisonRelationsTable(imageIDs2, images_dir)

dict_keys(['relationship_id', 'subject_id', 'object_id', 'bbox', 'comparison', 'attribute', 'sentence'])
Getting the comparison relationships
Processed comparison relations from  0  scene graphs
(5368, 7)
{'improved', 'no change', 'improved;;worsened', 'worsened', 'no change;;worsened', 'improved;;no change'}
(5368, 10)
(4157, 8)


In [9]:
scene_comp.head(2)

Unnamed: 0,relationship_id,subject_id,object_id,bbox,comparison,attribute,sentence,current_image_id,annot_id
0,58043799|9_53976162_C0003489_00637f42_1b190280,00637f42-a2f19a71-cd265165-0928eca9-c6d14eee_a...,1b190280-f5724ebf-d8ec76c3-cc23434f-9969aec8_a...,aortic arch,[no change],nlp|yes|abnormal,slight rightward deviation of the trachea is t...,00637f42-a2f19a71-cd265165-0928eca9-c6d14eee.dcm,58043799|9|aortic arch|1|abnormal
1,58043799|9_53976162_C0003489_00637f42_1b190280,00637f42-a2f19a71-cd265165-0928eca9-c6d14eee_a...,1b190280-f5724ebf-d8ec76c3-cc23434f-9969aec8_a...,aortic arch,[no change],anatomicalfinding|yes|tortuous aorta,slight rightward deviation of the trachea is t...,00637f42-a2f19a71-cd265165-0928eca9-c6d14eee.dcm,58043799|9|aortic arch|1|tortuous aorta


# Get comparison annotation IDs

In [10]:
comp['com_annot_id'] = ['|'.join([annotID,'&'.join(sorted(list(set(cp))))]) for annotID, cp in zip(comp['annot_id'],comp['comparison'])]
scene_comp['com_annot_id'] = ['|'.join([annotID,'&'.join(sorted(list(set(cp))))]) for annotID, cp in zip(scene_comp['annot_id'],scene_comp['comparison'])]

# # pattern: 0 study_id | 1 sent_loc | 2 bbox | 3 context | 4 attribute | 5 comparison

# Sentence level evaluation

In [11]:
# Sentence level comparison performance
print('Sentence level comparison performance, attribute sensitive:')
compgt = set(comp['com_annot_id'].tolist())
compscene = set(scene_comp['com_annot_id'].tolist())
truepos = compgt.intersection(compscene)
falseneg = compgt.difference(compscene)
falsepos = compscene.difference(compgt)
precision = len(truepos)/(len(truepos)+len(falsepos))
recall = len(truepos)/(len(truepos)+len(falseneg))
print('True positive sent level:', len(truepos))
print('False negative sent level:', len(falseneg))
print('False positive sent level:', len(falsepos))
print('Number of relations:', len(truepos) + len(falseneg))
print('Precision sent level:', precision)
print('Recall sent level:', recall)
print('f1-score sent level:', 2*precision*recall/(precision+recall))


Sentence level comparison performance, attribute sensitive:
True positive sent level: 3040
False negative sent level: 2114
False positive sent level: 620
Number of relations: 5154
Precision sent level: 0.8306010928961749
Recall sent level: 0.5898331393092744
f1-score sent level: 0.6898116632629907


In [12]:
# For different comparison relations - sentence level, attribute sensitive
for rel in ['improved','worsened','no change']:
    gt = comp[comp['com_annot_id'].str.contains(rel)].copy()
    pred = scene_comp[scene_comp['com_annot_id'].str.contains(rel)].copy()
    compgt = set(gt['com_annot_id'].tolist())
    compscene = set(pred['com_annot_id'].tolist())
    truepos = compgt.intersection(compscene)
    falseneg = compgt.difference(compscene)
    falsepos = compscene.difference(compgt)
    precision = len(truepos)/(len(truepos)+len(falsepos))
    recall = len(truepos)/(len(truepos)+len(falseneg))
    print('For comparison relation:', rel)
    print('True positive sent level:', len(truepos))
    print('False negative sent level:', len(falseneg))
    print('False positive sent level:', len(falsepos))
    print('Precision sent level:', precision)
    print('Recall sent level:', recall)
    print('f1-score sent level:', 2*precision*recall/(precision+recall))
    print()

For comparison relation: improved
True positive sent level: 484
False negative sent level: 411
False positive sent level: 179
Precision sent level: 0.7300150829562594
Recall sent level: 0.5407821229050279
f1-score sent level: 0.6213093709884466

For comparison relation: worsened
True positive sent level: 943
False negative sent level: 1073
False positive sent level: 183
Precision sent level: 0.8374777975133215
Recall sent level: 0.4677579365079365
f1-score sent level: 0.6002546148949713

For comparison relation: no change
True positive sent level: 1619
False negative sent level: 670
False positive sent level: 399
Precision sent level: 0.8022794846382557
Recall sent level: 0.7072957623416339
f1-score sent level: 0.7517993963315532



In [13]:
# Sentence level comparison performance
print('Sentence level comparison performance, attribute blind:')
compgt = set(['|'.join([c.split('|')[0],c.split('|')[1],c.split('|')[2],c.split('|')[5]]) for c in comp['com_annot_id']])
compscene = set(['|'.join([c.split('|')[0],c.split('|')[1],c.split('|')[2],c.split('|')[5]]) for c in scene_comp['com_annot_id']])

truepos = compgt.intersection(compscene)
falseneg = compgt.difference(compscene)
falsepos = compscene.difference(compgt)

precision = len(truepos)/(len(truepos)+len(falsepos))
recall = len(truepos)/(len(truepos)+len(falseneg))
print('True positive sent level:', len(truepos))
print('False negative sent level:', len(falseneg))
print('False positive sent level:', len(falsepos))
print('Number of relations:', len(truepos) + len(falseneg))
print('Precision sent level:', precision)
print('Recall sent level:', recall)
print('f1-score sent level:', 2*precision*recall/(precision+recall))


Sentence level comparison performance, attribute blind:
True positive sent level: 1184
False negative sent level: 603
False positive sent level: 199
Number of relations: 1787
Precision sent level: 0.8561099060014461
Recall sent level: 0.6625629546726357
f1-score sent level: 0.7470031545741325


In [14]:
# For different comparison relations - sentence level, attribute blind
for rel in ['improved','worsened','no change']:
    gt = comp[comp['com_annot_id'].str.contains(rel)].copy()
    pred = scene_comp[scene_comp['com_annot_id'].str.contains(rel)].copy()
    compgt = set(['|'.join([c.split('|')[0],c.split('|')[2],c.split('|')[5]]) for c in gt['com_annot_id']])
    compscene = set(['|'.join([c.split('|')[0],c.split('|')[2],c.split('|')[5]]) for c in pred['com_annot_id']])
    truepos = compgt.intersection(compscene)
    falseneg = compgt.difference(compscene)
    falsepos = compscene.difference(compgt)
    precision = len(truepos)/(len(truepos)+len(falsepos))
    recall = len(truepos)/(len(truepos)+len(falseneg))
    print('For comparison relation:', rel)
    print('True positive sent level:', len(truepos))
    print('False negative sent level:', len(falseneg))
    print('False positive sent level:', len(falsepos))
    print('Precision sent level:', precision)
    print('Recall sent level:', recall)
    print('f1-score sent level:', 2*precision*recall/(precision+recall))
    print()

For comparison relation: improved
True positive sent level: 209
False negative sent level: 61
False positive sent level: 52
Precision sent level: 0.8007662835249042
Recall sent level: 0.774074074074074
f1-score sent level: 0.7871939736346515

For comparison relation: worsened
True positive sent level: 295
False negative sent level: 148
False positive sent level: 63
Precision sent level: 0.8240223463687151
Recall sent level: 0.6659142212189616
f1-score sent level: 0.7365792759051187

For comparison relation: no change
True positive sent level: 593
False negative sent level: 95
False positive sent level: 101
Precision sent level: 0.8544668587896254
Recall sent level: 0.8619186046511628
f1-score sent level: 0.858176555716353



# Report level evaluation

In [15]:
## Rolling annotations to study level

# There may be multiple sents per bbox-comparison-attribute relation with different context for the attribute
comp2 = comp.copy()
comp2['sent_loc'] = [float(x.split('_')[0].split('|')[-1]) for x in comp2['row_id']]
# Sort by unique subject-object-label and sent_loc (order of sentences in a report)
comp2 = comp2.sort_values(by=['image_id','sent_loc','bbox','label_name']).reset_index(drop=True).copy()
print(comp2.shape)
# keep last label context and comparison for subject-object-label combinations
comp2 = comp2.drop_duplicates(subset=['image_id','bbox','label_name'],keep='last').reset_index(drop=True).copy()
comp2.drop(['sent_loc'],axis=1,inplace=True)
print(comp2.shape)

# Same for automatically extracted relations
scene_comp2 = scene_comp.copy()
scene_comp2['label_name'] = [x.split('|')[-1] for x in scene_comp2['attribute']]
scene_comp2['sent_loc'] = [float(x.split('_')[0].split('|')[-1]) for x in scene_comp2['relationship_id']]
# Sort by unique subject-object-label and sent_loc (order of sentences in a report)
scene_comp2 = scene_comp2.sort_values(by=['subject_id','sent_loc','bbox','label_name']).reset_index(drop=True).copy()
print(scene_comp2.shape)
# keep last label context and comparison for subject-object-label combinations
scene_comp2 = scene_comp2.drop_duplicates(subset=['subject_id','bbox','label_name'],keep='last').reset_index(drop=True).copy()
scene_comp2.drop(['sent_loc'],axis=1,inplace=True)
print(scene_comp2.shape)

(5156, 17)
(3995, 16)
(4157, 12)
(3660, 11)


In [16]:
# Report level comparison performance
print('Report level comparison performance, attribute sensitive:')
compgt = set(['|'.join([c.split('|')[0],'|'.join(c.split('|')[2:])]) for c in comp2['com_annot_id']])
compscene = set(['|'.join([c.split('|')[0],'|'.join(c.split('|')[2:])]) for c in scene_comp2['com_annot_id']])

truepos = compgt.intersection(compscene)
falseneg = compgt.difference(compscene)
falsepos = compscene.difference(compgt)

precision = len(truepos)/(len(truepos)+len(falsepos))
recall = len(truepos)/(len(truepos)+len(falseneg))
print('True positive report level:', len(truepos))
print('False negative report level:', len(falseneg))
print('False positive report level:', len(falsepos))
print('Number of relations:', len(truepos) + len(falseneg))
print('Precision report level:', precision)
print('Recall report level:', recall)
print('f1-score report level:', 2*precision*recall/(precision+recall))


Report level comparison performance, attribute sensitive:
True positive report level: 3044
False negative report level: 949
False positive report level: 616
Number of relations: 3993
Precision report level: 0.8316939890710382
Recall report level: 0.7623340846481342
f1-score report level: 0.7955050307069124


In [17]:
# For different comparison relations - report level, attribute sensitive
for rel in ['improved','worsened','no change']:
    gt = comp2[comp2['com_annot_id'].str.contains(rel)].copy()
    pred = scene_comp2[scene_comp2['com_annot_id'].str.contains(rel)].copy()
    compgt = set(['|'.join([c.split('|')[0],'|'.join(c.split('|')[2:])]) for c in gt['com_annot_id']])
    compscene = set(['|'.join([c.split('|')[0],'|'.join(c.split('|')[2:])]) for c in pred['com_annot_id']])
    truepos = compgt.intersection(compscene)
    falseneg = compgt.difference(compscene)
    falsepos = compscene.difference(compgt)
    precision = len(truepos)/(len(truepos)+len(falsepos))
    recall = len(truepos)/(len(truepos)+len(falseneg))
    print('For comparison relation:', rel)
    print('True positive report level:', len(truepos))
    print('False negative report level:', len(falseneg))
    print('False positive report level:', len(falsepos))
    print('Precision report level:', precision)
    print('Recall report level:', recall)
    print('f1-score report level:', 2*precision*recall/(precision+recall))
    print()

For comparison relation: improved
True positive report level: 482
False negative report level: 172
False positive report level: 181
Precision report level: 0.726998491704374
Recall report level: 0.7370030581039755
f1-score report level: 0.7319665907365223

For comparison relation: worsened
True positive report level: 943
False negative report level: 528
False positive report level: 183
Precision report level: 0.8374777975133215
Recall report level: 0.6410605030591434
f1-score report level: 0.7262225644974971

For comparison relation: no change
True positive report level: 1625
False negative report level: 275
False positive report level: 393
Precision report level: 0.8052527254707631
Recall report level: 0.8552631578947368
f1-score report level: 0.8295048494129658



In [18]:
# Report level comparison performance
print('Report level comparison performance, attribute blind:')
compgt = set(['|'.join([c.split('|')[0],c.split('|')[2],c.split('|')[5]]) for c in comp2['com_annot_id']])
compscene = set(['|'.join([c.split('|')[0],c.split('|')[2],c.split('|')[5]]) for c in scene_comp2['com_annot_id']])

truepos = compgt.intersection(compscene)
falseneg = compgt.difference(compscene)
falsepos = compscene.difference(compgt)

precision = len(truepos)/(len(truepos)+len(falsepos))
recall = len(truepos)/(len(truepos)+len(falseneg))
print('True positive report level:', len(truepos))
print('False negative report level:', len(falseneg))
print('False positive report level:', len(falsepos))
print('Number of relations:', len(truepos) + len(falseneg))
print('Precision report level:', precision)
print('Recall report level:', recall)
print('f1-score report level:', 2*precision*recall/(precision+recall))


Report level comparison performance, attribute blind:
True positive report level: 1086
False negative report level: 288
False positive report level: 179
Number of relations: 1374
Precision report level: 0.8584980237154151
Recall report level: 0.7903930131004366
f1-score report level: 0.8230390299355818


In [19]:
# For different comparison relations - report level, attribute blind
for rel in ['improved','worsened','no change']:
    gt = comp2[comp2['com_annot_id'].str.contains(rel)].copy()
    pred = scene_comp2[scene_comp2['com_annot_id'].str.contains(rel)].copy()
    compgt = set(['|'.join([c.split('|')[0],c.split('|')[2],c.split('|')[5]]) for c in gt['com_annot_id']])
    compscene = set(['|'.join([c.split('|')[0],c.split('|')[2],c.split('|')[5]]) for c in pred['com_annot_id']])
    truepos = compgt.intersection(compscene)
    falseneg = compgt.difference(compscene)
    falsepos = compscene.difference(compgt)
    precision = len(truepos)/(len(truepos)+len(falsepos))
    recall = len(truepos)/(len(truepos)+len(falseneg))
    print('For comparison relation:', rel)
    print('True positive report level:', len(truepos))
    print('False negative report level:', len(falseneg))
    print('False positive report level:', len(falsepos))
    print('Precision report level:', precision)
    print('Recall report level:', recall)
    print('f1-score report level:', 2*precision*recall/(precision+recall))
    print()

For comparison relation: improved
True positive report level: 205
False negative report level: 59
False positive report level: 56
Precision report level: 0.7854406130268199
Recall report level: 0.7765151515151515
f1-score report level: 0.780952380952381

For comparison relation: worsened
True positive report level: 294
False negative report level: 148
False positive report level: 64
Precision report level: 0.8212290502793296
Recall report level: 0.665158371040724
f1-score report level: 0.7349999999999999

For comparison relation: no change
True positive report level: 590
False negative report level: 90
False positive report level: 104
Precision report level: 0.8501440922190202
Recall report level: 0.8676470588235294
f1-score report level: 0.8588064046579331

