In [2]:
import torch
import torch.nn as nn
import math
import numpy as np
import pandas as pd
from glob import glob
from tqdm import tqdm

import os
import sys
import inspect

currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
parentdir = os.path.dirname(currentdir)
sys.path.insert(0, parentdir) 

In [5]:
class coco_format():
    def __init__(self) -> None:
        
        self.organs = ["right lung", "right apical zone", "right upper lung zone", "right mid lung zone", 
            "right lower lung zone", "right hilar structures", "right costophrenic angle", "left lung", "left apical zone",
            "left upper lung zone", "left mid lung zone", "left lower lung zone", "left hilar structures", 
            "left costophrenic angle", "mediastinum", "upper mediastinum", "cardiac silhouette", "trachea"]
        
        self.diseaselist = ['lung opacity', 'pleural effusion', 'atelectasis', 'enlarged cardiac silhouette',
        'pulmonary edema/hazy opacity', 'pneumothorax', 'consolidation', 'fluid overload/heart failure', 'pneumonia']
       
        self.scene_graphs_dir = '/home/ssd_scratch/users/arihanth.srikar/physionet.org/files/chest-imagenome/1.0.0/silver_dataset/scene_graph'
        self.outputdir = "/home/ssd_scratch/users/arihanth.srikar/physionet.org/files/chest-imagenome/1.0.0/silver_dataset"
        
        print('Disease length is {}'.format(str(len(self.diseaselist))))

        #obtain list of train, valid, test files
        self.my_data = sorted(glob(f'{self.scene_graphs_dir}/*.json'))

    def generate_data(self):  
        images = []
        categories = []
        count = 0
        coco_data = {}
        imagesID = []
        labels = []
        for filename in tqdm(self.my_data):
            image_json = {}
            annotations = []
            if 'DS_Store' not in filename:
                count += 1
                hasattributes = 0
                # print('Processing file {}'.format(str(count)))
                try:
                    f = open(str(filename),) 
                except FileNotFoundError:
                    print('{} not in directory'.format(filename))
                else:
                    data = json.load(f)
                    imageID = data['image_id']
                    ids = [obj['object_id'] for obj in data['objects']]
                    ignore = 0
                    hasdisease = 1
                    for objects in data['objects']:
                        hasdisease = 1
                        # objectID = attribute['object_id']
                        row = np.zeros([len(self.diseaselist)])
                        if objects['object_id'].split('_')[1] in self.organs:
                            annotation_json = {}
                            for attribute in data['attributes']:
                                if attribute['object_id'] == objects['object_id']:
                                    for diseases in attribute['attributes']:
                                        for disease in diseases:                                                
                                            if disease.split('|')[2] in self.diseaselist:
                                                hasdisease = 1
                                                hasattributes = 1
                                                class_index = self.diseaselist.index(disease.split('|')[2])
                                                if disease.split('|')[1] == 'yes':
                                                    row[class_index] = int(1)
                                                else:
                                                    row[class_index] = int(0)
                                annotation_json['id'] = objects['object_id']
                                annotation_json['category_id'] = self.organs.index(objects['object_id'].split('_')[1])
                                annotation_json['iscrowd'] = 0
                                annotation_json["bbox_mode"] = 1
                                annotation_json['image_id'] = data['image_id']
                                int_row = row.astype(int)
                                annotation_json['attributes'] = int_row.tolist()
                                annotation_json['bbox'] = [objects['original_x1'], objects['original_y1'],
                                objects['original_width'], objects['original_height']]
                                annotations.append(annotation_json)
            
                    image_json['image_id'] = data['image_id']
                    myfile = str(data['image_id']) + '.jpg'
                    image_json['file_name'] = myfile
                    # path = os.path.join(self.imageroot, myfile) #"./VG/data/" + myfile
                    # im = Image.open(path)
                    # image_json['width'], image_json['height'] = im.size
                    image_json['annotations'] = annotations
                    # # imagesID.append(myfile)
                    images.append(image_json)
    
        save_file_name = "xray_coco_all.json"
        filename = os.path.join(self.outputdir, save_file_name)
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(images, f, ensure_ascii=False, indent=4)            
        
# x_ray_coco = coco_format()
# x_ray_coco.generate_data()

In [None]:
print("Loading dataset")
df = pd.read_json('/home/ssd_scratch/users/arihanth.srikar/physionet.org/files/chest-imagenome/1.0.0/silver_dataset/mimic_coco_filtered.json')
temp_df = pd.read_csv('data/mimic_cxr_jpg/mimic-cxr-2.0.0-final.csv')
temp_df.rename(columns={'dicom_id': 'image_id'}, inplace=True)
df = df.merge(temp_df, on='image_id', how='left')
print("Dataset loaded")

In [6]:
import os
import shutil
import json
import statistics
from PIL import Image
import random
import matplotlib.pyplot as plt
from collections import Counter
from sklearn.metrics import jaccard_score

class AdjacencyMatrices():
    '''
    The Similarity measure between each pair of anatomy objects A and B
    Jaccard similarity measure is used to measure the similarity between 
    each object, by measuring the average similarity over every disease class
    '''
    def __init__(self, df: pd.DataFrame, split: str='val') -> None:
        self.split = 'validate' if 'val' in split else split
        self.df = df[df['split'] == self.split]
        self.outputdir = "/home/ssd_scratch/users/arihanth.srikar/physionet.org/files/chest-imagenome/1.0.0/silver_dataset"
        
        self.diseaselist = ['lung opacity', 'pleural effusion', 'atelectasis', 'enlarged cardiac silhouette',
        'pulmonary edema/hazy opacity', 'pneumothorax', 'consolidation', 'fluid overload/heart failure', 'pneumonia']

        self.organs = ["right lung", "right apical zone", "right upper lung zone", "right mid lung zone", 
        "right lower lung zone", "right hilar structures", "right costophrenic angle", "left lung", "left apical zone",
        "left upper lung zone", "left mid lung zone", "left lower lung zone", "left hilar structures", 
        "left costophrenic angle", "mediastinum", "upper mediastinum", "cardiac silhouette", "trachea"]

    def anatomy(self):
        error = 1e-9
        anatomy_len = len(self.organs)
        row = self.organs
        column = self.organs
        adj_matrix = []

        for ind, B in enumerate(row):
            print("Processing {} from row {}".format(B, str(ind)))
            rows = np.zeros([len(self.organs)]) 
            for inde, A in enumerate(tqdm(column)):
                # print("Processing {} from column {}".format(A, str(inde)))
                AnB_count = 0
                B_count = 0
                row_counter = Counter()
                column_counter = Counter()
                a_val = []
                b_val = []
                p_anb = 0
                
                for annotation_dict in self.df['annotations']:
                    ids = [self.organs[int(obj['category_id'])] for obj in annotation_dict]
                    aa = []
                    bb = []
                    if set(ids) == set(self.organs):
                        for relation in annotation_dict:
                            if int(relation['category_id']) == ind:
                                bb = relation['attributes']
                        for relations in annotation_dict:
                            if int(relations['category_id']) == inde:
                                aa = relations['attributes']
                        if np.count_nonzero(np.array(aa)) > 0 or np.count_nonzero(np.array(bb)) > 0:
                            b_val.append(bb)
                            a_val.append(aa)
                    else:
                        continue

                df_A = pd.DataFrame(a_val, columns=self.diseaselist)
                df_B = pd.DataFrame(b_val, columns=self.diseaselist)
                
                assert len(b_val) == len(a_val)
                
                if not df_A.empty:
                    jaccard_list = []
                    for disease in self.diseaselist:
                        jaccard = jaccard_score(df_B[disease], df_A[disease], average='macro')
                        jaccard_list.append(jaccard)
                    p_anb = statistics.mean(jaccard_list)

                if ind == inde:
                    p_anb = 1
                if p_anb > 0.5:
                    p_anb = 1
                else:
                    p_anb = 0
                
                rows[inde] = p_anb
            adj_matrix.append(rows.tolist())

        
        df = pd.DataFrame(adj_matrix, columns=self.organs)
        filename = os.path.join(self.outputdir, f'anatomy_matrix_{self.split}.csv')
        df.to_csv(filename, sep='\t', index=False)
        return df

# matrix = AdjacencyMatrices()
# anatomy = matrix.anatomy()

In [8]:
val_matrix = AdjacencyMatrices(df, split='val')
val_anatomy = val_matrix.anatomy()

Processing right lung from row 0


100%|██████████| 18/18 [00:01<00:00, 13.44it/s]


Processing right apical zone from row 1


100%|██████████| 18/18 [00:01<00:00, 16.35it/s]


Processing right upper lung zone from row 2


100%|██████████| 18/18 [00:01<00:00, 15.36it/s]


Processing right mid lung zone from row 3


100%|██████████| 18/18 [00:01<00:00, 15.48it/s]


Processing right lower lung zone from row 4


100%|██████████| 18/18 [00:01<00:00, 14.55it/s]


Processing right hilar structures from row 5


100%|██████████| 18/18 [00:01<00:00, 13.91it/s]


Processing right costophrenic angle from row 6


100%|██████████| 18/18 [00:01<00:00, 16.42it/s]


Processing left lung from row 7


100%|██████████| 18/18 [00:01<00:00, 15.48it/s]


Processing left apical zone from row 8


100%|██████████| 18/18 [00:01<00:00, 14.37it/s]


Processing left upper lung zone from row 9


100%|██████████| 18/18 [00:01<00:00, 14.88it/s]


Processing left mid lung zone from row 10


100%|██████████| 18/18 [00:01<00:00, 14.46it/s]


Processing left lower lung zone from row 11


100%|██████████| 18/18 [00:01<00:00, 11.96it/s]


Processing left hilar structures from row 12


100%|██████████| 18/18 [00:01<00:00, 13.35it/s]


Processing left costophrenic angle from row 13


100%|██████████| 18/18 [00:01<00:00, 14.71it/s]


Processing mediastinum from row 14


100%|██████████| 18/18 [00:01<00:00, 14.22it/s]


Processing upper mediastinum from row 15


100%|██████████| 18/18 [00:00<00:00, 18.76it/s]


Processing cardiac silhouette from row 16


100%|██████████| 18/18 [00:01<00:00, 13.91it/s]


Processing trachea from row 17


100%|██████████| 18/18 [00:01<00:00, 17.58it/s]


In [9]:
test_matrix = AdjacencyMatrices(df, split='test')
test_anatomy = test_matrix.anatomy()

Processing right lung from row 0


100%|██████████| 18/18 [00:02<00:00,  8.23it/s]


Processing right apical zone from row 1


100%|██████████| 18/18 [00:01<00:00, 10.17it/s]


Processing right upper lung zone from row 2


100%|██████████| 18/18 [00:01<00:00, 10.21it/s]


Processing right mid lung zone from row 3


100%|██████████| 18/18 [00:01<00:00, 10.49it/s]


Processing right lower lung zone from row 4


100%|██████████| 18/18 [00:01<00:00,  9.96it/s]


Processing right hilar structures from row 5


100%|██████████| 18/18 [00:01<00:00,  9.75it/s]


Processing right costophrenic angle from row 6


100%|██████████| 18/18 [00:01<00:00,  9.60it/s]


Processing left lung from row 7


100%|██████████| 18/18 [00:02<00:00,  7.70it/s]


Processing left apical zone from row 8


100%|██████████| 18/18 [00:01<00:00,  9.84it/s]


Processing left upper lung zone from row 9


100%|██████████| 18/18 [00:01<00:00, 10.04it/s]


Processing left mid lung zone from row 10


100%|██████████| 18/18 [00:02<00:00,  8.80it/s]


Processing left lower lung zone from row 11


100%|██████████| 18/18 [00:01<00:00,  9.61it/s]


Processing left hilar structures from row 12


100%|██████████| 18/18 [00:02<00:00,  8.78it/s]


Processing left costophrenic angle from row 13


100%|██████████| 18/18 [00:01<00:00,  9.11it/s]


Processing mediastinum from row 14


100%|██████████| 18/18 [00:01<00:00, 11.68it/s]


Processing upper mediastinum from row 15


100%|██████████| 18/18 [00:01<00:00,  9.74it/s]


Processing cardiac silhouette from row 16


100%|██████████| 18/18 [00:01<00:00,  9.19it/s]


Processing trachea from row 17


100%|██████████| 18/18 [00:01<00:00, 10.93it/s]


In [10]:
train_matrix = AdjacencyMatrices(df, split='train')
train_anatomy = train_matrix.anatomy()

Processing right lung from row 0


100%|██████████| 18/18 [01:32<00:00,  5.13s/it]


Processing right apical zone from row 1


100%|██████████| 18/18 [01:17<00:00,  4.31s/it]


Processing right upper lung zone from row 2


100%|██████████| 18/18 [01:21<00:00,  4.55s/it]


Processing right mid lung zone from row 3


100%|██████████| 18/18 [01:22<00:00,  4.56s/it]


Processing right lower lung zone from row 4


100%|██████████| 18/18 [01:23<00:00,  4.65s/it]


Processing right hilar structures from row 5


100%|██████████| 18/18 [01:22<00:00,  4.59s/it]


Processing right costophrenic angle from row 6


100%|██████████| 18/18 [01:22<00:00,  4.57s/it]


Processing left lung from row 7


100%|██████████| 18/18 [01:55<00:00,  6.39s/it]


Processing left apical zone from row 8


100%|██████████| 18/18 [01:19<00:00,  4.41s/it]


Processing left upper lung zone from row 9


100%|██████████| 18/18 [01:19<00:00,  4.40s/it]


Processing left mid lung zone from row 10


100%|██████████| 18/18 [01:21<00:00,  4.55s/it]


Processing left lower lung zone from row 11


100%|██████████| 18/18 [01:26<00:00,  4.79s/it]


Processing left hilar structures from row 12


100%|██████████| 18/18 [01:24<00:00,  4.71s/it]


Processing left costophrenic angle from row 13


100%|██████████| 18/18 [01:23<00:00,  4.63s/it]


Processing mediastinum from row 14


100%|██████████| 18/18 [01:19<00:00,  4.43s/it]


Processing upper mediastinum from row 15


100%|██████████| 18/18 [01:15<00:00,  4.22s/it]


Processing cardiac silhouette from row 16


100%|██████████| 18/18 [01:22<00:00,  4.56s/it]


Processing trachea from row 17


100%|██████████| 18/18 [01:17<00:00,  4.29s/it]


In [16]:
np.round(np.sum(val_anatomy.to_numpy() == test_anatomy.to_numpy(), axis=1)/18, 2)

array([0.89, 1.  , 0.72, 0.89, 1.  , 0.94, 0.94, 0.89, 1.  , 0.89, 0.89,
       1.  , 0.94, 0.94, 0.94, 0.94, 1.  , 0.94])

In [17]:
np.round(np.sum(train_anatomy.to_numpy() == test_anatomy.to_numpy(), axis=1)/18, 2)

array([0.94, 1.  , 1.  , 0.89, 1.  , 0.94, 1.  , 0.94, 1.  , 0.67, 0.94,
       1.  , 0.94, 1.  , 0.94, 0.94, 1.  , 0.94])

In [18]:
test_anatomy

Unnamed: 0,right lung,right apical zone,right upper lung zone,right mid lung zone,right lower lung zone,right hilar structures,right costophrenic angle,left lung,left apical zone,left upper lung zone,left mid lung zone,left lower lung zone,left hilar structures,left costophrenic angle,mediastinum,upper mediastinum,cardiac silhouette,trachea
0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
1,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0
2,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
5,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0
6,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
7,1.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0
8,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0
9,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0
