In [None]:
# folders
# NOTE: both values below are 'xxx' placeholders and must be edited before running the notebook.
data_path = 'xxx' # todo: root folder; later cells read from data_path + '/Data/ROCO_ext/'
results_path = 'xxx' # todo: output folder (not referenced by the cells visible in this notebook)

## Prepare MostCommonCUIs.csv file

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Build ./MostCommonCUIs.csv: for every concept in concept_names.csv, count how
# often its CUI occurs in the training annotations and store the concepts sorted
# by decreasing frequency.

# paths (data_path is expected to be defined in the configuration cell)
data_dir = data_path + '/Data/ROCO_ext/'
concept_list = 'concept_names.csv'
task_ext = 'concept_detection_'
imgs_ext = '_images/'

# train concepts: the 'cuis' column holds a ';'-separated list of CUIs per row
df = pd.read_csv(data_dir + task_ext + 'train.csv', sep='\t')
print(df)

# all concepts: the 'concept' (CUI) and 'concept_name' columns are used below
df_concepts_all = pd.read_csv(data_dir + concept_list, sep='\t')
print(df_concepts_all)

# flatten the per-row CUI lists into one list of all annotated CUIs
all_concepts = [cui for cuis in df['cuis'] for cui in cuis.split(';')]

# count every CUI in a single pass instead of rescanning the full list once per
# concept (the former list.count() loop was O(#concepts * #annotations))
concept_counts = pd.Series(all_concepts, dtype=object).value_counts()

# occurrence of each listed concept; concepts never annotated get 0
occ = [int(concept_counts.get(curr_concept, 0))
       for curr_concept in df_concepts_all['concept']]

# save most common CUIs
# (column names are kept exactly as before because downstream files rely on them)
df_mostcommon = pd.DataFrame()
df_mostcommon['CUI'] = df_concepts_all['concept']
df_mostcommon['Number of occurance'] = occ
df_mostcommon['Explanation'] = df_concepts_all['concept_name']

# sort according to the number of occurrences, most frequent first
df_mostcommon = df_mostcommon.sort_values('Number of occurance', ascending=False)
df_mostcommon.to_csv('./MostCommonCUIs.csv', index=False)

## we manually added a "Semantic attribute" column (the UMLS semantic type of each CUI),
## looked up on the UMLS page: https://uts.nlm.nih.gov/uts/umls/home,
## and placed MostCommonCUIs.csv (with the top 100 concepts) in data_dir

## Plot figure 1 for ImageCLEFmedical (organ vs modality)

In [None]:
from PIL import Image
import matplotlib.pyplot as plt
import pandas as pd
from torchvision import transforms
import numpy as np

# dataset root and the stride used to thin out the primary-task classes
data_dir = data_path + '/Data/ROCO_ext/'
sample_class = 3

# red -> white -> green colormap with 256 levels
from  matplotlib.colors import LinearSegmentedColormap
cmap = LinearSegmentedColormap.from_list('rg', ["r", "w", "g"], N=256)

# semantic annotations of the concepts; only the first 100 rows are used
df_concepts_all_semantic = pd.read_csv(data_dir + 'MostCommonCUIs.csv', sep=',').iloc[:100]

# task[0] is the primary task, task[1] is the task used to filter
task = ['Body Part, Organ, or Organ Component', 'Diagnostic Procedure']

# per task: the CUIs and the human-readable explanations of its classes
semantic_attr = df_concepts_all_semantic['Semantic attribute']
CLASS_NAMES = [
    df_concepts_all_semantic.loc[semantic_attr == task_name, 'CUI'].to_numpy()
    for task_name in task
]
CLASS_NAMES_exp = [
    df_concepts_all_semantic.loc[semantic_attr == task_name, 'Explanation'].to_numpy()
    for task_name in task
]

# keep every sample_class-th class of the primary task, starting at index 1
CLASS_NAMES[0] = CLASS_NAMES[0][1::sample_class]
CLASS_NAMES_exp[0] = CLASS_NAMES_exp[0][1::sample_class]

# axis tick labels: primary task on y, filter task on x
y_list, x_list = CLASS_NAMES_exp[0], CLASS_NAMES_exp[1]

# number of classes per task and in total
num_classes = np.array([len(names) for names in CLASS_NAMES], dtype=int)
num_classes_total = int(num_classes.sum())

# Build, for the training annotations, the list of usable image IDs (img_idx)
# and the matching label matrix (label_vec, one row per image, one column per
# task). An image is kept only if it carries exactly one class of every task.
task_ext = 'concept_detection_'
imgs_ext = '_images/'
annot_dir = data_dir + task_ext + 'train.csv'
df = pd.read_csv(annot_dir, sep='\t')

img_idx = []
label_rows = []  # collected here and stacked once after the loop

# iterate over the annotation rows ('ID' and ';'-separated 'cuis' columns)
for image_id, cuis in zip(df['ID'], df['cuis']):

    # split the line into its individual CUIs
    x = cuis.split(';')
    curr_label = []

    # iterate over the tasks to get one class index per task
    for task_id in range(len(task)):

        # concepts of the current row that are classes of this task
        curr_concept_list = np.intersect1d(x, CLASS_NAMES[task_id])

        # zero or several matching concepts: the image is ambiguous for this
        # task and is dropped, so the remaining tasks need not be checked
        if len(curr_concept_list) != 1:
            break

        # index of the single matching concept within the task's class list
        idx = np.where(CLASS_NAMES[task_id] == curr_concept_list[0])[0][0]
        curr_label.append(idx)

    # keep the image only when every task produced exactly one label
    if len(curr_label) != len(task):
        continue

    img_idx.append(image_id)
    label_rows.append(curr_label)

# build the float label matrix in one go (np.vstack inside the loop copied the
# whole matrix for every image); reshape keeps the (0, len(task)) shape when no
# image qualifies
label_vec = np.array(label_rows, dtype=float).reshape(-1, len(task))

# Plot a grid with one randomly chosen training image per (organ, modality)
# pair; grid cells without any matching image stay empty.
fig = plt.figure(figsize=(num_classes[1], num_classes[0]), dpi=80)

np.random.seed(77)
data_dir = data_path + '/Data/ROCO_ext/train_images/'
img_size = 224
for organ in range(0, num_classes[0]):
    for modality in range(0, num_classes[1]):
        curr_label = [organ, modality]

        # rows of label_vec whose two labels both match the current pair
        curr_list = np.where(np.all(label_vec == curr_label, axis=1))[0]
        if len(curr_list) == 0:
            continue

        # pick a random candidate; the upper bound of randint is exclusive, so
        # the range must start at 0 (starting at 1 raised ValueError for a
        # single candidate and could never select the first one)
        idx = np.random.randint(0, len(curr_list))
        curr_img = img_idx[curr_list[idx]]
        curr_file = data_dir + curr_img + '.jpg'

        # close the file handle as soon as the pixels are converted; the
        # converted copy stays usable afterwards
        with Image.open(curr_file) as raw_img:
            img = raw_img.convert('RGB')

        # center crop to a square; PIL reports size as (width, height) and
        # crop() expects (left, upper, right, lower)
        width, height = img.size
        side = min(width, height)
        left = (width - side) // 2
        upper = (height - side) // 2
        img = img.crop((left, upper, left + side, upper + side))
        img = transforms.Resize(size=img_size)(img)

        # plot into the grid cell of this pair (subplot indices are 1-based)
        cnt = organ * num_classes[1] + modality + 1
        plt.subplot(num_classes[0], num_classes[1], cnt)
        plt.imshow(img)
        plt.axis('off')
plt.subplots_adjust(wspace=0, hspace=0)
plt.show()

## Plot the number of datapoints in the train+validation and test splits

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# locations, file-name fragments and the class sub-sampling stride
data_dir = data_path + '/Data/ROCO_ext/'
task_ext = 'concept_detection_'
imgs_ext = '_images/'
sample_class = 3

# semantic annotations of the concepts
df_concepts_all_semantic = pd.read_csv(data_dir + 'MostCommonCUIs.csv', sep=',')

# task[0] is the primary task, task[1] is the task used to filter
task = ['Body Part, Organ, or Organ Component', 'Diagnostic Procedure']

# per task: the CUIs and the human-readable explanations of its classes
semantic_attr = df_concepts_all_semantic['Semantic attribute']
CLASS_NAMES = [
    df_concepts_all_semantic.loc[semantic_attr == task_name, 'CUI'].to_numpy()
    for task_name in task
]
CLASS_NAMES_exp = [
    df_concepts_all_semantic.loc[semantic_attr == task_name, 'Explanation'].to_numpy()
    for task_name in task
]

# keep every sample_class-th class of the primary task, starting at index 1
CLASS_NAMES[0] = CLASS_NAMES[0][1::sample_class]
CLASS_NAMES_exp[0] = CLASS_NAMES_exp[0][1::sample_class]

# axis tick labels: hand-written short names are used here instead of
# CLASS_NAMES_exp[0] (y axis) and CLASS_NAMES_exp[1] (x axis)
y_list = [
    'Pelvis',
    'Vertebral column',
    'Lung',
    'Urinary bladder',
    'Right ventricular structure',
    'Stomach',
    'Pulmonary artery structure',
    'Art. desc. b. left cor. artery',
    'Left kidney',
]
x_list = ['CT', 'X-ray', 'MRI', 'US', 'AG', 'PET']

def _collect_single_concept_labels(annotations, class_names):
    """Return the image IDs and labels of all unambiguously annotated rows.

    Parameters
    ----------
    annotations : pandas.DataFrame
        Table with an 'ID' column and a 'cuis' column holding ';'-separated CUIs.
    class_names : sequence of numpy.ndarray
        One array of class CUIs per task.

    Returns
    -------
    image_ids : list
        'ID' values of the rows that carry exactly one class of every task
        (e.g. an image showing several organs is skipped).
    labels : numpy.ndarray
        Float array of shape (len(image_ids), len(class_names)); entry [r, t]
        is the index of row r's concept within class_names[t].
    """
    image_ids = []
    label_rows = []
    for image_id, cuis in zip(annotations['ID'], annotations['cuis']):
        concepts = cuis.split(';')
        row_labels = []
        for names in class_names:
            matched = np.intersect1d(concepts, names)
            # zero or several matching concepts: the row is dropped
            if len(matched) != 1:
                break
            row_labels.append(np.where(names == matched[0])[0][0])
        if len(row_labels) == len(class_names):
            image_ids.append(image_id)
            label_rows.append(row_labels)
    # stack once instead of growing the matrix row by row; reshape keeps the
    # (0, n_tasks) shape when no row qualifies
    labels = np.array(label_rows, dtype=float).reshape(-1, len(class_names))
    return image_ids, labels


def _count_label_pairs(labels, n_rows, n_cols):
    """Return an (n_rows, n_cols) float matrix whose entry [i, j] is the number
    of rows of `labels` with first label i and second label j."""
    counts = np.zeros((n_rows, n_cols))
    for i in range(n_rows):
        for j in range(n_cols):
            counts[i, j] = np.sum((labels[:, 0] == i) & (labels[:, 1] == j))
    return counts


def _show_count_matrix(counts, x_labels, y_labels):
    """Show `counts` as an image with labelled ticks and a colorbar; return the figure."""
    figure = plt.figure(figsize=(5, 5), dpi=80)
    plt.imshow(counts)
    plt.xticks(np.arange(len(x_labels)), labels=x_labels, rotation=45)
    plt.yticks(np.arange(len(y_labels)), labels=y_labels, rotation=0)
    plt.colorbar()
    plt.show()
    return figure


# --- train.csv (reported as "train+validation") -----------------------------
df = pd.read_csv(data_dir + task_ext + 'train.csv', sep='\t')
img_idx, label_vec = _collect_single_concept_labels(df, CLASS_NAMES)
number_imgs_mat_train_val = _count_label_pairs(
    label_vec, len(CLASS_NAMES[0]), len(CLASS_NAMES[1]))
fig = _show_count_matrix(number_imgs_mat_train_val, x_list, y_list)
# previously exported as figs/imageclef_data_train_val.eps / .pdf

# --- valid.csv (reported as "test") -----------------------------------------
# NOTE(review): presumably the official validation split serves as the held-out
# test set here - confirm against the experiment setup.
df = pd.read_csv(data_dir + task_ext + 'valid.csv', sep='\t')
img_idx, label_vec = _collect_single_concept_labels(df, CLASS_NAMES)
number_imgs_mat_test = _count_label_pairs(
    label_vec, len(CLASS_NAMES[0]), len(CLASS_NAMES[1]))
fig = _show_count_matrix(number_imgs_mat_test, x_list, y_list)
# previously exported as figs/imageclef_data_test.eps / .pdf

# per-pair counts and the total number of usable images in each split
print(number_imgs_mat_train_val)
print(number_imgs_mat_test)
print(number_imgs_mat_train_val.sum())
print(number_imgs_mat_test.sum())

In [None]:
# number of train images per primary-task class (row sums of the count matrix)
print(number_imgs_mat_train_val.sum(axis=1))