In [1]:
import numpy as np
import pandas as pd
import tifffile
from pathlib import Path
import matplotlib.pyplot as plt
from sklearn.metrics import jaccard_score
from scipy.spatial.distance import directed_hausdorff

\<h6> Step 2 - Write utility functions </h6> 

In [2]:
def enc2mask(encs, shape):
    img = np.zeros(shape[0] * shape[1], dtype=np.uint8)
    for m, enc in enumerate(encs):
        if isinstance(enc, np.float) and np.isnan(enc):
            continue
        enc_split = enc.split()
        for i in range(len(enc_split) // 2):
            start = int(enc_split[2 * i]) - 1
            length = int(enc_split[2 * i + 1])
            img[start: start + length] = 1 + m

    return img.reshape(shape).T

In [3]:
def dice_scores_img(pred, truth, eps=1e-8):
    pred = pred.reshape(-1) > 0
    truth = truth.reshape(-1) > 0
    intersect = (pred & truth).sum(-1)
    union = pred.sum(-1) + truth.sum(-1)

    dice = (2.0 * intersect + eps) / (union + eps)
    return dice

In [4]:
def perf_metrics(gt, pred):
    n = 0
    d = 0
    for i in range(gt.shape[0]):
        for j in range (gt.shape[1]):
            if (gt[i][j]==pred[i][j]):
                n = n+1
            d = d+1
    
    return n/d, jaccard_score(gt.flatten(order='C'), pred.flatten(order='C')), directed_hausdorff(gt, pred)

<h6> Step 3 - Calculate mean metrics values for test images </h6> 

In [5]:
DATA_PATH = Path(r'/N/slate/yashjain/kaggle_data_package/kaggle_data_multiftu/data/'+'gftu'+'_dataset/')
df_pred = pd.read_csv('../gftu_test_submission_complete.csv')
df_truth = pd.read_csv(DATA_PATH/'test.csv')
df_info = pd.read_csv(DATA_PATH/'metadata.csv')

In [6]:
df_pred

Unnamed: 0,filename,rle,patient_id,tissue_name,predicted
0,27152_70666_A_2_4_lung,3062157 5 3065157 7 3068156 10 3071156 13 3074...,2268.0,lung,4690160 3 6022375 1 6025369 1 6025375 3 609136...
1,62818_166850_A_1_4_lung,1622123 3 1625122 4 1628121 6 1631121 6 163412...,4840.0,lung,1357886 2 1357895 2 1360887 1 1360894 3 136389...
2,65896_153870_A_1_4_lung,1633318 17 1636316 23 1639312 28 1642312 28 16...,2208.0,lung,3073906 14 3073924 7 3076906 24 3079906 25 308...
3,18959_147530_A_1_4_lung,4583458 13 4586454 18 4589449 24 4592447 26 45...,2208.0,lung,
4,72854_159060_A_1_4_lung,1975433 1 1978433 2 1981434 2 1984434 3 198446...,4840.0,lung,1867458 11 1870457 13 1873455 26 1876451 33 18...
...,...,...,...,...,...
524,JP072_patch_1_0_largeintestine,612096 85 616462 87 620828 89 625194 91 629560...,JP,largeintestine,297566 5 301932 6 301943 1 306294 1 306298 8 3...
525,A002-C-025_patch_1_0_largeintestine,603744 19 608107 25 612471 30 616834 36 621199...,A002,largeintestine,1778906 2 1783273 4 1787640 7 1792007 9 179637...
526,A001-C-224_patch_1_0_largeintestine,1 276 523 428 4107 139 4368 276 4890 429 8475 ...,A001,largeintestine,4236 49 8603 49 12972 46 17341 42 21710 39 260...
527,A001-C-202_patch_0_0_largeintestine,464156 48 468521 54 472880 72 477245 78 481609...,A001,largeintestine,848698 14 853059 30 857421 39 861783 47 866147...


In [7]:
# plt.rcParams["figure.figsize"] = [20,20]

In [8]:
scores = []
pa_list = []
ji_list = []
haus_dis_list = []

for img in df_truth['filename'].unique():
    
    organ = df_truth[df_truth['filename'] == img]['tissue_name']
    im_orig = tifffile.imread(str(DATA_PATH)+"/test/"+img+".tif")
    
    shape_ = im_orig.shape
    shape = (shape_[0],shape_[1])#df_info[df_info.image_file == img][['width_pixels', 'height_pixels']].values.astype(int)[0]
    truth = df_truth[df_truth['filename'] == img]['rle']
    mask_truth = enc2mask(truth, shape)
    print(img)
    pred = df_pred[df_pred['filename'] == img]['predicted']
    mask_pred = enc2mask(pred, shape)
    #fig,ax = plt.subplots(1,3)
    # # plt.title(f"Organ {organ.values[0]}")
    # plt.suptitle()
    # fig.tight_layout()

    # ax[0].imshow(im_orig)
    # ax[0].set_title(f"Image Organ = {organ.values[0]}")
    # ax[1].imshow(mask_truth)
    # ax[1].set_title("Ground truth")
    # ax[2].imshow(mask_pred)
    # ax[2].set_title("Predicted mask")
    #plt.show()
    score = dice_scores_img(mask_pred, mask_truth)
    print (score)
#     pa, ji, haus = perf_metrics(mask_pred, mask_truth)    
#     pa_list.append (pa)
#     ji_list.append(ji)
#     haus_dis_list.append(haus[0])
    scores.append(score)

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  if isinstance(enc, np.float) and np.isnan(enc):


27152_70666_A_2_4_lung
0.0010151704535273356
62818_166850_A_1_4_lung
0.25515285865036375
65896_153870_A_1_4_lung
0.07411930044584136
18959_147530_A_1_4_lung
1.0566580020709381e-13
72854_159060_A_1_4_lung
0.5218219209384009
47995_147596_A_2_4_lung
0.3100658658440473
62818_166850_A_2_4_lung
0.06946466771591848
24257_52477_A_3_4_lung
0.1802578120761332
23107_50118_A_3_4_lung
0.030555494104416495
13466_31726_A_3_4_lung
0.16026289123105278
62818_166850_A_3_4_lung
0.29519493598048513
67951_159262_A_3_4_lung
0.43262840080402887
74886_167794_A_3_4_lung
0.34055840289853934
35560_64867_A_3_4_lung
0.6056827419589955
16726_36293_A_9_3_largeintestine
0.9223917700724058
13466_31726_A_9_3_largeintestine
0.9228840254614296
72854_159060_A_8_3_largeintestine
0.9428556574788176
28437_161560_A_8_3_largeintestine
0.8469021547119622
67951_159262_A_8_3_largeintestine
0.8820870608698997
67951_159262_A_7_3_largeintestine
0.8387139771174745
18959_147530_A_9_3_largeintestine
0.9246639571909144
52532_138109_A_9_3

In [9]:
l = len(df_pred)
for img, s in zip(df_truth[5:]['filename'],scores):
    print (round(s, 3))
    
print ("Average Dice Score = ", round(sum(scores)/l,3))

0.001
0.255
0.074
0.0
0.522
0.31
0.069
0.18
0.031
0.16
0.295
0.433
0.341
0.606
0.922
0.923
0.943
0.847
0.882
0.839
0.925
0.922
0.935
0.913
0.926
0.947
0.895
0.911
0.913
0.925
0.859
0.936
0.976
0.968
0.946
0.983
0.987
0.975
0.938
0.767
0.985
0.95
0.893
0.972
0.958
0.986
0.97
0.912
0.958
0.98
0.912
0.727
0.921
0.855
0.657
0.841
0.601
0.917
0.369
0.819
0.821
0.808
0.711
0.933
0.765
0.464
0.928
0.748
0.959
0.91
0.884
0.944
0.81
0.827
0.929
0.92
0.912
0.943
0.938
0.467
0.875
0.628
0.665
0.477
0.527
0.595
0.631
0.56
0.413
0.345
0.621
0.606
0.594
0.283
0.705
0.498
0.645
0.796
0.22
0.537
0.677
0.704
0.394
0.755
0.333
0.498
0.514
0.532
0.549
0.228
0.5
0.641
0.714
0.475
0.529
0.287
0.605
0.266
0.48
0.542
0.654
0.514
0.563
0.381
0.359
0.542
0.633
0.529
0.633
0.532
0.768
0.483
0.512
0.596
0.688
0.571
0.503
0.551
0.587
0.548
0.666
0.365
0.768
0.616
0.591
0.493
0.59
0.488
0.439
0.54
0.757
0.46
0.546
0.368
0.639
0.665
0.472
0.408
0.592
0.615
0.543
0.517
0.663
0.567
0.179
0.521
0.771
0.327
0.796
0.701

In [10]:
df_pred['dice_scores'] = scores

In [11]:
df_pred

Unnamed: 0,filename,rle,patient_id,tissue_name,predicted,dice_scores
0,27152_70666_A_2_4_lung,3062157 5 3065157 7 3068156 10 3071156 13 3074...,2268.0,lung,4690160 3 6022375 1 6025369 1 6025375 3 609136...,1.015170e-03
1,62818_166850_A_1_4_lung,1622123 3 1625122 4 1628121 6 1631121 6 163412...,4840.0,lung,1357886 2 1357895 2 1360887 1 1360894 3 136389...,2.551529e-01
2,65896_153870_A_1_4_lung,1633318 17 1636316 23 1639312 28 1642312 28 16...,2208.0,lung,3073906 14 3073924 7 3076906 24 3079906 25 308...,7.411930e-02
3,18959_147530_A_1_4_lung,4583458 13 4586454 18 4589449 24 4592447 26 45...,2208.0,lung,,1.056658e-13
4,72854_159060_A_1_4_lung,1975433 1 1978433 2 1981434 2 1984434 3 198446...,4840.0,lung,1867458 11 1870457 13 1873455 26 1876451 33 18...,5.218219e-01
...,...,...,...,...,...,...
524,JP072_patch_1_0_largeintestine,612096 85 616462 87 620828 89 625194 91 629560...,JP,largeintestine,297566 5 301932 6 301943 1 306294 1 306298 8 3...,1.124938e-01
525,A002-C-025_patch_1_0_largeintestine,603744 19 608107 25 612471 30 616834 36 621199...,A002,largeintestine,1778906 2 1783273 4 1787640 7 1792007 9 179637...,3.213393e-01
526,A001-C-224_patch_1_0_largeintestine,1 276 523 428 4107 139 4368 276 4890 429 8475 ...,A001,largeintestine,4236 49 8603 49 12972 46 17341 42 21710 39 260...,1.281156e-01
527,A001-C-202_patch_0_0_largeintestine,464156 48 468521 54 472880 72 477245 78 481609...,A001,largeintestine,848698 14 853059 30 857421 39 861783 47 866147...,6.675470e-01


In [12]:
df_pred.to_csv("test_result_dice_complete.csv",index=None)

In [13]:
df_pred

Unnamed: 0,filename,rle,patient_id,tissue_name,predicted,dice_scores
0,27152_70666_A_2_4_lung,3062157 5 3065157 7 3068156 10 3071156 13 3074...,2268.0,lung,4690160 3 6022375 1 6025369 1 6025375 3 609136...,1.015170e-03
1,62818_166850_A_1_4_lung,1622123 3 1625122 4 1628121 6 1631121 6 163412...,4840.0,lung,1357886 2 1357895 2 1360887 1 1360894 3 136389...,2.551529e-01
2,65896_153870_A_1_4_lung,1633318 17 1636316 23 1639312 28 1642312 28 16...,2208.0,lung,3073906 14 3073924 7 3076906 24 3079906 25 308...,7.411930e-02
3,18959_147530_A_1_4_lung,4583458 13 4586454 18 4589449 24 4592447 26 45...,2208.0,lung,,1.056658e-13
4,72854_159060_A_1_4_lung,1975433 1 1978433 2 1981434 2 1984434 3 198446...,4840.0,lung,1867458 11 1870457 13 1873455 26 1876451 33 18...,5.218219e-01
...,...,...,...,...,...,...
524,JP072_patch_1_0_largeintestine,612096 85 616462 87 620828 89 625194 91 629560...,JP,largeintestine,297566 5 301932 6 301943 1 306294 1 306298 8 3...,1.124938e-01
525,A002-C-025_patch_1_0_largeintestine,603744 19 608107 25 612471 30 616834 36 621199...,A002,largeintestine,1778906 2 1783273 4 1787640 7 1792007 9 179637...,3.213393e-01
526,A001-C-224_patch_1_0_largeintestine,1 276 523 428 4107 139 4368 276 4890 429 8475 ...,A001,largeintestine,4236 49 8603 49 12972 46 17341 42 21710 39 260...,1.281156e-01
527,A001-C-202_patch_0_0_largeintestine,464156 48 468521 54 472880 72 477245 78 481609...,A001,largeintestine,848698 14 853059 30 857421 39 861783 47 866147...,6.675470e-01


In [14]:
df_pred = pd.read_csv("test_result_dice_complete.csv")
df_meta = pd.read_csv("/N/slate/yashjain/kaggle_data_package/kaggle_data_multiftu/data/gftu_dataset/metadata.csv")

In [15]:
df_meta

Unnamed: 0,tissue_name,antibody_id,patient_id,age,sex,tmaslide_id,data_type,filename
0,lung,41507.0,2268.0,49.0,Female,20136118.0,public,41507_88065_A_2_4_lung
1,lung,44657.0,2268.0,49.0,Female,20160434.0,public,44657_99514_A_2_4_lung
2,lung,68695.0,4840.0,43.0,Female,20361528.0,public,68695_162533_A_1_4_lung
3,lung,35856.0,2208.0,67.0,Female,20094887.0,public,35856_71907_A_1_4_lung
4,lung,54950.0,2208.0,67.0,Female,20275557.0,public,54950_136422_A_1_4_lung
...,...,...,...,...,...,...,...,...
876,largeintestine,,JP,33.0,Male,,hubmap,JP072_patch_1_0_largeintestine
877,largeintestine,,A002,22.0,Female,,hubmap,A002-C-025_patch_1_0_largeintestine
878,largeintestine,,A001,48.0,Male,,hubmap,A001-C-224_patch_1_0_largeintestine
879,largeintestine,,A001,48.0,Male,,hubmap,A001-C-202_patch_0_0_largeintestine


In [16]:
def get_source(x):
    return df_meta[df_meta["filename"]==x]['data_type'].values[0]
    

In [17]:
df_pred['source'] = df_pred['filename'].apply(lambda x: get_source(x))

In [18]:
df_pred

Unnamed: 0,filename,rle,patient_id,tissue_name,predicted,dice_scores,source
0,27152_70666_A_2_4_lung,3062157 5 3065157 7 3068156 10 3071156 13 3074...,2268.0,lung,4690160 3 6022375 1 6025369 1 6025375 3 609136...,1.015170e-03,private
1,62818_166850_A_1_4_lung,1622123 3 1625122 4 1628121 6 1631121 6 163412...,4840.0,lung,1357886 2 1357895 2 1360887 1 1360894 3 136389...,2.551529e-01,private
2,65896_153870_A_1_4_lung,1633318 17 1636316 23 1639312 28 1642312 28 16...,2208.0,lung,3073906 14 3073924 7 3076906 24 3079906 25 308...,7.411930e-02,private
3,18959_147530_A_1_4_lung,4583458 13 4586454 18 4589449 24 4592447 26 45...,2208.0,lung,,1.056658e-13,private
4,72854_159060_A_1_4_lung,1975433 1 1978433 2 1981434 2 1984434 3 198446...,4840.0,lung,1867458 11 1870457 13 1873455 26 1876451 33 18...,5.218219e-01,private
...,...,...,...,...,...,...,...
524,JP072_patch_1_0_largeintestine,612096 85 616462 87 620828 89 625194 91 629560...,JP,largeintestine,297566 5 301932 6 301943 1 306294 1 306298 8 3...,1.124938e-01,hubmap
525,A002-C-025_patch_1_0_largeintestine,603744 19 608107 25 612471 30 616834 36 621199...,A002,largeintestine,1778906 2 1783273 4 1787640 7 1792007 9 179637...,3.213393e-01,hubmap
526,A001-C-224_patch_1_0_largeintestine,1 276 523 428 4107 139 4368 276 4890 429 8475 ...,A001,largeintestine,4236 49 8603 49 12972 46 17341 42 21710 39 260...,1.281156e-01,hubmap
527,A001-C-202_patch_0_0_largeintestine,464156 48 468521 54 472880 72 477245 78 481609...,A001,largeintestine,848698 14 853059 30 857421 39 861783 47 866147...,6.675470e-01,hubmap


In [19]:
df_pred.drop(["rle","predicted"],axis=1,inplace=True)

In [20]:
df_pred

Unnamed: 0,filename,patient_id,tissue_name,dice_scores,source
0,27152_70666_A_2_4_lung,2268.0,lung,1.015170e-03,private
1,62818_166850_A_1_4_lung,4840.0,lung,2.551529e-01,private
2,65896_153870_A_1_4_lung,2208.0,lung,7.411930e-02,private
3,18959_147530_A_1_4_lung,2208.0,lung,1.056658e-13,private
4,72854_159060_A_1_4_lung,4840.0,lung,5.218219e-01,private
...,...,...,...,...,...
524,JP072_patch_1_0_largeintestine,JP,largeintestine,1.124938e-01,hubmap
525,A002-C-025_patch_1_0_largeintestine,A002,largeintestine,3.213393e-01,hubmap
526,A001-C-224_patch_1_0_largeintestine,A001,largeintestine,1.281156e-01,hubmap
527,A001-C-202_patch_0_0_largeintestine,A001,largeintestine,6.675470e-01,hubmap


Lung

In [21]:
print(f"Number of datapoints : lung = {len(df_pred[df_pred['tissue_name'] == 'lung']['dice_scores'])}")
print(f"Average dice score : lung = {np.mean(df_pred[df_pred['tissue_name'] == 'lung']['dice_scores'])}")

Number of datapoints : lung = 129
Average dice score : lung = 0.5061202747409262


In [22]:
print(f"Number of private HPA datapoints : lung = {len(df_pred[(df_pred['tissue_name'] == 'lung') & (df_pred['source'] == 'private')]['dice_scores'])}")
print(f"Average dice score private HPA data : lung = {np.mean(df_pred[(df_pred['tissue_name'] == 'lung') & (df_pred['source'] == 'private')]['dice_scores'])}")

Number of private HPA datapoints : lung = 14
Average dice score private HPA data : lung = 0.2340557473644182


In [23]:
print(f"Number of  HuBMAP datapoints : lung = {len(df_pred[(df_pred['tissue_name'] == 'lung') & (df_pred['source'] == 'hubmap')]['dice_scores'])}")
print(f"Average dice score  HuBMAP data : lung = {np.mean(df_pred[(df_pred['tissue_name'] == 'lung') & (df_pred['source'] == 'hubmap')]['dice_scores'])}")

Number of  HuBMAP datapoints : lung = 115
Average dice score  HuBMAP data : lung = 0.5392411737258924


Kidney

In [24]:
print(f"Number of datapoints : kidney = {len(df_pred[df_pred['tissue_name'] == 'kidney']['dice_scores'])}")
print(f"Average dice score : kidney = {np.mean(df_pred[df_pred['tissue_name'] == 'kidney']['dice_scores'])}")

Number of datapoints : kidney = 98
Average dice score : kidney = 0.8762530627353433


In [25]:
print(f"Number of private HPA datapoints : kidney = {len(df_pred[(df_pred['tissue_name'] == 'kidney') & (df_pred['source'] == 'private')]['dice_scores'])}")
print(f"Average dice score private HPA data : kidney = {np.mean(df_pred[(df_pred['tissue_name'] == 'kidney') & (df_pred['source'] == 'private')]['dice_scores'])}")

Number of private HPA datapoints : kidney = 19
Average dice score private HPA data : kidney = 0.9480866722750791


In [26]:
print(f"Number of  HuBMAP datapoints : kidney = {len(df_pred[(df_pred['tissue_name'] == 'kidney') & (df_pred['source'] == 'hubmap')]['dice_scores'])}")
print(f"Average dice score  HuBMAP data : kidney = {np.mean(df_pred[(df_pred['tissue_name'] == 'kidney') & (df_pred['source'] == 'hubmap')]['dice_scores'])}")

Number of  HuBMAP datapoints : kidney = 79
Average dice score  HuBMAP data : kidney = 0.8589766249979389


largeintestine

In [27]:
print(f"Number of datapoints : largeintestine = {len(df_pred[df_pred['tissue_name'] == 'largeintestine']['dice_scores'])}")
print(f"Average dice score : largeintestine = {np.mean(df_pred[df_pred['tissue_name'] == 'largeintestine']['dice_scores'])}")

Number of datapoints : largeintestine = 61
Average dice score : largeintestine = 0.5703691006293197


In [28]:
print(f"Number of private HPA datapoints : largeintestine = {len(df_pred[(df_pred['tissue_name'] == 'largeintestine') & (df_pred['source'] == 'private')]['dice_scores'])}")
print(f"Average dice score private HPA data : largeintestine = {np.mean(df_pred[(df_pred['tissue_name'] == 'largeintestine') & (df_pred['source'] == 'private')]['dice_scores'])}")

Number of private HPA datapoints : largeintestine = 18
Average dice score private HPA data : largeintestine = 0.9089852012248093


In [29]:
print(f"Number of  HuBMAP datapoints : largeintestine = {len(df_pred[(df_pred['tissue_name'] == 'largeintestine') & (df_pred['source'] == 'hubmap')]['dice_scores'])}")
print(f"Average dice score  HuBMAP data : largeintestine = {np.mean(df_pred[(df_pred['tissue_name'] == 'largeintestine') & (df_pred['source'] == 'hubmap')]['dice_scores'])}")

Number of  HuBMAP datapoints : largeintestine = 43
Average dice score  HuBMAP data : largeintestine = 0.4286228259614401


spleen

In [30]:
print(f"Number of datapoints : spleen = {len(df_pred[df_pred['tissue_name'] == 'spleen']['dice_scores'])}")
print(f"Average dice score : spleen = {np.mean(df_pred[df_pred['tissue_name'] == 'spleen']['dice_scores'])}")

Number of datapoints : spleen = 125
Average dice score : spleen = 0.7835419764348238


In [31]:
print(f"Number of private HPA datapoints : spleen = {len(df_pred[(df_pred['tissue_name'] == 'spleen') & (df_pred['source'] == 'private')]['dice_scores'])}")
print(f"Average dice score private HPA data : spleen = {np.mean(df_pred[(df_pred['tissue_name'] == 'spleen') & (df_pred['source'] == 'private')]['dice_scores'])}")

Number of private HPA datapoints : spleen = 12
Average dice score private HPA data : spleen = 0.7540076475006572


In [32]:
print(f"Number of  HuBMAP datapoints : spleen = {len(df_pred[(df_pred['tissue_name'] == 'spleen') & (df_pred['source'] == 'hubmap')]['dice_scores'])}")
print(f"Average dice score  HuBMAP data : spleen = {np.mean(df_pred[(df_pred['tissue_name'] == 'spleen') & (df_pred['source'] == 'hubmap')]['dice_scores'])}")

Number of  HuBMAP datapoints : spleen = 113
Average dice score  HuBMAP data : spleen = 0.7866783653481864


Prostate

In [33]:
print(f"Number of datapoints : prostate = {len(df_pred[df_pred['tissue_name'] == 'prostate']['dice_scores'])}")
print(f"Average dice score : prostate = {np.mean(df_pred[df_pred['tissue_name'] == 'prostate']['dice_scores'])}")

Number of datapoints : prostate = 116
Average dice score : prostate = 0.15486765892695709


In [34]:
print(f"Number of private HPA datapoints : prostate = {len(df_pred[(df_pred['tissue_name'] == 'prostate') & (df_pred['source'] == 'private')]['dice_scores'])}")
print(f"Average dice score private HPA data : prostate = {np.mean(df_pred[(df_pred['tissue_name'] == 'prostate') & (df_pred['source'] == 'private')]['dice_scores'])}")

Number of private HPA datapoints : prostate = 18
Average dice score private HPA data : prostate = 0.841941230914148


In [35]:
print(f"Number of  HuBMAP datapoints : prostate = {len(df_pred[(df_pred['tissue_name'] == 'prostate') & (df_pred['source'] == 'hubmap')]['dice_scores'])}")
print(f"Average dice score  HuBMAP data : prostate = {np.mean(df_pred[(df_pred['tissue_name'] == 'prostate') & (df_pred['source'] == 'hubmap')]['dice_scores'])}")

Number of  HuBMAP datapoints : prostate = 98
Average dice score  HuBMAP data : prostate = 0.028670472235432218


In [36]:
np.mean(df_pred['dice_scores'])

0.5706280268769601