**About** : This notebook is used to retrieve hand-made annotations. 
  - Use the `ADD_FC` and `ONLY_FC` parameters to generate labels for the healthy and unhealthy classes.
  - Use the `SAVE_TIFF `parameter to save the external data as tiff files of half resolution.
  - Use the `PLOT` parameter to visualize the masks.
  - Use the `SAVE` parameter to save the masks as rle. 

In [2]:
%load_ext autoreload
%autoreload 2

## Initialization

### Imports

In [58]:
import os
import sys
import cv2
import json
import glob
import rasterio
import tifffile
import numpy as np
import pandas as pd

from tqdm.notebook import tqdm

sys.path.append("../code/")

In [59]:
from params import *
from utils.rle import *
from data.dataset import load_image
from utils.plots import plot_contours_preds

In [60]:
IDENTITY = rasterio.Affine(1, 0, 0, 0, 1, 0)

### Load

In [56]:
DATA_PATH

'../input/'

In [61]:
df_info = pd.read_csv(DATA_PATH + f"HuBMAP-20-dataset_information.csv")
df_mask = pd.read_csv(DATA_PATH + "train.csv")

ANNOT_PATH = DATA_PATH + "/annot/"

In [62]:
df_info

Unnamed: 0,image_file,width_pixels,height_pixels,anatomical_structures_segmention_file,glomerulus_segmentation_file,patient_number,race,ethnicity,sex,age,weight_kilograms,height_centimeters,bmi_kg/m^2,laterality,percent_cortex,percent_medulla
0,CL_HandE_1234_B004_bottomleft,4704,4536,aa05346ff-anatomical-structure.json,aa05346ff.json,67347,White,Not Hispanic or Latino,Female,58,59.0,160.0,23.0,Right,80,20
1,CL_HandE_1234_B004_bottomright,4704,4536,afa5e8098-anatomical-structure.json,afa5e8098.json,67377,White,Not Hispanic or Latino,Female,58,59.0,160.0,23.0,Right,55,45
2,CL_HandE_1234_B004_topleft,4704,4536,54f2eec69-anatomical-structure.json,54f2eec69.json,67548,Black or African American,Not Hispanic or Latino,Male,58,79.9,190.5,22.0,Right,75,25
3,CL_HandE_1234_B004_topright,4704,4536,d488c759a-anatomical-structure.json,d488c759a.json,68138,White,Not Hispanic or Latino,Female,66,81.5,158.8,32.2,Left,100,0
4,HandE_B005_CL_b_RGB_bottomleft,4704,4536,1e2425f28-anatomical-structure.json,1e2425f28.json,63921,White,Not Hispanic or Latino,Male,48,131.5,193.0,35.3,Right,65,35
5,HandE_B005_CL_b_RGB_bottomright,4704,4536,e79de561c-anatomical-structure.json,e79de561c.json,67026,Black or African American,Not Hispanic or Latino,Male,53,73.0,166.0,26.5,Left,55,45
6,HandE_B005_CL_b_RGB_topleft,4704,4536,c68fe75ea-anatomical-structure.json,c68fe75ea.json,67112,White,Not Hispanic or Latino,Male,56,91.2,167.6,32.5,Left,80,20
7,HandE_B005_CL_b_RGB_topright,4704,4536,095bf7a1f-anatomical-structure.json,095bf7a1f.json,68250,White,Not Hispanic or Latino,Female,44,71.7,160.0,28.0,Right,65,35


In [63]:
df_mask = df_mask.rename(columns={"predicted":"encoding"})
df_mask = df_mask[df_mask.id != 'HandE_B005_CL_b_RGB_topright']

In [64]:
df_mask

Unnamed: 0,id,encoding
0,CL_HandE_1234_B004_bottomright,12972850 36 12977382 42 12981914 47 12986448 5...
1,CL_HandE_1234_B004_topleft,2023885 36 2028419 40 2032953 44 2037487 47 20...
2,CL_HandE_1234_B004_topright,372094 8 376628 12 381163 14 385698 16 390233 ...
3,HandE_B005_CL_b_RGB_bottomright,2247721 26 2252254 31 2256788 34 2261323 36 22...
4,HandE_B005_CL_b_RGB_topleft,366048 27 370578 43 375111 48 379646 52 384181...


In [83]:
PLOT = False
ADD_FC = True
ONLY_FC = True

### Train data

In [84]:
new_df = df_mask.copy().set_index('id')
if ONLY_FC:
    new_df['encoding'] = ""

for id_ in tqdm(df_mask['id']):
    print(f' -> {id_}')
    if id_ + ".json" in os.listdir(ANNOT_PATH):        
        annot = json.load(open(ANNOT_PATH + id_ + ".json", 'r'))
        
        w, h = df_info[df_info['image_file'] == id_][['width_pixels', 'height_pixels']].values[0]
        
        rle = df_mask[df_mask['id'] == id_]['encoding']
        
#       mask = enc2mask(rle, (w, h)).astype(np.uint8)  # smh not working
        mask = np.zeros((h, w), dtype=np.uint8)
        if not ONLY_FC:
            mask += enc2mask(rle, (w, h)).astype(np.uint8)
        
        added = 0
        for info in annot:
            label = info['properties']['classification']['name']

            if (not ADD_FC) and (label == "FC"):
                continue
                    
            if ONLY_FC and label != "FC":
                continue

            poly = info['geometry']['coordinates']
            try:
                mask = cv2.fillPoly(mask, np.int32([poly]), True)
            except ValueError:
                poly = np.concatenate([np.array(poly[i]).squeeze() for i in range(len(poly))])
                mask = cv2.fillPoly(mask, np.int32([poly]), True)
            added +=1
            
        print(f"Added {added} glomerulis")
        
        new_df.loc[id_] = rle_encode_less_memory(mask)
        
        if PLOT:
            img = load_image(os.path.join(TIFF_PATH, id_ + ".tiff"), full_size=False)
            
            mask = cv2.resize(
                mask,
                (w // 4, h // 4),
                interpolation=cv2.INTER_NEAREST,
            )
            assert mask.shape == img.shape[:2], (mask.shape, img.shape)
        
            fig = plot_contours_preds(img, mask, w=1, downsize=4)
            w = 1000
            h = int(w *  mask.shape[0] / mask.shape[1])
            fig.update_layout(
                autosize=False,
                width=w,
                height=h,
            )

            fig.show()

            break

if not PLOT:
    name = "train_fix.csv" if not ADD_FC else "train_fc.csv"
    if ONLY_FC:
        name = "train_onlyfc.csv"
    new_df.to_csv(DATA_PATH + name)
    print (name)
    print(f'\n -> Saved masks to {DATA_PATH + name}')

  0%|          | 0/5 [00:00<?, ?it/s]

 -> CL_HandE_1234_B004_bottomright
Added 0 glomerulis
 -> CL_HandE_1234_B004_topleft
Added 0 glomerulis
 -> CL_HandE_1234_B004_topright
Added 0 glomerulis
 -> HandE_B005_CL_b_RGB_bottomright
Added 0 glomerulis
 -> HandE_B005_CL_b_RGB_topleft
Added 0 glomerulis
train_onlyfc.csv

 -> Saved masks to ../input/train_onlyfc.csv


### Extra data

In [85]:
PLOT = False
SAVE_TIFF = True
SAVE = True
# ADD_FC = True
# ONLY_FC = True

In [86]:
files = [p for p in os.listdir(DATA_PATH + "extra/") if p.endswith("svs")]
rles = {}
print (DATA_PATH, files)

for file in tqdm(files):
    id_ = file[:-4]
    print(f' -> {id_}')
    
#     if id_ != "SAS_21908_001":
#         continue
    
    if os.path.exists(ANNOT_PATH + id_ + ".json"):
        original_img = rasterio.open(DATA_PATH + "extra/" + file, transform=IDENTITY, num_threads='all_cpus')
        img = original_img.read([1, 2, 3]).transpose(1, 2, 0).astype(np.uint8)

        shape = img.shape[:2]
        
        annot = json.load(open(ANNOT_PATH + id_ + ".json", 'r'))

        mask = np.zeros(shape, dtype=np.uint8)

        added = 0
        for info in annot:
            poly = np.array(info['geometry']['coordinates'])
            
            try:
                label = info['properties']['classification']['name']
            except KeyError:
                label = "G"
            
            if not ADD_FC and label == "FC":
                continue

            if ONLY_FC and label != "FC":
                continue
                
            poly = info['geometry']['coordinates']
            try:
                mask = cv2.fillPoly(mask, np.int32([poly]), True)
            except ValueError:
                poly = np.concatenate([np.array(poly[i]).squeeze() for i in range(len(poly))])
                mask = cv2.fillPoly(mask, np.int32([poly]), True)
            added += 1
        
        print(f"Added {added} glomerulis")
        
        if PLOT:
            print('plot')
            fig = plot_contours_preds(img, mask, w=2, downsize=8)

            w = 1000
            h = int(w *  mask.shape[0] / mask.shape[1])
            fig.update_layout(
                autosize=False,
                width=w,
                height=h,
            )

            fig.show()

            break
            
        if SAVE:
            if SAVE_TIFF:
                img = cv2.resize(
                    img,
                    (img.shape[1] // 2, img.shape[0] // 2),
                    interpolation=cv2.INTER_AREA,
                )
                    
                if not os.path.exists(DATA_PATH + "extra_tiff/"):
                    os.mkdir(DATA_PATH + "extra_tiff/")
                tifffile.imsave(DATA_PATH + "extra_tiff/" + f"{id_}.tiff", img)

            mask = cv2.resize(
                mask,
                (mask.shape[1] // 2, mask.shape[0] // 2),
                interpolation=cv2.INTER_NEAREST,
            )

            print (id_, mask)
            rles[id_] = rle_encode_less_memory(mask)

../input/ ['SAS_21883_001.svs', 'SAS_21891_001.svs', 'SAS_21896_001.svs', 'SAS_21904_001.svs', 'SAS_21908_001.svs', 'SAS_21915_001.svs', 'SAS_21924_001.svs', 'SAS_21930_001.svs', 'SAS_21937_001.svs', 'SAS_21942_001.svs', 'SESCAM_102.svs', 'SESCAM_1_0.svs', 'SESCAM_2_0.svs', 'SESCAM_3_0.svs', 'SESCAM_4_0.svs', 'SESCAM_5_0.svs', 'SESCAM_6_0.svs', 'SESCAM_7_0.svs', 'SESCAM_8_0.svs', 'SESCAM_9_0.svs', 'VUHSK_1272.svs', 'VUHSK_1352.svs', 'VUHSK_1432.svs', 'VUHSK_1502.svs', 'VUHSK_1622.svs', 'VUHSK_1702.svs', 'VUHSK_1762.svs', 'VUHSK_1832.svs', 'VUHSK_1912.svs', 'VUHSK_1992.svs', 'VUHSK_2072.svs']


  0%|          | 0/31 [00:00<?, ?it/s]

 -> SAS_21883_001


  s = DatasetReader(path, driver=driver, sharing=sharing, **kwargs)
  poly = np.array(info['geometry']['coordinates'])


Added 7 glomerulis
SAS_21883_001 [[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
 -> SAS_21891_001
Added 0 glomerulis
SAS_21891_001 [[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
 -> SAS_21896_001
Added 2 glomerulis
SAS_21896_001 [[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
 -> SAS_21904_001
Added 3 glomerulis
SAS_21904_001 [[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
 -> SAS_21908_001
Added 0 glomerulis
SAS_21908_001 [[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
 -> SAS_21915_001
Added 8 glomerulis
SAS_21915_001 [[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 

In [87]:
df_annot_extra = pd.DataFrame.from_dict(rles, orient='index', columns=['encoding'])
df_annot_extra['id'] = df_annot_extra.index
df_annot_extra.reset_index(drop = True)

Unnamed: 0,encoding,id
0,15461245 4 15468525 8 15475805 13 15483085 18 ...,SAS_21883_001
1,,SAS_21891_001
2,17955006 10 17973129 24 17991253 38 18009376 5...,SAS_21896_001
3,22660434 12 22675587 28 22690740 35 22705893 3...,SAS_21904_001
4,,SAS_21908_001
5,3063050 4 3075321 17 3087600 22 3099878 28 311...,SAS_21915_001
6,20206498 3 20211741 16 20216984 29 20222227 42...,SAS_21924_001
7,,SAS_21930_001
8,3643191 3 3652290 15 3661389 27 3670486 40 367...,SAS_21937_001
9,10823529 6 10829088 15 10834647 23 10840206 31...,SAS_21942_001


In [88]:
if SAVE and not PLOT:
    name = "train_extra.csv" if not ADD_FC else "train_extra_fc.csv"
    if ONLY_FC:
        name = "train_extra_onlyfc.csv"
    df_annot_extra.to_csv(DATA_PATH + name)
    print(f'\n -> Saved masks to {DATA_PATH + name}')


 -> Saved masks to ../input/train_extra_onlyfc.csv
