**About** : This notebook is used to retrieve hand-made annotations. 
  - Use the `ADD_FC` and `ONLY_FC` parameters to generate labels for the healthy and unhealthy classes.
  - Use the `SAVE_TIFF `parameter to save the external data as tiff files of half resolution.
  - Use the `PLOT` parameter to visualize the masks.
  - Use the `SAVE` parameter to save the masks as rle. 

In [1]:
%load_ext autoreload
%autoreload 2

## Initialization

### Imports

In [2]:
import os
import sys
import cv2
import json
import glob
import rasterio
import tifffile
import numpy as np
import pandas as pd

from tqdm.notebook import tqdm

sys.path.append("../code/")

In [3]:
from params import *
from utils.rle import *
from data.dataset import load_image
from utils.plots import plot_contours_preds

In [4]:
IDENTITY = rasterio.Affine(1, 0, 0, 0, 1, 0)

### Load

In [5]:
df_info = pd.read_csv(DATA_PATH + f"HuBMAP-20-dataset_information.csv")
df_mask = pd.read_csv(DATA_PATH + "train.csv")

ANNOT_PATH = DATA_PATH + "annotation_v3/"

In [6]:
df_info

Unnamed: 0,image_file,width_pixels,height_pixels,anatomical_structures_segmention_file,glomerulus_segmentation_file,patient_number,race,ethnicity,sex,age,weight_kilograms,height_centimeters,bmi_kg/m^2,laterality,percent_cortex,percent_medulla
0,aa05346ff.tiff,47340,30720,aa05346ff-anatomical-structure.json,aa05346ff.json,67347,White,Not Hispanic or Latino,Female,58,59.0,160.0,23.0,Right,80,20
1,afa5e8098.tiff,43780,36800,afa5e8098-anatomical-structure.json,afa5e8098.json,67347,White,Not Hispanic or Latino,Female,58,59.0,160.0,23.0,Right,55,45
2,54f2eec69.tiff,22240,30440,54f2eec69-anatomical-structure.json,54f2eec69.json,67548,Black or African American,Not Hispanic or Latino,Male,58,79.9,190.5,22.0,Right,75,25
3,d488c759a.tiff,29020,46660,d488c759a-anatomical-structure.json,d488c759a.json,68138,White,Not Hispanic or Latino,Female,66,81.5,158.8,32.2,Left,100,0
4,1e2425f28.tiff,32220,26780,1e2425f28-anatomical-structure.json,1e2425f28.json,63921,White,Not Hispanic or Latino,Male,48,131.5,193.0,35.3,Right,65,35
5,e79de561c.tiff,27020,16180,e79de561c-anatomical-structure.json,e79de561c.json,67026,Black or African American,Not Hispanic or Latino,Male,53,73.0,166.0,26.5,Left,55,45
6,c68fe75ea.tiff,49780,26840,c68fe75ea-anatomical-structure.json,c68fe75ea.json,67112,White,Not Hispanic or Latino,Male,56,91.2,167.6,32.5,Left,80,20
7,095bf7a1f.tiff,39000,38160,095bf7a1f-anatomical-structure.json,095bf7a1f.json,68250,White,Not Hispanic or Latino,Female,44,71.7,160.0,28.0,Right,65,35
8,26dc41664.tiff,42360,38160,26dc41664-anatomical-structure.json,26dc41664.json,68304,White,Not Hispanic or Latino,Female,66,71.3,167.6,25.4,Left,55,45
9,57512b7f1.tiff,43160,33240,57512b7f1-anatomical-structure.json,57512b7f1.json,68555,White,Not Hispanic or Latino,Female,76,93.0,157.4,37.5,Left,80,20


In [11]:
PLOT = False
ADD_FC = False
ONLY_FC = True

### Train data

In [12]:
new_df = df_mask.copy().set_index('id')
if ONLY_FC:
    new_df['encoding'] = ""

for id_ in tqdm(df_mask['id']):
    print(f' -> {id_}')
    if id_ + ".json" in os.listdir(ANNOT_PATH):        
        annot = json.load(open(ANNOT_PATH + id_ + ".json", 'r'))
        
        w, h = df_info[df_info['image_file'] == id_ + '.tiff'][['width_pixels', 'height_pixels']].values[0]
        
        rle = df_mask[df_mask['id'] == id_]['encoding']
        
#       mask = enc2mask(rle, (w, h)).astype(np.uint8)  # smh not working
        mask = np.zeros((h, w), dtype=np.uint8)
        if not ONLY_FC:
            mask += enc2mask(rle, (w, h)).astype(np.uint8)
        
        added = 0
        for info in annot:
            label = info['properties']['classification']['name']

            if (not ADD_FC) and (label == "FC"):
                continue
                    
            if ONLY_FC and label != "FC":
                continue

            poly = info['geometry']['coordinates']
            try:
                mask = cv2.fillPoly(mask, np.int32([poly]), True)
            except ValueError:
                poly = np.concatenate([np.array(poly[i]).squeeze() for i in range(len(poly))])
                mask = cv2.fillPoly(mask, np.int32([poly]), True)
            added +=1
            
        print(f"Added {added} glomerulis")
        
        new_df.loc[id_] = rle_encode_less_memory(mask)
        
        if PLOT:
            img = load_image(os.path.join(TIFF_PATH, id_ + ".tiff"), full_size=False)
            
            mask = cv2.resize(
                mask,
                (w // 4, h // 4),
                interpolation=cv2.INTER_NEAREST,
            )
            assert mask.shape == img.shape[:2], (mask.shape, img.shape)
        
            fig = plot_contours_preds(img, mask, w=1, downsize=4)
            w = 1000
            h = int(w *  mask.shape[0] / mask.shape[1])
            fig.update_layout(
                autosize=False,
                width=w,
                height=h,
            )

            fig.show()

            break

if not PLOT:
    name = "train_fix.csv" if not ADD_FC else "train_fc.csv"
    if ONLY_FC:
        name = "train_onlyfc.csv"
    new_df.to_csv(DATA_PATH + name)
    print (name)
    print(f'\n -> Saved masks to {DATA_PATH + name}')

  0%|          | 0/15 [00:00<?, ?it/s]

 -> 2f6ecfcdf
Added 0 glomerulis
 -> 8242609fa
Added 0 glomerulis
 -> aaa6a05cc
Added 0 glomerulis
 -> cb2d976f4
Added 0 glomerulis
 -> b9a3865fc
Added 0 glomerulis
 -> b2dc8411c
Added 0 glomerulis
 -> 0486052bb
 -> e79de561c
Added 0 glomerulis
 -> 095bf7a1f
Added 0 glomerulis
 -> 54f2eec69
Added 0 glomerulis
 -> 4ef6695ce
Added 0 glomerulis
 -> 26dc41664
 -> c68fe75ea
Added 0 glomerulis
 -> afa5e8098
Added 0 glomerulis
 -> 1e2425f28
Added 0 glomerulis
train_onlyfc.csv

 -> Saved masks to ../input/train_onlyfc.csv


### Extra data

In [16]:
PLOT = False
SAVE_TIFF = True
SAVE = True
ADD_FC = True
ONLY_FC = False


In [None]:
files = [p for p in os.listdir(DATA_PATH + "extra/") if p.endswith("svs")]
rles = {}

for file in tqdm(files):
    id_ = file[:-4]
    print(f' -> {id_}')
    
#     if id_ != "SAS_21908_001":
#         continue
    
    if os.path.exists(ANNOT_PATH + id_ + ".json"):
        original_img = rasterio.open(DATA_PATH + "extra/" + file, transform=IDENTITY, num_threads='all_cpus')
        img = original_img.read([1, 2, 3]).transpose(1, 2, 0).astype(np.uint8)

        shape = img.shape[:2]
        
        annot = json.load(open(ANNOT_PATH + id_ + ".json", 'r'))

        mask = np.zeros(shape, dtype=np.uint8)

        added = 0
        for info in annot:
            poly = np.array(info['geometry']['coordinates'])
            
            try:
                label = info['properties']['classification']['name']
            except KeyError:
                print('??')
                label = "G"
            
            if not ADD_FC and label == "FC":
                continue

            if ONLY_FC and label != "FC":
                continue
                
            poly = info['geometry']['coordinates']
            try:
                mask = cv2.fillPoly(mask, np.int32([poly]), True)
            except ValueError:
                poly = np.concatenate([np.array(poly[i]).squeeze() for i in range(len(poly))])
                mask = cv2.fillPoly(mask, np.int32([poly]), True)
            added += 1
        
        print(f"Added {added} glomerulis")
        
        if PLOT:
            print('plot')
            fig = plot_contours_preds(img, mask, w=2, downsize=8)

            w = 1000
            h = int(w *  mask.shape[0] / mask.shape[1])
            fig.update_layout(
                autosize=False,
                width=w,
                height=h,
            )

            fig.show()

            break
            
        if SAVE:
            if SAVE_TIFF:
                img = cv2.resize(
                    img,
                    (img.shape[1] // 2, img.shape[0] // 2),
                    interpolation=cv2.INTER_AREA,
                )
                    
                if not os.path.exists(DATA_PATH + "extra_tiff/"):
                    os.mkdir(DATA_PATH + "extra_tiff/")
                tifffile.imsave(DATA_PATH + "extra_tiff/" + f"{id_}.tiff", img)

            mask = cv2.resize(
                mask,
                (mask.shape[1] // 2, mask.shape[0] // 2),
                interpolation=cv2.INTER_NEAREST,
            )

            rles[id_] = rle_encode_less_memory(mask)

  0%|          | 0/31 [00:00<?, ?it/s]

 -> SAS_21883_001


  poly = np.array(info['geometry']['coordinates'])


Added 11 glomerulis
 -> SAS_21891_001
Added 13 glomerulis
 -> SAS_21896_001
Added 9 glomerulis
 -> SAS_21904_001
Added 10 glomerulis
 -> SAS_21908_001
Added 7 glomerulis
 -> SAS_21915_001
Added 12 glomerulis
 -> SAS_21924_001
Added 6 glomerulis
 -> SAS_21930_001
Added 5 glomerulis
 -> SAS_21937_001
Added 17 glomerulis
 -> SAS_21942_001
Added 15 glomerulis
 -> SESCAM_102
Added 23 glomerulis
 -> SESCAM_1_0
Added 61 glomerulis
 -> SESCAM_2_0
Added 23 glomerulis
 -> SESCAM_3_0
Added 49 glomerulis
 -> SESCAM_4_0
Added 45 glomerulis


In [None]:
df_annot_extra = pd.DataFrame.from_dict(rles, orient='index', columns=['encoding'])

if SAVE and not PLOT:
    name = "train_extra.csv" if not ADD_FC else "train_extra_fc.csv"
    if ONLY_FC:
        name = "train_extra_onlyfc.csv"
    df_annot_extra.to_csv(DATA_PATH + name)
    print(f'\n -> Saved masks to {DATA_PATH + name}')

In [37]:
name

'train_fix.csv'