**About** : This notebook is used to downscale the images of the train and test sets, in order to speed up training and inference.
  - Use the `FACTOR` parameter to specify the downscaling factor. We recommend generating data with downscaling factors 2 and 4.
  - For training data, we save extra time by also computing the downscaled RLEs. Use the `NAME` parameter to specify which RLE file to downscale.
  - The downscaled images only need to be saved once; use the `SAVE_IMG` parameter to enable or skip this step.

In [11]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell runs and edits to local code are picked up.
%load_ext autoreload
%autoreload 2

### Imports

In [21]:
import os
import gc
import cv2
import sys
import tifffile
import numpy as np
import pandas as pd

from tqdm.notebook import tqdm
from collections import Counter
from matplotlib import pyplot as plt

sys.path.append("../code/")

In [22]:
from data.dataset import load_image
from utils.rle import *
from params import *

In [26]:
# Downscaling factor: image and mask sizes are integer-divided by FACTOR in the
# cells below. The notebook introduction recommends generating data for 2 and 4.
FACTOR = 3

In [50]:
# Root folders used by this notebook: csv files are read from / written to
# DATA_PATH, and the training .tiff images are read from TIFF_PATH.
# The defaults are specific to one machine; set the HUBMAP_DATA_PATH and
# HUBMAP_TIFF_PATH environment variables to run the notebook elsewhere
# without editing this cell.
# NOTE: the default values do not end with a path separator.
DATA_PATH = os.environ.get(
    "HUBMAP_DATA_PATH",
    r"C:\Users\soodn\Downloads\Naveksha\Kaggle HuBMAP\Scripts\4. DeepLive\HubMap\input",
)
TIFF_PATH = os.environ.get(
    "HUBMAP_TIFF_PATH",
    r"C:\Users\soodn\Downloads\Naveksha\Kaggle HuBMAP\Scripts\4. DeepLive\HubMap\input",
)

### Train

In [51]:
# Output folder for the downscaled training images. The trailing separator
# (added by the empty last component) is required because file names are
# appended to out_dir by plain string concatenation.
out_dir = os.path.join(DATA_PATH, f"train_{FACTOR}", "")

# makedirs(exist_ok=True) also creates missing parent folders and does nothing
# when the folder already exists, so no separate existence check is needed.
os.makedirs(out_dir, exist_ok=True)

In [52]:
# Suffix selecting which annotation csv is downscaled.
NAME = "_onlyfc"  # unhealthy class
# NAME = "_fix"  # healthy class with fixed issues
# NAME = ""  # original data

# When True, the downscaled .tiff images are written as well as the RLEs.
SAVE_IMG = False

# Load the encodings for the selected csv, indexed by image id.
masks_csv = f"{DATA_PATH}/train_colon{NAME}.csv"
df_masks = pd.read_csv(masks_csv).set_index("id")

In [53]:
# Display the loaded encodings table (indexed by image id) as a sanity check.
df_masks

Unnamed: 0_level_0,encoding
id,Unnamed: 1_level_1
CL_HandE_1234_B004_bottomright,
CL_HandE_1234_B004_topleft,
CL_HandE_1234_B004_topright,
HandE_B005_CL_b_RGB_bottomright,
HandE_B005_CL_b_RGB_topleft,


In [49]:
# Maps each image id to the encoding of its downscaled mask.
masks = {}

for index, encs in tqdm(df_masks.iterrows(), total=len(df_masks)):
    # Load the full-resolution image and rebuild its mask from the encoding.
    tiff_file = os.path.join(TIFF_PATH, index + ".tiff")
    img = load_image(tiff_file)
    full_size = (img.shape[1], img.shape[0])
    mask = enc2mask(encs, full_size)

    # Optionally write the downscaled image (area interpolation).
    if SAVE_IMG:
        small_size = (img.shape[1] // FACTOR, img.shape[0] // FACTOR)
        img = cv2.resize(img, small_size, interpolation=cv2.INTER_AREA)
        tifffile.imsave(out_dir + f"{index}.tiff", img)

    # Masks are resized with nearest-neighbour interpolation so that no new
    # (interpolated) values are introduced.
    mask_size = (mask.shape[1] // FACTOR, mask.shape[0] // FACTOR)
    mask = cv2.resize(mask, mask_size, interpolation=cv2.INTER_NEAREST)

    # Re-encode the downscaled mask and store it under the image id.
    rle = mask2enc(mask)
    masks[index] = rle

  0%|          | 0/5 [00:00<?, ?it/s]

FileNotFoundError: [Errno 2] No such file or directory: '../input/HuBMAP-20-dataset_information.csv'

In [44]:
# Turn the {id: encoding} dict into a two-column table ("id", "encoding").
# NOTE: this rebinds df_masks, so the downscaling loop cannot be re-run
# afterwards without reloading the original csv first.
df_masks = pd.DataFrame.from_dict(masks).T.reset_index().rename(columns={0: "encoding", "index": "id"})

# Build the path with os.path.join: DATA_PATH may or may not end with a
# separator, and plain concatenation would otherwise write the file next to
# (not inside) the data folder.
out_csv = os.path.join(DATA_PATH, f"train_{FACTOR}{NAME}.csv")
df_masks.to_csv(out_csv, index=False)

print(f"Saved data to {out_csv}")

Saved data to ../input/train_2_fix.csv


### Test

In [49]:
# Output folder for the downscaled test images. os.path.join inserts the
# separator that DATA_PATH may be missing, and the empty last component keeps
# the trailing separator needed when file names are appended to out_dir.
out_dir = os.path.join(DATA_PATH, f"test_{FACTOR}", "")

# Creates missing parent folders and does nothing if the folder already exists.
os.makedirs(out_dir, exist_ok=True)

In [50]:
# Read the sample submission, whose "id" column lists the test images.
# os.path.join is used because DATA_PATH may not end with a path separator.
df = pd.read_csv(os.path.join(DATA_PATH, "sample_submission.csv"))

In [51]:
# Downscale every test image listed in the sample submission and save it to out_dir.
for index in tqdm(df["id"]):
    # Load the full-resolution image.
    tiff_file = os.path.join(TIFF_PATH_TEST, index + ".tiff")
    img = load_image(tiff_file)

    # cv2.resize takes the target size as (width, height).
    target_size = (img.shape[1] // FACTOR, img.shape[0] // FACTOR)
    img = cv2.resize(img, target_size, interpolation=cv2.INTER_AREA)

    tifffile.imsave(out_dir + f"{index}.tiff", img)

  0%|          | 0/5 [00:00<?, ?it/s]

In [52]:
# Downscale the additional images listed in EXTRA_IMGS and save them to out_dir.
for index in tqdm(EXTRA_IMGS):
    # Load the full-resolution image.
    img = load_image(os.path.join(TIFF_PATH_TEST, index + ".tiff"))

    # cv2.resize takes the target size as (width, height).
    img = cv2.resize(
        img,
        (img.shape[1] // FACTOR, img.shape[0] // FACTOR),
        interpolation=cv2.INTER_AREA,
    )

    # This call was over-indented by one space, which raised an IndentationError.
    tifffile.imsave(out_dir + f"{index}.tiff", img)

  0%|          | 0/2 [00:00<?, ?it/s]