<a href="https://colab.research.google.com/github/cwinsor/medical_image_uw_madison/blob/main/colab_01_data_preprocess.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data Preprocessing
This notebook re-implements data_preprocess.py as a colab notebook

In [1]:
import sys
import os
from shutil import rmtree

from google.colab import drive

import numpy as np
import pandas as pd
import cv2
from PIL import Image
import glob
from tqdm.auto import tqdm
import albumentations as A

In [2]:
assert ('google.colab' in sys.modules), "ERROR - the script expects to be run in Colab"

In [6]:
# we persist code, dataset and runs on google drive...
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [7]:
# # warning - do not kill the persist folder - you will lose all the run history!
# # the code below should only executed for initial setup!
# drive_persist_folder = '/content/gdrive/MyDrive/Colab_UW_Madison'
# os.chdir(drive_persist_folder)
# git_url = 'https://github.com/cwinsor/medical_image_uw_madison.git'
# !git clone --quiet $git_url
# os.chdir('medical_image_uw_madison')
# # populate the submodule
# !git submodule update --init

In [8]:
work_folder = '/content/gdrive/MyDrive/Colab_UW_Madison/medical_image_uw_madison/'
os.chdir(work_folder)
print("checking git status (git fetch --dry-run):")
!pwd
!git fetch --dry-run

checking git status (git fetch --dry-run):
/content/gdrive/MyDrive/Colab_UW_Madison/medical_image_uw_madison
remote: Enumerating objects: 11, done.[K
remote: Counting objects: 100% (11/11), done.[K
remote: Compressing objects: 100% (9/9), done.[K
remote: Total 9 (delta 6), reused 0 (delta 0), pack-reused 0[K
Unpacking objects: 100% (9/9), 88.99 KiB | 134.00 KiB/s, done.
From https://github.com/cwinsor/medical_image_uw_madison
   18a51f5..84de5b7  main       -> origin/main


In [9]:
os.chdir(work_folder + 'project/Kaggle-UWMGIT/data/tract')
!pwd

/content/gdrive/MyDrive/Colab_UW_Madison/medical_image_uw_madison/project/Kaggle-UWMGIT/data/tract


In [10]:
def rle_decode(mask_rle, shape):
    s = np.array(mask_rle.split(), dtype=int)
    starts, lengths = s[0::2] - 1, s[1::2]
    ends = starts + lengths
    h, w = shape
    img = np.zeros((h * w,), dtype = np.uint8)
    for lo, hi in zip(starts, ends):
        img[lo : hi] = 1
    return img.reshape(shape)

def rle_encode(img):
    pixels = img.flatten()
    pixels = np.concatenate([[0], pixels, [0]])
    runs = np.where(pixels[1:] != pixels[:-1])[0] + 1
    runs[1::2] -= runs[::2]
    return ' '.join(str(x) for x in runs)

# Side trip to download the data...

In [112]:
# download data
# see README.md in the Kaggle-UMWGIT
# and
# see https://www.kaggle.com/discussions/general/156610

In [113]:
# Go to your kaggle account, Scroll to API section and Click Expire API Token to remove previous tokens
# Click on Create New API Token - It will download kaggle.json file on your machine.

In [11]:
# Now upload the kaggle.json file
from google.colab import files
files.upload() #this will prompt you to upload the kaggle.json

Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"cwinsor","key":"65ca34951bc1467168bd4478bfcd4720"}'}

In [12]:
# make sure kaggle.json file is present
!ls -lha kaggle.json

-rw------- 1 root root 63 Jul 20 16:38 kaggle.json


In [13]:
# Install kaggle API client
!pip install -q kaggle

# kaggle API client expects the file to be in ~/.kaggle
# so move it there
!mkdir -p ~/.kaggle
!mv kaggle.json ~/.kaggle/  # note mv not cp

# we need to set permissions
!chmod 600 /root/.kaggle/kaggle.json

In [14]:
!pwd
if os.path.isdir('train'):
    print("you already have the data downloaded")
else:
    # download the dataset from kaggle
    !kaggle competitions download -c uw-madison-gi-tract-image-segmentation
    # unzip
    !unzip uw-madison-gi-tract-image-segmentation.zip

/content/gdrive/MyDrive/Colab_UW_Madison/medical_image_uw_madison/project/Kaggle-UWMGIT/data/tract
you already have the data downloaded


# Return to "data_preprocess.py" ...

In [16]:
df_train = pd.read_csv("./train.csv")
df_train = df_train.sort_values(["id", "class"]).reset_index(drop = True)
df_train["patient"] = df_train.id.apply(lambda x: x.split("_")[0])
df_train["days"] = df_train.id.apply(lambda x: "_".join(x.split("_")[:2]))
num_slices = len(np.unique(df_train.id))
num_empty_slices = df_train.groupby("id").apply(lambda x: x.segmentation.isna().all()).sum()
num_patients = len(np.unique(df_train.patient))
num_days = len(np.unique(df_train.days))
print({
    "#slices:": num_slices,
    "#empty slices:": num_empty_slices,
    "#patients": num_patients,
    "#days": num_days
})

{'#slices:': 38496, '#empty slices:': 21906, '#patients': 85, '#days': 274}


In [17]:
all_image_files = sorted(glob.glob("./train/*/*/scans/*.png"), key = lambda x: x.split("/")[3] + "_" + x.split("/")[5])
size_x = [int(os.path.basename(_)[:-4].split("_")[-4]) for _ in all_image_files]
size_y = [int(os.path.basename(_)[:-4].split("_")[-3]) for _ in all_image_files]
spacing_x = [float(os.path.basename(_)[:-4].split("_")[-2]) for _ in all_image_files]
spacing_y = [float(os.path.basename(_)[:-4].split("_")[-1]) for _ in all_image_files]
df_train["image_files"] = np.repeat(all_image_files, 3)
df_train["spacing_x"] = np.repeat(spacing_x, 3)
df_train["spacing_y"] = np.repeat(spacing_y, 3)
df_train["size_x"] = np.repeat(size_x, 3)
df_train["size_y"] = np.repeat(size_y, 3)
df_train["slice"] = np.repeat([int(os.path.basename(_)[:-4].split("_")[-5]) for _ in all_image_files], 3)
df_train

ValueError: ignored

In [None]:
os.system("mkdir -p ./mmseg_train/images ./mmseg_train/labels")
for day, group in tqdm(df_train.groupby("days")):
    patient = group.patient.iloc[0]
    imgs = []
    msks = []
    for file_name in tqdm(group.image_files.unique(), leave = False):
        img = cv2.imread(file_name, cv2.IMREAD_ANYDEPTH)
        segms = group.loc[group.image_files == file_name]
        masks = {}
        for segm, label in zip(segms.segmentation, segms["class"]):
            if not pd.isna(segm):
                mask = rle_decode(segm, img.shape[:2])
                masks[label] = mask
            else:
                masks[label] = np.zeros(img.shape[:2], dtype = np.uint8)
        masks = np.stack([masks[k] for k in sorted(masks)], -1)
        imgs.append(img)
        msks.append(masks)

    imgs = np.stack(imgs, 0)
    msks = np.stack(msks, 0)
    for i in range(msks.shape[0]):
        img = imgs[[
            max(0, i - 2),
            i,
            min(imgs.shape[0] - 1, i + 2)
        ]].transpose(1, 2, 0)
        msk = msks[i]
        new_image_name = f"{day}_{i}.png"
        cv2.imwrite(f"./mmseg_train/images/{new_image_name}", img)
        cv2.imwrite(f"./mmseg_train/labels/{new_image_name}", msk)

  0%|          | 0/274 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/80 [00:00<?, ?it/s]

  0%|          | 0/80 [00:00<?, ?it/s]

  0%|          | 0/80 [00:00<?, ?it/s]

  0%|          | 0/80 [00:00<?, ?it/s]

  0%|          | 0/80 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/80 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/80 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:01<?, ?it/s]

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/IPython/core/interactiveshell.py", line 3553, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-131-6d46e15ea5ca>", line 15, in <cell line: 2>
    masks[label] = np.zeros(img.shape[:2], dtype = np.uint8)
AttributeError: 'NoneType' object has no attribute 'shape'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/IPython/core/interactiveshell.py", line 2099, in showtraceback
    stb = value._render_traceback_()
AttributeError: 'AttributeError' object has no attribute '_render_traceback_'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/IPython/core/ultratb.py", line 1101, in get_records
    return _fixed_getinnerframes(etb, number_of_lines_of_context, tb_offset)
  

In [63]:
os.system("mkdir -p ./mmseg_train/splits")
all_image_files = glob.glob("./mmseg_train/images/*")
patients = [os.path.basename(_).split("_")[0] for _ in all_image_files]

In [65]:
from sklearn.model_selection import GroupKFold
split = list(GroupKFold(5).split(patients, groups = patients))

In [66]:
for fold, (train_idx, valid_idx) in enumerate(split):
    with open(f"./mmseg_train/splits/fold_{fold}.txt", "w") as f:
        for idx in train_idx:
            f.write(os.path.basename(all_image_files[idx])[:-4] + "\n")
    with open(f"./mmseg_train/splits/holdout_{fold}.txt", "w") as f:
        for idx in valid_idx:
            f.write(os.path.basename(all_image_files[idx])[:-4] + "\n")

In [67]:
os.system("mkdir -p ./mmseg_train/splits_notail ./mmseg_train/splits_noanno ./mmseg_train/splits_case")
tails = set()
noannos = set()
faults = set()
for day, group in df_train.groupby("days"):
    end_slice = group.slice.iloc[np.where(~group.segmentation.isna())[0][-1]]
    tail_slice = np.arange(end_slice + 1, end_slice + 6)
    tail_group = group[group.slice.isin(tail_slice)].drop_duplicates(["days", "image_files"])
    noanno_group = group[group.segmentation.isna()].drop_duplicates(["days", "image_files"])
    tails.update([os.path.basename(row.image_files)[:-4].replace("slice", row.days) for i, row in tail_group.iterrows()])
    noannos.update([os.path.basename(row.image_files)[:-4].replace("slice", row.days) for i, row in noanno_group.iterrows()])
    if day in ['case7_day0', 'case81_day30']:
        faults.update([os.path.basename(row.image_files)[:-4].replace("slice", row.days) for i, row in group.iterrows()])


In [68]:
for f in range(5):
    split = pd.read_csv(f"./mmseg_train/splits/fold_{f}.txt", header = None)
    x = list(set(split.iloc[:,0].tolist()) - tails)
    pd.DataFrame(x).to_csv(f"./mmseg_train/splits_notail/fold_{f}.txt", index = False, header = False)
    x = list(set(split.iloc[:,0].tolist()) - noannos)
    pd.DataFrame(x).to_csv(f"./mmseg_train/splits_noanno/fold_{f}.txt", index = False, header = False)
    x = list(set(split.iloc[:,0].tolist()) - faults)
    pd.DataFrame(x).to_csv(f"./mmseg_train/splits_case/fold_{f}.txt", index = False, header = False)


    split = pd.read_csv(f"./mmseg_train/splits/holdout_{f}.txt", header = None)
    x = list(set(split.iloc[:,0].tolist()) - tails)
    pd.DataFrame(x).to_csv(f"./mmseg_train/splits_notail/holdout_{f}.txt", index = False, header = False)
    x = list(set(split.iloc[:,0].tolist()) - noannos)
    pd.DataFrame(x).to_csv(f"./mmseg_train/splits_noanno/holdout_{f}.txt", index = False, header = False)
    x = list(set(split.iloc[:,0].tolist()) - faults)
    pd.DataFrame(x).to_csv(f"./mmseg_train/splits_case/holdout_{f}.txt", index = False, header = False)

    for d in ["", "_notail", "_noanno", "_case"]:
        os.system(f"cat ./mmseg_train/splits{d}/holdout_{f}.txt > ./mmseg_train/splits{d}/fold_all.txt")
        os.system(f"cat ./mmseg_train/splits{d}/fold_{f}.txt >> ./mmseg_train/splits{d}/fold_all.txt")