<a href="https://colab.research.google.com/github/cwinsor/medical_image_uw_madison/blob/main/colab_01_data_preprocess.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data Preprocessing
This notebook re-implements data_preprocess.py as a colab notebook

In [19]:
import sys
import os
from shutil import rmtree

from google.colab import drive

import numpy as np
import pandas as pd
import cv2
from PIL import Image
import glob
from tqdm.auto import tqdm
import albumentations as A

In [87]:
assert ('google.colab' in sys.modules), "ERROR - the script expects to be run in Colab"

In [88]:
# we persist code, dataset and runs on google drive...
work_folder = '/content/gdrive/MyDrive/Colab_UW_Madison'
os.chdir(work_folder)
!pwd

/content/gdrive/MyDrive/Colab_UW_Madison


In [94]:
git_url = 'https://github.com/cwinsor/medical_image_uw_madison.git'
git_folder = 'medical_image_uw_madison'

assert os.path.isdir(git_folder), ("ERROR - cannot find " + git_folder)
# warning - don't kill the persist folder... here's the command...  !git clone --quiet $git_url

In [89]:
!git fetch --dry-run

fatal: not a git repository (or any parent up to mount point /content)
Stopping at filesystem boundary (GIT_DISCOVERY_ACROSS_FILESYSTEM not set).


In [21]:
# the code


# populate the git and it's submodule

    !git clone --quiet $git_url
    os.chdir(git_folder)
    # !ls -la
    !git submodule update --init
    os.chdir('/content/')
    sys.path.append(git_folder)
else:
    print("host is traditional server")
    assert False, "not coded"

host is colab
update existing /content/medical_image_uw_madison
Submodule 'project/Kaggle-UWMGIT' (https://github.com/CarnoZhao/Kaggle-UWMGIT) registered for path 'project/Kaggle-UWMGIT'
Cloning into '/content/medical_image_uw_madison/project/Kaggle-UWMGIT'...
Submodule path 'project/Kaggle-UWMGIT': checked out '91bc93fec8d25bed200a7ae7963c9738ac798353'


In [39]:
os.chdir(git_folder + '//project//Kaggle-UWMGIT')
!pwd

/content/medical_image_uw_madison/project/Kaggle-UWMGIT


In [40]:
def rle_decode(mask_rle, shape):
    s = np.array(mask_rle.split(), dtype=int)
    starts, lengths = s[0::2] - 1, s[1::2]
    ends = starts + lengths
    h, w = shape
    img = np.zeros((h * w,), dtype = np.uint8)
    for lo, hi in zip(starts, ends):
        img[lo : hi] = 1
    return img.reshape(shape)

def rle_encode(img):
    pixels = img.flatten()
    pixels = np.concatenate([[0], pixels, [0]])
    runs = np.where(pixels[1:] != pixels[:-1])[0] + 1
    runs[1::2] -= runs[::2]
    return ' '.join(str(x) for x in runs)

In [41]:
os.chdir("./data/tract")
!pwd

/content/medical_image_uw_madison/project/Kaggle-UWMGIT/data/tract


# Side trip to download the data...

In [None]:
# download data
# see README.md in the Kaggle-UMWGIT
# and
# see https://www.kaggle.com/discussions/general/156610

In [None]:
# Go to your kaggle account, Scroll to API section and Click Expire API Token to remove previous tokens
# Click on Create New API Token - It will download kaggle.json file on your machine.

In [48]:
# Mount your Google Drive files Following code make mount your google drive
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [49]:
# Now upload the kaggle.json file
from google.colab import files
files.upload() #this will prompt you to upload the kaggle.json

Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"cwinsor","key":"65ca34951bc1467168bd4478bfcd4720"}'}

In [50]:
# make sure kaggle.json file is present
!ls -lha kaggle.json

-rw-r--r-- 1 root root 63 Jul 20 12:30 kaggle.json


In [51]:
# Install kaggle API client
!pip install -q kaggle

# kaggle API client expects the file to be in ~/.kaggle
# so move it there
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/

# we need to set permissions
!chmod 600 /root/.kaggle/kaggle.json

In [53]:
# check your directory before downloading the datasets
!pwd

# list all available datasets
# !kaggle datasets list

# download the required dataset from kaggle
!kaggle competitions download -c uw-madison-gi-tract-image-segmentation
# !kaggle datasets download -d varsharainer/dna-sequencing-classifier

/content/medical_image_uw_madison/project/Kaggle-UWMGIT/data/tract
Downloading uw-madison-gi-tract-image-segmentation.zip to /content/medical_image_uw_madison/project/Kaggle-UWMGIT/data/tract
 99% 2.27G/2.30G [00:35<00:00, 29.7MB/s]
100% 2.30G/2.30G [00:35<00:00, 68.8MB/s]


In [54]:
# If your file is a zip file you can unzip with the following code
!unzip uw-madison-gi-tract-image-segmentation.zip

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  inflating: train/case77/case77_day20/scans/slice_0121_266_266_1.50_1.50.png  
  inflating: train/case77/case77_day20/scans/slice_0122_266_266_1.50_1.50.png  
  inflating: train/case77/case77_day20/scans/slice_0123_266_266_1.50_1.50.png  
  inflating: train/case77/case77_day20/scans/slice_0124_266_266_1.50_1.50.png  
  inflating: train/case77/case77_day20/scans/slice_0125_266_266_1.50_1.50.png  
  inflating: train/case77/case77_day20/scans/slice_0126_266_266_1.50_1.50.png  
  inflating: train/case77/case77_day20/scans/slice_0127_266_266_1.50_1.50.png  
  inflating: train/case77/case77_day20/scans/slice_0128_266_266_1.50_1.50.png  
  inflating: train/case77/case77_day20/scans/slice_0129_266_266_1.50_1.50.png  
  inflating: train/case77/case77_day20/scans/slice_0130_266_266_1.50_1.50.png  
  inflating: train/case77/case77_day20/scans/slice_0131_266_266_1.50_1.50.png  
  inflating: train/case77/case77_day20/scans/slice_0132

# Return to "data_preprocess.py" ...

In [56]:
df_train = pd.read_csv("./train.csv")
df_train = df_train.sort_values(["id", "class"]).reset_index(drop = True)
df_train["patient"] = df_train.id.apply(lambda x: x.split("_")[0])
df_train["days"] = df_train.id.apply(lambda x: "_".join(x.split("_")[:2]))
num_slices = len(np.unique(df_train.id))
num_empty_slices = df_train.groupby("id").apply(lambda x: x.segmentation.isna().all()).sum()
num_patients = len(np.unique(df_train.patient))
num_days = len(np.unique(df_train.days))
print({
    "#slices:": num_slices,
    "#empty slices:": num_empty_slices,
    "#patients": num_patients,
    "#days": num_days
})

{'#slices:': 38496, '#empty slices:': 21906, '#patients': 85, '#days': 274}


In [59]:
all_image_files[0:3]

['./train/case101/case101_day20/scans/slice_0001_266_266_1.50_1.50.png',
 './train/case101/case101_day20/scans/slice_0002_266_266_1.50_1.50.png',
 './train/case101/case101_day20/scans/slice_0003_266_266_1.50_1.50.png']

In [60]:
spacing_x[0:3]

[1.5, 1.5, 1.5]

In [61]:
spacing_y[0:3]

[1.5, 1.5, 1.5]

In [57]:
all_image_files = sorted(glob.glob("./train/*/*/scans/*.png"), key = lambda x: x.split("/")[3] + "_" + x.split("/")[5])
size_x = [int(os.path.basename(_)[:-4].split("_")[-4]) for _ in all_image_files]
size_y = [int(os.path.basename(_)[:-4].split("_")[-3]) for _ in all_image_files]
spacing_x = [float(os.path.basename(_)[:-4].split("_")[-2]) for _ in all_image_files]
spacing_y = [float(os.path.basename(_)[:-4].split("_")[-1]) for _ in all_image_files]
df_train["image_files"] = np.repeat(all_image_files, 3)
df_train["spacing_x"] = np.repeat(spacing_x, 3)
df_train["spacing_y"] = np.repeat(spacing_y, 3)
df_train["size_x"] = np.repeat(size_x, 3)
df_train["size_y"] = np.repeat(size_y, 3)
df_train["slice"] = np.repeat([int(os.path.basename(_)[:-4].split("_")[-5]) for _ in all_image_files], 3)
df_train

Unnamed: 0,id,class,segmentation,patient,days,image_files,spacing_x,spacing_y,size_x,size_y,slice
0,case101_day20_slice_0001,large_bowel,,case101,case101_day20,./train/case101/case101_day20/scans/slice_0001...,1.5,1.5,266,266,1
1,case101_day20_slice_0001,small_bowel,,case101,case101_day20,./train/case101/case101_day20/scans/slice_0001...,1.5,1.5,266,266,1
2,case101_day20_slice_0001,stomach,,case101,case101_day20,./train/case101/case101_day20/scans/slice_0001...,1.5,1.5,266,266,1
3,case101_day20_slice_0002,large_bowel,,case101,case101_day20,./train/case101/case101_day20/scans/slice_0002...,1.5,1.5,266,266,2
4,case101_day20_slice_0002,small_bowel,,case101,case101_day20,./train/case101/case101_day20/scans/slice_0002...,1.5,1.5,266,266,2
...,...,...,...,...,...,...,...,...,...,...,...
115483,case9_day22_slice_0143,small_bowel,,case9,case9_day22,./train/case9/case9_day22/scans/slice_0143_360...,1.5,1.5,360,310,143
115484,case9_day22_slice_0143,stomach,,case9,case9_day22,./train/case9/case9_day22/scans/slice_0143_360...,1.5,1.5,360,310,143
115485,case9_day22_slice_0144,large_bowel,,case9,case9_day22,./train/case9/case9_day22/scans/slice_0144_360...,1.5,1.5,360,310,144
115486,case9_day22_slice_0144,small_bowel,,case9,case9_day22,./train/case9/case9_day22/scans/slice_0144_360...,1.5,1.5,360,310,144


In [62]:
os.system("mkdir -p ./mmseg_train/images ./mmseg_train/labels")
for day, group in tqdm(df_train.groupby("days")):
    patient = group.patient.iloc[0]
    imgs = []
    msks = []
    for file_name in tqdm(group.image_files.unique(), leave = False):
        img = cv2.imread(file_name, cv2.IMREAD_ANYDEPTH)
        segms = group.loc[group.image_files == file_name]
        masks = {}
        for segm, label in zip(segms.segmentation, segms["class"]):
            if not pd.isna(segm):
                mask = rle_decode(segm, img.shape[:2])
                masks[label] = mask
            else:
                masks[label] = np.zeros(img.shape[:2], dtype = np.uint8)
        masks = np.stack([masks[k] for k in sorted(masks)], -1)
        imgs.append(img)
        msks.append(masks)

    imgs = np.stack(imgs, 0)
    msks = np.stack(msks, 0)
    for i in range(msks.shape[0]):
        img = imgs[[
            max(0, i - 2),
            i,
            min(imgs.shape[0] - 1, i + 2)
        ]].transpose(1, 2, 0)
        msk = msks[i]
        new_image_name = f"{day}_{i}.png"
        cv2.imwrite(f"./mmseg_train/images/{new_image_name}", img)
        cv2.imwrite(f"./mmseg_train/labels/{new_image_name}", msk)

  0%|          | 0/274 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/80 [00:00<?, ?it/s]

  0%|          | 0/80 [00:00<?, ?it/s]

  0%|          | 0/80 [00:00<?, ?it/s]

  0%|          | 0/80 [00:00<?, ?it/s]

  0%|          | 0/80 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/80 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/80 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/80 [00:00<?, ?it/s]

  0%|          | 0/80 [00:00<?, ?it/s]

  0%|          | 0/80 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/80 [00:00<?, ?it/s]

  0%|          | 0/80 [00:00<?, ?it/s]

  0%|          | 0/80 [00:00<?, ?it/s]

  0%|          | 0/80 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/80 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

In [63]:
os.system("mkdir -p ./mmseg_train/splits")
all_image_files = glob.glob("./mmseg_train/images/*")
patients = [os.path.basename(_).split("_")[0] for _ in all_image_files]

In [65]:
from sklearn.model_selection import GroupKFold
split = list(GroupKFold(5).split(patients, groups = patients))

In [66]:
for fold, (train_idx, valid_idx) in enumerate(split):
    with open(f"./mmseg_train/splits/fold_{fold}.txt", "w") as f:
        for idx in train_idx:
            f.write(os.path.basename(all_image_files[idx])[:-4] + "\n")
    with open(f"./mmseg_train/splits/holdout_{fold}.txt", "w") as f:
        for idx in valid_idx:
            f.write(os.path.basename(all_image_files[idx])[:-4] + "\n")

In [67]:
os.system("mkdir -p ./mmseg_train/splits_notail ./mmseg_train/splits_noanno ./mmseg_train/splits_case")
tails = set()
noannos = set()
faults = set()
for day, group in df_train.groupby("days"):
    end_slice = group.slice.iloc[np.where(~group.segmentation.isna())[0][-1]]
    tail_slice = np.arange(end_slice + 1, end_slice + 6)
    tail_group = group[group.slice.isin(tail_slice)].drop_duplicates(["days", "image_files"])
    noanno_group = group[group.segmentation.isna()].drop_duplicates(["days", "image_files"])
    tails.update([os.path.basename(row.image_files)[:-4].replace("slice", row.days) for i, row in tail_group.iterrows()])
    noannos.update([os.path.basename(row.image_files)[:-4].replace("slice", row.days) for i, row in noanno_group.iterrows()])
    if day in ['case7_day0', 'case81_day30']:
        faults.update([os.path.basename(row.image_files)[:-4].replace("slice", row.days) for i, row in group.iterrows()])


In [68]:
for f in range(5):
    split = pd.read_csv(f"./mmseg_train/splits/fold_{f}.txt", header = None)
    x = list(set(split.iloc[:,0].tolist()) - tails)
    pd.DataFrame(x).to_csv(f"./mmseg_train/splits_notail/fold_{f}.txt", index = False, header = False)
    x = list(set(split.iloc[:,0].tolist()) - noannos)
    pd.DataFrame(x).to_csv(f"./mmseg_train/splits_noanno/fold_{f}.txt", index = False, header = False)
    x = list(set(split.iloc[:,0].tolist()) - faults)
    pd.DataFrame(x).to_csv(f"./mmseg_train/splits_case/fold_{f}.txt", index = False, header = False)


    split = pd.read_csv(f"./mmseg_train/splits/holdout_{f}.txt", header = None)
    x = list(set(split.iloc[:,0].tolist()) - tails)
    pd.DataFrame(x).to_csv(f"./mmseg_train/splits_notail/holdout_{f}.txt", index = False, header = False)
    x = list(set(split.iloc[:,0].tolist()) - noannos)
    pd.DataFrame(x).to_csv(f"./mmseg_train/splits_noanno/holdout_{f}.txt", index = False, header = False)
    x = list(set(split.iloc[:,0].tolist()) - faults)
    pd.DataFrame(x).to_csv(f"./mmseg_train/splits_case/holdout_{f}.txt", index = False, header = False)

    for d in ["", "_notail", "_noanno", "_case"]:
        os.system(f"cat ./mmseg_train/splits{d}/holdout_{f}.txt > ./mmseg_train/splits{d}/fold_all.txt")
        os.system(f"cat ./mmseg_train/splits{d}/fold_{f}.txt >> ./mmseg_train/splits{d}/fold_all.txt")