In [95]:
import os
import glob
import random

import cv2
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D
from tqdm import tqdm

import sys
sys.path.insert(0, "..")

from helper import get_mask, id2filename, create_metadata_table

In [96]:
# Location of the dataset, given relative to the notebook's working directory.
dataset_path = "../../../Dataset/uw-madison-gi-tract-image-segmentation/"
# Build the metadata DataFrame with the project helper imported from `helper`.
# NOTE(review): judging by the `id` column shown by df.head() further down,
# this looks like one row per image slice -- confirm against the helper.
df = create_metadata_table(dataset_path)

100%|██████████████████████████████████████████████████████████████████████████| 38496/38496 [00:16<00:00, 2277.58it/s]


In [97]:
# Distinct case_day identifiers found in the metadata table, in sorted order.
case_days = np.sort(df["case_day"].unique())
# Display how many distinct case_days there are.
len(case_days)

274

In [98]:
# For every case, show the array of distinct case_day values that belong to it.
df.groupby("case")["case_day"].unique()

case
case101    [case101_day20, case101_day22, case101_day26, ...
case102                                       [case102_day0]
case107         [case107_day0, case107_day19, case107_day21]
case108         [case108_day0, case108_day10, case108_day13]
case11             [case11_day0, case11_day12, case11_day13]
                                 ...                        
case89     [case89_day0, case89_day17, case89_day19, case...
case9                 [case9_day0, case9_day20, case9_day22]
case90             [case90_day0, case90_day22, case90_day29]
case91                                         [case91_day0]
case92                                         [case92_day0]
Name: case_day, Length: 85, dtype: object

In [99]:
# Group the case_day column by case, then count the distinct case_days in each group.
case_day_by_case = df.groupby("case")["case_day"]
nunique = case_day_by_case.nunique()

# Average number of case_days per case, followed by the total over all cases.
print(nunique.mean(), nunique.sum())

3.223529411764706 274


The test set has around 50 cases. With roughly 3 case_days per case (the mean computed above is about 3.2), let's assume the test set has around

50 * 3 = 150 case_days.

In [100]:
# Total number of case_days over all cases (sum of the per-case distinct counts).
nunique.sum()

274

In [101]:
# Assumed size of the test set, in case_days: about 50 cases x 3 case_days each.
assumed_test_case_days = 50 * 3
# Total case_days available locally, taken from the data instead of a
# hard-coded literal so the numbers below stay correct if the dataset changes.
total_case_days = nunique.sum()

# Share of all case_days (local + assumed test) that is available locally.
train_ratio = total_case_days / (total_case_days + assumed_test_case_days)

# Apply the same ratio to the local data to size a train/test split.
# Note: the counts printed below are numbers of case_days, not of cases.
print("Training ratio:", train_ratio, 
      "\nNumber of training cases:", round(train_ratio * total_case_days),
     "\nNumber of test cases:", round((1 - train_ratio) * total_case_days))

Training ratio: 0.6462264150943396 
Number of training cases: 177 
Number of test cases: 97


Let's divide the training data into different folds!

In [102]:
# Number of cases placed in each fold. Cases that do not fill a whole fold
# are merged into the last fold, so the last fold can be larger.
FOLD_SIZE = 14
# Fixed seed so the shuffle, and therefore fold membership, is reproducible
# across kernel restarts.
FOLD_SEED = 42

# One entry per case: the array of distinct scan_dir_path values of that case.
cases = df.groupby("case")["scan_dir_path"].unique()
cases = cases.sample(frac=1, random_state=FOLD_SEED).reset_index(drop=True) # Shuffle

# Splitting by case keeps all scan directories of a case in the same fold.
scan_dirs_per_case = list(cases)
num_full_folds = len(scan_dirs_per_case) // FOLD_SIZE
folds = [
    scan_dirs_per_case[k * FOLD_SIZE:(k + 1) * FOLD_SIZE]
    for k in range(num_full_folds)
]

# Remaining cases join the last fold; if there are too few cases to fill even
# one fold, they form the only fold instead of raising IndexError.
leftover_cases = scan_dirs_per_case[num_full_folds * FOLD_SIZE:]
if folds:
    folds[-1] += leftover_cases
elif leftover_cases:
    folds.append(leftover_cases)

In [103]:
# Per-fold summary: number of cases, number of scan directories, and number
# of directory entries found inside those scan directories.
print("Number of cases", "\tNumber of scans", "\tNumber of images")
for fold in folds:
    num_of_scans = 0
    num_of_images = 0
    for scan_dirs in fold:
        num_of_scans += len(scan_dirs)
        for scan_dir in scan_dirs:
            # Every entry listed in a scan directory is counted as one image.
            num_of_images += len(os.listdir(scan_dir))
    print(len(fold), "\t\t\t" + str(num_of_scans), "\t\t\t" + str(num_of_images))

Number of cases 	Number of scans 	Number of images
14 			43 			6064
14 			42 			6048
14 			52 			7424
14 			43 			5872
14 			48 			6912
15 			46 			6176


In [104]:
# Preview the first rows of the metadata table to inspect its columns.
df.head()

Unnamed: 0,id,large_bowel,small_bowel,stomach,sliceHeight,sliceWidth,pixelSpacingHeight,pixelSpacingWidth,num_slices,case,case_day,img_path,scan_dir_path,case_path
0,case101_day20_slice_0001,,,,266,266,1.5,1.5,144,case101,case101_day20,../../../Dataset/uw-madison-gi-tract-image-seg...,../../../Dataset/uw-madison-gi-tract-image-seg...,../../../Dataset/uw-madison-gi-tract-image-seg...
1,case101_day20_slice_0002,,,,266,266,1.5,1.5,144,case101,case101_day20,../../../Dataset/uw-madison-gi-tract-image-seg...,../../../Dataset/uw-madison-gi-tract-image-seg...,../../../Dataset/uw-madison-gi-tract-image-seg...
2,case101_day20_slice_0003,,,,266,266,1.5,1.5,144,case101,case101_day20,../../../Dataset/uw-madison-gi-tract-image-seg...,../../../Dataset/uw-madison-gi-tract-image-seg...,../../../Dataset/uw-madison-gi-tract-image-seg...
3,case101_day20_slice_0004,,,,266,266,1.5,1.5,144,case101,case101_day20,../../../Dataset/uw-madison-gi-tract-image-seg...,../../../Dataset/uw-madison-gi-tract-image-seg...,../../../Dataset/uw-madison-gi-tract-image-seg...
4,case101_day20_slice_0005,,,,266,266,1.5,1.5,144,case101,case101_day20,../../../Dataset/uw-madison-gi-tract-image-seg...,../../../Dataset/uw-madison-gi-tract-image-seg...,../../../Dataset/uw-madison-gi-tract-image-seg...


In [105]:
from pathlib import Path
from shutil import copytree

# Copy every case directory into a per-fold directory below `save_loc`.
# The source directories are copied, not moved, so the dataset stays intact.
save_loc = "../data/02_intermediate"
cases =  list()
tot = 0
for i, fold in enumerate(folds):
    dirname = f"fold_{i:02d}"
    save_path = os.path.join(save_loc, dirname)
    Path(save_path).mkdir(exist_ok=True, parents=True)

    # The case directory is taken to be two levels above each scan directory.
    # All scan directories of one case must agree on it; fail loudly instead
    # of silently picking an arbitrary candidate when they do not.
    fold_case_paths = []
    for scan_dirs in fold:
        candidate_paths = {
            os.sep.join(scan_dir.split(os.sep)[:-2]) for scan_dir in scan_dirs
        }
        assert len(candidate_paths) == 1, (
            f"scan directories resolve to several case directories: {sorted(candidate_paths)}"
        )
        fold_case_paths.append(candidate_paths.pop())

    # Sanity check: no case directory may appear twice, within or across folds.
    cases += fold_case_paths
    tot += len(fold_case_paths)
    assert len(set(cases)) == tot

    for fold_case_path in tqdm(fold_case_paths):
        # copytree raises FileExistsError when the destination already exists,
        # so the fold directories must be cleared before this cell is re-run.
        copytree(fold_case_path, os.path.join(save_path, os.path.basename(fold_case_path)))

100%|██████████████████████████████████████████████████████████████████████████████████| 14/14 [00:32<00:00,  2.33s/it]
100%|██████████████████████████████████████████████████████████████████████████████████| 14/14 [00:30<00:00,  2.17s/it]
100%|██████████████████████████████████████████████████████████████████████████████████| 14/14 [00:38<00:00,  2.74s/it]
100%|██████████████████████████████████████████████████████████████████████████████████| 14/14 [00:30<00:00,  2.18s/it]
100%|██████████████████████████████████████████████████████████████████████████████████| 14/14 [00:35<00:00,  2.55s/it]
100%|██████████████████████████████████████████████████████████████████████████████████| 15/15 [00:35<00:00,  2.38s/it]


NameError: name 'scan_dir_path' is not defined