In [1]:
import glob
import os
import random
import sys

import cv2
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from matplotlib.lines import Line2D
from tqdm import tqdm

# Put the parent directory on the import path so `helper` can be imported.
sys.path.insert(0, "..")

from helper import get_mask, id2filename, create_metadata_table, get_pivot_table

In [3]:
# Root directory of the dataset, relative to the notebook's working directory.
dataset_path = "../../../Dataset/uw-madison-gi-tract-image-segmentation/"
# Build the metadata table (one row per slice id) with the project helper.
df = create_metadata_table(dataset_path)

100%|██████████████████████████████████████████████████████████████████████████| 38496/38496 [00:08<00:00, 4442.98it/s]


In [10]:
# Sorted array of every distinct case_day identifier; the cell displays how many there are.
case_days = np.sort(df["case_day"].unique())
len(case_days)

274

In [12]:
# For every case, list its distinct case_day identifiers.
df.groupby("case")["case_day"].unique()

case
case101    [case101_day20, case101_day22, case101_day26, ...
case102                                       [case102_day0]
case107         [case107_day0, case107_day19, case107_day21]
case108         [case108_day0, case108_day10, case108_day13]
case11             [case11_day0, case11_day12, case11_day13]
                                 ...                        
case89     [case89_day0, case89_day17, case89_day19, case...
case9                 [case9_day0, case9_day20, case9_day22]
case90             [case90_day0, case90_day22, case90_day29]
case91                                         [case91_day0]
case92                                         [case92_day0]
Name: case_day, Length: 85, dtype: object

In [17]:
# Number of distinct case_days recorded for each case.
nunique = df.groupby("case")["case_day"].nunique()
# Average case_days per case, and the overall case_day count.
mean_days_per_case, total_days = nunique.mean(), nunique.sum()
print(mean_days_per_case, total_days)

3.223529411764706 274


The test set has around 50 cases. With roughly 3 case_days per case (the mean computed above is about 3.2), let's assume the test set has around

50 * 3 = 150 case_days.

In [37]:
# Assumed size of the hidden test set, in case_days (about 50 cases * 3 case_days each).
ASSUMED_TEST_CASE_DAYS = 150

# Total number of case_days available locally; derived from the data instead of
# being hard-coded so the numbers below stay correct if the dataset changes.
total_case_days = nunique.sum()

# Share of (local + assumed hidden) case_days that is available locally.
train_ratio = total_case_days / (total_case_days + ASSUMED_TEST_CASE_DAYS)

# NOTE: the labels say "cases", but the counts are in case_days.
print("Training ratio:", train_ratio,
      "\nNumber of training cases:", round(train_ratio * total_case_days),
      "\nNumber of test cases:", round((1 - train_ratio) * total_case_days))

Training ratio: 0.6462264150943396 
Number of training cases: 177 
Number of test cases: 97


Let's divide the training data into different folds!

In [117]:
# Number of cases per fold; the last fold additionally absorbs any remainder.
FOLD_SIZE = 12
# Fixed seed so the shuffle, and therefore the fold assignment, is reproducible.
FOLD_SEED = 42

# One entry per case, holding the array of that case's case_days. Splitting at
# case level keeps all case_days of a case inside the same fold.
cases = df.groupby("case")["case_day"].unique()
cases = cases.sample(frac=1, random_state=FOLD_SEED).reset_index(drop=True)  # Shuffle

shuffled_cases = list(cases)
num_full_folds = len(shuffled_cases) // FOLD_SIZE

# Consecutive chunks of FOLD_SIZE cases each.
folds = [
    shuffled_cases[start:start + FOLD_SIZE]
    for start in range(0, num_full_folds * FOLD_SIZE, FOLD_SIZE)
]

# Cases that do not fill a whole fold are appended to the last fold. With fewer
# than FOLD_SIZE cases in total they form a single fold instead of raising.
leftover_cases = shuffled_cases[num_full_folds * FOLD_SIZE:]
if folds:
    folds[-1] += leftover_cases
elif leftover_cases:
    folds.append(leftover_cases)

In [118]:
# Per-fold summary: number of cases and total number of scans (case_days).
print("Number of cases", "\tNumber of scans")
for fold in folds:
    # Each element of a fold is the collection of case_days for one case.
    num_of_scans = sum(map(len, fold))
    print(len(fold), "\t\t\t" + str(num_of_scans))

Number of cases 	Number of scans
12 			37
12 			36
12 			42
12 			39
12 			39
12 			41
13 			40


14.166666666666666

In [119]:
# Display the metadata table built earlier.
df

Unnamed: 0,id,large_bowel,small_bowel,stomach,sliceHeight,sliceWidth,pixelSpacingHeight,pixelSpacingWidth,num_slices,case,case_day
0,case101_day20_slice_0001,,,,266,266,1.5,1.5,144,case101,case101_day20
1,case101_day20_slice_0002,,,,266,266,1.5,1.5,144,case101,case101_day20
2,case101_day20_slice_0003,,,,266,266,1.5,1.5,144,case101,case101_day20
3,case101_day20_slice_0004,,,,266,266,1.5,1.5,144,case101,case101_day20
4,case101_day20_slice_0005,,,,266,266,1.5,1.5,144,case101,case101_day20
...,...,...,...,...,...,...,...,...,...,...,...
38491,case9_day22_slice_0140,,,,360,310,1.5,1.5,144,case9,case9_day22
38492,case9_day22_slice_0141,,,,360,310,1.5,1.5,144,case9,case9_day22
38493,case9_day22_slice_0142,,,,360,310,1.5,1.5,144,case9,case9_day22
38494,case9_day22_slice_0143,,,,360,310,1.5,1.5,144,case9,case9_day22


In [171]:
# Annotation table loaded from train.csv via the project helper.
dataframe_path = os.path.join(dataset_path, "train.csv")
pivot_df = get_pivot_table(dataframe_path)

# Every scan image: <dataset>/train/<case>/<case_day>/scans/<slice file>.png
img_paths = glob.glob(os.path.join(dataset_path, "train/*/*/scans/*.png"))


def _scan_metadata(img_path, slice_counts):
    """Return ``[id, height, width, spacing_h, spacing_w, num_slices]`` for one image.

    The path is taken apart from its end with ``os.path``, so the result does not
    depend on how many leading components ``dataset_path`` has or on which
    separator the platform uses.

    ``slice_counts`` caches the number of directory entries per scan directory so
    each directory is listed only once rather than once per image.
    """
    scan_dir, file_name = os.path.split(img_path)
    case_day = os.path.basename(os.path.dirname(scan_dir))

    if scan_dir not in slice_counts:
        slice_counts[scan_dir] = len(os.listdir(scan_dir))

    # File stem layout: slice_<number>_<height>_<width>_<spacingH>_<spacingW>
    name_parts = os.path.splitext(file_name)[0].split("_")
    slice_id = "_".join([case_day] + name_parts[:2])
    return [slice_id] + name_parts[2:] + [slice_counts[scan_dir]]


slice_counts = {}
scan_records = [_scan_metadata(img_path, slice_counts) for img_path in tqdm(img_paths)]
df_more_data = pd.DataFrame(
    scan_records,
    columns=["id", "sliceHeight", "sliceWidth", "pixelSpacingHeight", "pixelSpacingWidth", "num_slices"],
)

# Attach the image metadata to the annotations and convert the values parsed
# from file names (strings) to numeric dtypes.
big_df = pivot_df.merge(df_more_data, on="id").astype({
    "sliceHeight": int,
    "sliceWidth": int,
    "num_slices": int,
    "pixelSpacingHeight": float,
    "pixelSpacingWidth": float,
})

# ids look like <case>_<day>_slice_<number>; derive the grouping keys from them.
id_parts = big_df["id"].str.split("_")
big_df["case"] = id_parts.str[0]
big_df["case_day"] = id_parts.str[:2].str.join("_")

100%|██████████████████████████████████████████████████████████████████████████| 38496/38496 [00:08<00:00, 4488.96it/s]


In [172]:
def _path_record(img_path):
    """Return ``[id, img_path, scan_dir_path, case_path]`` for one scan image.

    Expects the layout ``<case_path>/<case_day>/scans/<slice>_<number>_....png``;
    the id is ``<case_day>_<slice>_<number>``.
    """
    scan_dir_path, file_name = os.path.split(img_path)
    case_day_path = os.path.dirname(scan_dir_path)
    slice_id = "_".join([os.path.basename(case_day_path)] + file_name.split("_")[:2])
    return [slice_id, img_path, scan_dir_path, os.path.dirname(case_day_path)]


path_columns = ["img_path", "scan_dir_path", "case_path"]
df_paths = pd.DataFrame(
    [_path_record(img_path) for img_path in img_paths],
    columns=["id"] + path_columns,
)

# Drop path columns left over from a previous run of this cell, so re-running it
# does not produce duplicated `_x` / `_y` columns.
big_df = big_df.drop(columns=path_columns, errors="ignore").merge(df_paths, on="id")

In [173]:
# Display the merged table, now including the path columns.
big_df

Unnamed: 0,id,large_bowel,small_bowel,stomach,sliceHeight,sliceWidth,pixelSpacingHeight,pixelSpacingWidth,num_slices,case,case_day,img_path,scan_dir_path,case_path
0,case101_day20_slice_0001,,,,266,266,1.5,1.5,144,case101,case101_day20,../../../Dataset/uw-madison-gi-tract-image-seg...,../../../Dataset/uw-madison-gi-tract-image-seg...,../../../Dataset/uw-madison-gi-tract-image-seg...
1,case101_day20_slice_0002,,,,266,266,1.5,1.5,144,case101,case101_day20,../../../Dataset/uw-madison-gi-tract-image-seg...,../../../Dataset/uw-madison-gi-tract-image-seg...,../../../Dataset/uw-madison-gi-tract-image-seg...
2,case101_day20_slice_0003,,,,266,266,1.5,1.5,144,case101,case101_day20,../../../Dataset/uw-madison-gi-tract-image-seg...,../../../Dataset/uw-madison-gi-tract-image-seg...,../../../Dataset/uw-madison-gi-tract-image-seg...
3,case101_day20_slice_0004,,,,266,266,1.5,1.5,144,case101,case101_day20,../../../Dataset/uw-madison-gi-tract-image-seg...,../../../Dataset/uw-madison-gi-tract-image-seg...,../../../Dataset/uw-madison-gi-tract-image-seg...
4,case101_day20_slice_0005,,,,266,266,1.5,1.5,144,case101,case101_day20,../../../Dataset/uw-madison-gi-tract-image-seg...,../../../Dataset/uw-madison-gi-tract-image-seg...,../../../Dataset/uw-madison-gi-tract-image-seg...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38491,case9_day22_slice_0140,,,,360,310,1.5,1.5,144,case9,case9_day22,../../../Dataset/uw-madison-gi-tract-image-seg...,../../../Dataset/uw-madison-gi-tract-image-seg...,../../../Dataset/uw-madison-gi-tract-image-seg...
38492,case9_day22_slice_0141,,,,360,310,1.5,1.5,144,case9,case9_day22,../../../Dataset/uw-madison-gi-tract-image-seg...,../../../Dataset/uw-madison-gi-tract-image-seg...,../../../Dataset/uw-madison-gi-tract-image-seg...
38493,case9_day22_slice_0142,,,,360,310,1.5,1.5,144,case9,case9_day22,../../../Dataset/uw-madison-gi-tract-image-seg...,../../../Dataset/uw-madison-gi-tract-image-seg...,../../../Dataset/uw-madison-gi-tract-image-seg...
38494,case9_day22_slice_0143,,,,360,310,1.5,1.5,144,case9,case9_day22,../../../Dataset/uw-madison-gi-tract-image-seg...,../../../Dataset/uw-madison-gi-tract-image-seg...,../../../Dataset/uw-madison-gi-tract-image-seg...
