In [54]:
import os
import sys
sys.path.append("/radraid2/dongwoolee/Colon")
import torch
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as T
from torchvision.transforms import ToTensor
from src.data.dataset import RealColonDataset
from src.utils.transformations import CustomTransform
from src.utils.helpers import set_random_seed
from sklearn.model_selection import train_test_split
import pandas as pd

# Train Val Test Split

## Train + Val: stratified split over all videos whose ID does not start with 004
## Test: all videos whose ID starts with 004

In [None]:
def get_stat(df: pd.DataFrame):
    """Print the video, frame and polyp-frame counts of a frame-level table.

    Args:
        df: Table with one row per frame; must contain the columns
            ``video_id`` and ``is_polyps_frame``.
    """
    # dropna=False keeps a missing video_id counted as its own distinct value.
    n_videos = df['video_id'].nunique(dropna=False)
    n_frames = df.shape[0]
    n_polyp_frames = df['is_polyps_frame'].sum()
    print(f"Videos: {n_videos}, Frames: {n_frames}, Polyps: {n_polyp_frames}")

# Load the frame-level annotation table (one row per frame).
full_csv = "/radraid2/dongwoolee/Colon/data/frames_polyps.csv"
full_df = pd.read_csv(full_csv)

# Videos whose ID starts with "004" form the held-out test set; every other
# video is kept for the train/validation split.
is_test_video = full_df['video_id'].str.startswith("004")
test_df = full_df[is_test_video]
nontest_df = full_df[~is_test_video]

# Persist both partitions so later steps can read them without re-splitting.
test_df.to_csv("/radraid2/dongwoolee/Colon/data/frames_test.csv", index=False)
nontest_df.to_csv("/radraid2/dongwoolee/Colon/data/frames_nontest.csv", index=False)

# Report sizes: full table, test partition, non-test partition.
get_stat(full_df)
get_stat(test_df)
get_stat(nontest_df)

Videos: 53, Frames: 2523650, Polyps: 313273
Videos: 15, Frames: 515413, Polyps: 78883
Videos: 38, Frames: 2008237, Polyps: 234390


In [99]:
# Per-video summary of the non-test videos: one row per video with its frame
# count and its number of polyp frames. A single groupby pass replaces a
# per-video boolean scan over the whole frame table. sort=False keeps videos
# in first-appearance order, so the seeded split below sees a stable row order.
df = (
    nontest_df
    .groupby('video_id', sort=False)
    .agg(
        total_frames=('is_polyps_frame', 'size'),
        polyp_frames=('is_polyps_frame', 'sum'),
    )
    .reset_index()
    .rename(columns={'video_id': 'patient_id'})
)

# Fraction of each video's frames that contain a polyp (raw ratio in [0, 1]).
df['polyp_ratio'] = df['polyp_frames'] / df['total_frames']

# Quantile-bin the polyp ratio (2 bins) and the video length (3 bins).
# The bin counts are tunable; they are kept small because a stratified split
# needs several videos in every combined bin.
df['ratio_bin'] = pd.qcut(df['polyp_ratio'], q=2, duplicates='drop')
df['frame_bin'] = pd.qcut(df['total_frames'], q=3, duplicates='drop')

# Combine the two bins into a single stratification label per video.
df['stratify_bin'] = df['ratio_bin'].astype(str) + '_' + df['frame_bin'].astype(str)

# Split the videos (not the frames) into train and validation sets, holding
# out 18% of the videos for validation and stratifying by the combined bin.
# The test set is not involved here; it was separated by video-ID prefix.
train, val = train_test_split(
    df,
    test_size=0.18,
    stratify=df['stratify_bin'],
    random_state=42
)

# Print summary statistics for each set
def print_stats(name, d):
    """Print aggregate frame statistics for one split of the per-video table.

    Args:
        name: Label printed in front of the statistics.
        d: Table with one row per video; must contain the columns
            ``total_frames`` and ``polyp_frames``.
    """
    total_frames = d['total_frames'].sum()
    total_polyp = d['polyp_frames'].sum()
    # A split without frames has no defined ratio. NaN is used instead of None
    # because None cannot be rendered with the ".4f" format spec (TypeError).
    ratio = total_polyp / total_frames if total_frames > 0 else float("nan")
    print(f"{name}: Patients={len(d)}, Total Frames={total_frames}, Total Polyp Frames={total_polyp}, Polyp Ratio={ratio:.4f}")

# Summarise both video-level splits.
print_stats("Train", train)
print_stats("Valid", val)

# Map the video-level split back onto the frame table: a frame belongs to a
# split when its video_id is among that split's patient_id values.
train_video_ids = train["patient_id"].values
val_video_ids = val["patient_id"].values
train_df = nontest_df.loc[nontest_df["video_id"].isin(train_video_ids)]
val_df = nontest_df.loc[nontest_df["video_id"].isin(val_video_ids)]

# Write the frame-level train and validation tables to disk.
train_df.to_csv("/radraid2/dongwoolee/Colon/data/frames_train.csv", index=False)
val_df.to_csv("/radraid2/dongwoolee/Colon/data/frames_val.csv", index=False)

Train: Patients=31, Total Frames=1683094, Total Polyp Frames=195705, Polyp Ratio=0.1163
Valid: Patients=7, Total Frames=325143, Total Polyp Frames=38685, Polyp Ratio=0.1190


## Additional Experiments (Not Used)

In [70]:

# This experiment compares the 004 videos against all other videos, so it needs
# a per-video summary that covers EVERY video. The `df` built for the train/val
# split only contains non-004 videos (the "Test" group would be empty), so the
# summary is rebuilt here from the full frame table instead of reusing `df`.
video_stats_df = (
    full_df
    .groupby("video_id", sort=False)
    .agg(
        total_frames=("is_polyps_frame", "size"),
        polyp_frames=("is_polyps_frame", "sum"),
    )
    .reset_index()
    .rename(columns={"video_id": "patient_id"})
)

# Group label = the part of patient_id before the first "-".
video_stats_df["group"] = video_stats_df["patient_id"].str.split("-").str[0]

# Split the videos: train+val from groups 001, 002, 003 and test from group 004.
# The test summary is named `test_videos_df` so that the frame-level `test_df`
# created when the CSV was first partitioned is not overwritten.
train_val_df = video_stats_df[video_stats_df["group"].isin(["001", "002", "003"])]
test_videos_df = video_stats_df[video_stats_df["group"] == "004"]

def print_stats(name, data):
    """Print aggregate frame statistics for one group of videos.

    Args:
        name: Label printed in front of the statistics.
        data: Table with one row per video; must contain the columns
            ``total_frames`` and ``polyp_frames``.
    """
    total_frames = data["total_frames"].sum()
    total_polyp_frames = data["polyp_frames"].sum()
    # Guard the empty case explicitly so the ratio is NaN without a division
    # warning or error.
    polyp_ratio = total_polyp_frames / total_frames if total_frames > 0 else float("nan")
    num_patients = data.shape[0]
    print(f"{name}: Patients={num_patients}, Total Frames={total_frames}, "
          f"Total Polyp Frames={total_polyp_frames}, Polyp Ratio={polyp_ratio:.4f}")

print("Statistics by Split:")
print_stats("Train+Val", train_val_df)
print_stats("Test", test_videos_df)


Statistics by Split:
Train+Val: Patients=38, Total Frames=2008237, Total Polyp Frames=234390, Polyp Ratio=0.1167
Test: Patients=15, Total Frames=515413, Total Polyp Frames=78883, Polyp Ratio=0.1530


In [2]:
# Seed once, up front, through the project's set_random_seed helper.
GLOBAL_SEED = 42
set_random_seed(GLOBAL_SEED)

# Both pipelines share the same CustomTransform settings and end with
# ToTensor(); they differ only in the `augment` flag (True for training,
# False for testing). The train pipeline is constructed first, then the test one.
train_transform, test_transform = (
    T.Compose([
        CustomTransform(pad_method="zeros", max_size=(1352,1080), target_size=(224,224), augment=use_augmentation),
        ToTensor()
    ])
    for use_augmentation in (True, False)
)