# Exploring the Real-Colon Dataset
Use this notebook to load and display a few random samples from the real-colon dataset.

In [None]:
# Import built-in modules
%matplotlib inline
import os
import random

# Import third-party modules
import cv2
from matplotlib import pyplot as plt

# Import repo scripts
import export_coco_format

## Data format
The dataset contains the full recordings of 60 colonoscopies from 4 source datasets. The files are organized in the following structure:

- `video_info.csv`: a csv with the metadata for each patient
- `lesion_info.csv`: a csv with the metadata for each lesion
- `DDD-VVV_frames`: the folder with all the frames for video {VVV} from dataset {DDD} (e.g. `001-003_frames`)
- `DDD-VVV_annotations`: the folder with all the annotations for video {VVV} from dataset {DDD} (e.g. `001-003_annotations`)


Check how many video folders are available in the dataset folder (full dataset is composed of 60 videos)

In [None]:
# Root folder of the dataset, relative to the notebook's working directory
base_dataset_path = "./real_colon_dataset"

# Keep only the directory entries whose name ends with "frames", in sorted order
video_folders = sorted(
    entry for entry in os.listdir(base_dataset_path) if entry.endswith("frames")
)
print(f"Found {len(video_folders)} video folders")

Define a function to draw a green box around each lesion

In [None]:
def draw_boxes(filename, boxes):
    """Load an image and draw a green rectangle around each lesion box.

    Args:
        filename: Path of the image file, read with ``cv2.imread``.
        boxes: Iterable of dicts, each with a ``'box_ltrb'`` entry indexed as
            ``[xmin, ymin, xmax, ymax]`` (presumably integer pixel
            coordinates -- confirm against ``export_coco_format.parsevocfile``).

    Returns:
        The annotated image, converted from BGR to RGB channel order so it can
        be passed directly to ``plt.imshow``.

    Raises:
        OSError: If ``cv2.imread`` could not load ``filename``.
    """
    img = cv2.imread(filename)
    if img is None:
        # cv2.imread reports a missing/unreadable file by returning None instead
        # of raising; fail here with a clear message rather than deep in OpenCV.
        raise OSError(f"Could not read image file: {filename}")
    for box in boxes:
        xmin, ymin, xmax, ymax = box['box_ltrb'][:4]
        # `rec` is given as (x, y, width, height), hence the subtraction
        cv2.rectangle(img=img, rec=(xmin, ymin, xmax - xmin, ymax - ymin), color=(0, 255, 0), thickness=5)
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    return img

For each video group (1-4), randomly pick 1 video and select 2 images to be visualized. 
To give priority to frames with at least one box, we take 20 random frames and search them for images with boxes; if any are found, those are displayed. 

In [None]:
# Frames selected for display, as (frame_folder, parsed_annotation) tuples
to_show = []
pool_frames_size = 20          # number of random frames inspected per video
frames_per_video_to_show = 2   # number of frames displayed per video
videos_per_dataset = 15        # videos are numbered 001..015 within each dataset

# Datasets are numbered 1 to 4 (inclusive)
for dataset in range(1, 5):
    vv = random.randint(1, videos_per_dataset)
    c_video_id = f"{dataset:03d}-{vv:03d}"
    frame_folder = os.path.join(base_dataset_path, c_video_id + "_frames")
    annotation_folder = os.path.join(base_dataset_path, c_video_id + "_annotations")
    if not os.path.isdir(annotation_folder):
        # Only part of the dataset may be available locally
        print(f"Video {c_video_id} not found in {base_dataset_path}, skipping")
        continue
    c_ann = sorted(os.listdir(annotation_folder))
    print(f"Random selection: video {c_video_id} with {len(c_ann)} frames in folder")
    random.shuffle(c_ann)

    c_to_show = []         # frames of this video with at least one box
    c_without_boxes = []   # inspected frames of this video with no box (fallback)
    for y in c_ann[:pool_frames_size]:
        # Stop as soon as THIS video has enough frames with boxes
        if len(c_to_show) == frames_per_video_to_show:
            break
        c_ann_data = export_coco_format.parsevocfile(os.path.join(annotation_folder, y))
        if len(c_ann_data["boxes"]) > 0:
            c_to_show.append((frame_folder, c_ann_data))
        else:
            c_without_boxes.append((frame_folder, c_ann_data))

    # Not enough frames with boxes in the pool: pad with box-less frames that
    # were already parsed, so no frame is shown twice and nothing is re-read.
    n_missing = frames_per_video_to_show - len(c_to_show)
    if n_missing > 0:
        c_to_show += c_without_boxes[:n_missing]
    to_show += c_to_show

# Render every selected frame with its lesion boxes drawn on top
for shown_folder, shown_annotation in to_show:
    shown_path = os.path.join(shown_folder, shown_annotation["img_name"])
    shown_exists = os.path.exists(shown_path)
    print(f"Drawing boxes on image {shown_path}. Exist {shown_exists}")
    plt.imshow(draw_boxes(shown_path, shown_annotation["boxes"]))
    plt.show()